Example #1
def _setup_datasets(url, top_n=-1, local_cache_path=".data"):
    FILE_NAME = "cnndm.tar.gz"
    maybe_download(url, FILE_NAME, local_cache_path)
    dataset_tar = os.path.join(local_cache_path, FILE_NAME)
    extracted_files = extract_archive(dataset_tar)
    # Locate the train/test source files and the tagged target files in the archive.
    for fname in extracted_files:
        if fname.endswith("train.txt.src"):
            train_source_file = fname
        if fname.endswith("train.txt.tgt.tagged"):
            train_target_file = fname
        if fname.endswith("test.txt.src"):
            test_source_file = fname
        if fname.endswith("test.txt.tgt.tagged"):
            test_target_file = fname

    return (
        SummarizationDataset(
            train_source_file,
            train_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
        SummarizationDataset(
            test_source_file,
            test_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
    )
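A minimal usage sketch for this helper. The archive URL below is an assumption (a commonly used mirror of the CNN/DM tarball), and `maybe_download`, `extract_archive`, `_clean`, and the other helpers referenced above are assumed to be importable from the surrounding module.

# Hypothetical invocation of _setup_datasets; the URL is an assumption.
import nltk

nltk.download("punkt")  # sent_tokenize/word_tokenize need the punkt models (newer NLTK may also need "punkt_tab")

CNNDM_URL = "https://s3.amazonaws.com/opennmt-models/Summary/cnndm.tar.gz"  # assumed archive location
train_ds, test_ds = _setup_datasets(CNNDM_URL, top_n=1000, local_cache_path=".data")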
Example #2
def test_dataset_for_bertsumabs(tmp_module):
    source = source_data()
    target = target_data()
    source_file = os.path.join(tmp_module, "source.txt")
    target_file = os.path.join(tmp_module, "target.txt")
    # Write each example as a single space-joined line.
    with open(source_file, "w") as f:
        for i in source:
            f.write(" ".join(i))
            f.write("\n")
    with open(target_file, "w") as f:
        for i in target:
            f.write(" ".join(i))
            f.write("\n")
    train_dataset = SummarizationDataset(
        source_file=source_file,
        target_file=target_file,
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
    )
    test_dataset = SummarizationDataset(
        source_file=source_file,
        target_file=target_file,
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
    )
    processor = BertSumAbsProcessor(cache_dir=tmp_module)
    batch = processor.collate(train_dataset, 512, "cuda:0")
    assert len(batch.src) == 3
    return train_dataset, test_dataset
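The test above relies on `source_data()`/`target_data()` helpers and a `tmp_module` fixture from the surrounding test module. A minimal sketch of such a fixture, assuming pytest, might look like this (the fixture name and scope are taken from the test signature; the implementation is an assumption).

# Hypothetical pytest fixture providing a temporary working directory for the test above.
import pytest


@pytest.fixture(scope="module")
def tmp_module(tmp_path_factory):
    # tmp_path_factory is pytest's built-in factory for module/session-scoped temp dirs.
    return str(tmp_path_factory.mktemp("summarization_test"))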
Example #3
def data(tmp_module):
    source = source_data()
    target = target_data()
    train_dataset = SummarizationDataset(
        None,
        source=[source],
        target=[target],
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
        word_tokenize=nltk.word_tokenize,
    )
    test_dataset = SummarizationDataset(
        None,
        source=[source],
        source_preprocessing=[tokenize.sent_tokenize],
        word_tokenize=nltk.word_tokenize,
    )

    processor = ExtSumProcessor(
        model_name=MODEL_NAME,
        cache_dir=tmp_module,
        max_nsents=200,
        max_src_ntokens=2000,
        min_nsents=0,
        min_src_ntokens=1,
    )
    ext_sum_train = processor.preprocess(train_dataset, oracle_mode="greedy")
    ext_sum_test = processor.preprocess(test_dataset, oracle_mode="greedy")
    return processor, ext_sum_train, ext_sum_test
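A hedged sketch of how this helper might be exercised. The directory path stands in for the `tmp_module` fixture, and the assumption that the preprocessed output is iterable and yields dict-like examples is illustrative, not taken from the library.

# Hypothetical usage; "/tmp/ext_sum_example" is an assumed writable directory.
processor, ext_sum_train, ext_sum_test = data("/tmp/ext_sum_example")
first_example = next(iter(ext_sum_train))  # iterability of preprocess() output is assumed
print(type(first_example))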
Example #4
def data_to_file(tmp_module):
    source = source_data()
    target = target_data()
    source_file = os.path.join(tmp_module, "source.txt")
    target_file = os.path.join(tmp_module, "target.txt")
    with open(source_file, "w") as f:
        f.write(source)
    with open(target_file, "w") as f:
        f.write(target)
    train_dataset = SummarizationDataset(
        source_file,
        target_file,
        [tokenize.sent_tokenize],
        [tokenize.sent_tokenize],
        nltk.word_tokenize,
    )
    test_dataset = SummarizationDataset(
        source_file,
        target_file,
        [tokenize.sent_tokenize],
        [tokenize.sent_tokenize],
        nltk.word_tokenize,
    )

    processor = ExtSumProcessor(
        model_name=MODEL_NAME,
        cache_dir=tmp_module,
        max_nsents=200,
        max_src_ntokens=2000,
        min_nsents=0,
        min_src_ntokens=1,
    )
    ext_sum_train = processor.preprocess(train_dataset,
                                         train_dataset.get_target(),
                                         oracle_mode="greedy")
    ext_sum_test = processor.preprocess(test_dataset,
                                        test_dataset.get_target(),
                                        oracle_mode="greedy")

    save_path = os.path.join(tmp_module, "processed")
    train_files = ExtSumProcessedData.save_data(ext_sum_train,
                                                is_test=False,
                                                save_path=save_path,
                                                chunk_size=2000)
    test_files = ExtSumProcessedData.save_data(ext_sum_test,
                                               is_test=True,
                                               save_path=save_path,
                                               chunk_size=2000)
    print(train_files)
    print(test_files)
    assert os.path.exists(train_files[0])
    assert os.path.exists(test_files[0])
    return save_path
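A hedged follow-up showing how the chunk files written by `ExtSumProcessedData.save_data` might be inspected on disk; only standard-library calls are used, and the directory path is illustrative.

import os

# Hypothetical invocation; the argument is an assumed existing, writable directory.
save_path = data_to_file("/tmp/ext_sum_files")
print(sorted(os.listdir(save_path)))  # chunked train/test files produced by save_data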
Example #5
def CNNDMSummarizationDatasetOrg(local_path=".",
                                 top_n=-1,
                                 return_iterable=False,
                                 return_dev_data=False):
    """
    Downloads a version of the CNN/DailyMail dataset with minimal processing
    from https://github.com/microsoft/unilm/tree/master/unilm-v1
    This version of the CNN/DM dataset was originally downloaded from
    https://github.com/harvardnlp/sent-summary
    and preprocessed following https://github.com/abisee/cnn-dailymail.

    Args:
        local_path (str): Path to store the downloaded data. If the data file
            doesn't exist in this path, it's downloaded and unzipped.
        top_n (int): Number of lines to read. Defaults to -1 and the entire dataset
            is read.
        return_iterable (bool): If False, returns SummarizationDataset.
            If True, returns IterableSummarizationDataset. Defaults to False.
        return_dev_data (bool): if False, returns train and test data splits.
            If True, returns train, test, and dev data splits. Defaults to False.

    Returns:
        tuple: tuple containing train, test (, and dev) datasets.
    """

    # Download and unzip the data
    FILE_ID = "1jiDbDbAsqy_5BM79SmX6aSu5DQVCAZq1"
    FILE_NAME = "cnndm_data.zip"

    output_dir = os.path.join(local_path, "cnndm_data")
    os.makedirs(output_dir, exist_ok=True)

    # This folder contains a version of the dataset with minimal processing
    org_data_dir = os.path.join(output_dir, "org_data")

    expected_data_files = set([
        "train.src",
        "org_data",
        "dev.src",
        "test.tgt",
        "train.tgt",
        "dev.tgt",
        "test.src",
    ])
    expected_org_data_files = set([
        "training.summary",
        "test.article",
        "dev.article",
        "training.article",
        "dev.summary",
        "test.summary",
    ])

    maybe_download_googledrive(google_file_id=FILE_ID,
                               file_name=FILE_NAME,
                               work_directory=local_path)

    if (set(os.listdir(output_dir)) != expected_data_files
            or set(os.listdir(org_data_dir)) != expected_org_data_files):
        extract_zip(
            file_path=os.path.join(local_path, FILE_NAME),
            dest_path=output_dir,
        )

    train_source_file = os.path.join(org_data_dir, "training.article")
    train_target_file = os.path.join(org_data_dir, "training.summary")
    test_source_file = os.path.join(org_data_dir, "test.article")
    test_target_file = os.path.join(org_data_dir, "test.summary")
    dev_source_file = os.path.join(org_data_dir, "dev.article")
    dev_target_file = os.path.join(org_data_dir, "dev.summary")

    source_preprocessing = [detokenize]
    target_preprocessing = [detokenize]

    if return_iterable:
        train_dataset = IterableSummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

        test_dataset = IterableSummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
    else:
        train_dataset = SummarizationDataset(
            source_file=train_source_file,
            target_file=train_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

        test_dataset = SummarizationDataset(
            source_file=test_source_file,
            target_file=test_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )

    if return_dev_data:
        if return_iterable:
            dev_dataset = IterableSummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )
        else:
            dev_dataset = SummarizationDataset(
                source_file=dev_source_file,
                target_file=dev_target_file,
                source_preprocessing=source_preprocessing,
                target_preprocessing=target_preprocessing,
                top_n=top_n,
            )

        return train_dataset, test_dataset, dev_dataset
    else:
        return train_dataset, test_dataset
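A minimal sketch of calling this loader, reading only the first 100 examples and requesting the iterable variant plus the dev split. The assumption that iterating an `IterableSummarizationDataset` yields one detokenized source document at a time is illustrative.

# Hypothetical call; downloads and unzips cnndm_data.zip into ./cnndm_org on first use.
train_ds, test_ds, dev_ds = CNNDMSummarizationDatasetOrg(
    local_path="./cnndm_org",
    top_n=100,
    return_iterable=True,
    return_dev_data=True,
)
print(next(iter(train_ds)))  # first source article (exact item shape is an assumption)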
Example #6
def SwissSummarizationDataset(top_n=-1, validation=False):
    """Load the CNN/Daily Mail dataset preprocessed by harvardnlp group."""

    URLS = [
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_test.csv",
    ]
    LOCAL_CACHE_PATH = '.data'

    FILE_NAME = "data_train.csv"
    maybe_download(URLS[0], FILE_NAME, LOCAL_CACHE_PATH)
    dataset_path = os.path.join(LOCAL_CACHE_PATH, FILE_NAME)

    train = pandas.read_csv(dataset_path).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [item[0] for item in train]
    summary = [item[1] for item in train]
    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123)
    if validation:
        train_source, validation_source, train_summary, validation_summary = train_test_split(
            train_source,
            train_summary,
            train_size=0.9,
            test_size=0.1,
            random_state=123)
        return (
            SummarizationDataset(
                source_file=None,
                source=train_source,
                target=train_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=validation_source,
                target=validation_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=test_source,
                target=test_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
        )
    else:
        return (
            SummarizationDataset(
                source_file=None,
                source=train_source,
                target=train_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
            SummarizationDataset(
                source_file=None,
                source=test_source,
                target=test_summary,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    tokenize.sent_tokenize,
                ],
                top_n=top_n,
            ),
        )
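A minimal usage sketch, assuming the returned datasets support `len()`.

# Hypothetical call; data_train.csv is downloaded into .data on first use.
train_ds, validation_ds, test_ds = SwissSummarizationDataset(top_n=1000, validation=True)
print(len(train_ds), len(validation_ds), len(test_ds))  # len() support is assumed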
Example #7
def test_S2SAbsSumProcessor(s2s_test_data, tmp):
    expected_output_length = 4
    # prepare files for testing
    train_source_file = os.path.join(tmp, "train.src")
    train_target_file = os.path.join(tmp, "train.tgt")

    test_source_file = os.path.join(tmp, "test.src")

    train_json_file = os.path.join(tmp, "train.json")
    test_json_file = os.path.join(tmp, "test.json")

    with open(train_source_file,
              "w") as src_file, open(train_target_file, "w") as tgt_file:
        for item in s2s_test_data["train_ds"]:
            src_file.write(item["src"] + "\n")
            tgt_file.write(item["tgt"] + "\n")

    with open(test_source_file, "w") as src_file:
        for item in s2s_test_data["test_ds"]:
            src_file.write(item["src"] + "\n")

    train_iterable_sum_ds = IterableSummarizationDataset(
        source_file=train_source_file, target_file=train_target_file)
    test_iterable_sum_ds = IterableSummarizationDataset(
        source_file=test_source_file)

    train_sum_ds = SummarizationDataset(source_file=train_source_file,
                                        target_file=train_target_file)
    test_sum_ds = SummarizationDataset(source_file=test_source_file)

    train_sum_ds.save_to_jsonl(train_json_file)
    test_sum_ds.save_to_jsonl(test_json_file)

    processor = S2SAbsSumProcessor(cache_dir=tmp)

    train_json_output = processor.s2s_dataset_from_json_or_file(
        input_data=s2s_test_data["train_ds"], train_mode=True)
    test_json_output = processor.s2s_dataset_from_json_or_file(
        input_data=s2s_test_data["test_ds"], train_mode=False)

    assert len(train_json_output) == expected_output_length
    assert len(test_json_output) == expected_output_length

    train_file_output = processor.s2s_dataset_from_json_or_file(
        input_data=train_json_file, train_mode=True)
    test_file_output = processor.s2s_dataset_from_json_or_file(
        input_data=test_json_file, train_mode=False)

    assert len(train_file_output) == expected_output_length
    assert len(test_file_output) == expected_output_length

    train_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(
        sum_ds=train_iterable_sum_ds, train_mode=True)
    test_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(
        sum_ds=test_iterable_sum_ds, train_mode=False)

    assert len(train_iterable_sum_ds_output) == expected_output_length
    assert len(test_iterable_sum_ds_output) == expected_output_length

    train_sum_ds_output = processor.s2s_dataset_from_sum_ds(
        sum_ds=train_sum_ds, train_mode=True)
    test_sum_ds_output = processor.s2s_dataset_from_sum_ds(sum_ds=test_sum_ds,
                                                           train_mode=False)

    assert len(train_sum_ds_output) == expected_output_length
    assert len(test_sum_ds_output) == expected_output_length
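The test above expects an `s2s_test_data` fixture whose `train_ds`/`test_ds` entries are dicts with `src` (and, for training, `tgt`) strings, plus a `tmp` temp-directory fixture. A minimal sketch follows; the sentences are purely illustrative and sized so the `== 4` length assertions hold, not taken from the project's real fixture.

# Hypothetical pytest fixture matching the shape the test consumes.
import pytest


@pytest.fixture()
def s2s_test_data():
    # Four tiny source/target pairs, matching expected_output_length == 4 above.
    train = [
        {"src": "the quick brown fox jumps over the lazy dog .", "tgt": "a fox jumps ."},
        {"src": "rain is expected across the region on friday .", "tgt": "rain expected friday ."},
        {"src": "the committee approved the new budget proposal .", "tgt": "budget approved ."},
        {"src": "researchers released a new summarization dataset .", "tgt": "dataset released ."},
    ]
    test = [{"src": item["src"]} for item in train]
    return {"train_ds": train, "test_ds": test}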
Example #8
def BundesSummarizationDataset(top_n=-1,
                               validation=False,
                               prepare_extractive=True,
                               language='german',
                               CSV_PATH=None):
    """Load the bundes dataset by faktual."""

    if CSV_PATH is None:
        CSV_PATH = '/home/ubuntu/mnt/data/bundes_dataset/csv/bundes_data.csv'
        # FILE_NAME = "bundes_data.csv"

    train = pandas.read_csv(CSV_PATH).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [str(item[0]) for item in train]
    summary = [str(item[1]) for item in train]

    print("source[0]: ", source[0])
    print("summary[0]: ", summary[0])
    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123)

    if prepare_extractive:
        if validation:
            train_source, validation_source, train_summary, validation_summary = train_test_split(
                train_source,
                train_summary,
                train_size=0.9,
                test_size=0.1,
                random_state=123)
            return (
                SummarizationDataset(
                    source_file=None,
                    source=train_source,
                    target=train_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        _clean,
                        _remove_ttags,
                        _target_sentence_tokenization,
                    ],
                    word_tokenize=nltk.word_tokenize,
                    top_n=top_n,
                    language=language,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=validation_source,
                    target=validation_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        _clean,
                        _remove_ttags,
                        _target_sentence_tokenization,
                    ],
                    word_tokenize=nltk.word_tokenize,
                    top_n=top_n,
                    language=language,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=test_source,
                    target=test_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        _clean,
                        _remove_ttags,
                        _target_sentence_tokenization,
                    ],
                    word_tokenize=nltk.word_tokenize,
                    top_n=top_n,
                    language=language,
                ),
            )
        else:
            return (
                SummarizationDataset(
                    source_file=None,
                    source=train_source,
                    target=train_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        _clean,
                        _remove_ttags,
                        _target_sentence_tokenization,
                    ],
                    word_tokenize=nltk.word_tokenize,
                    top_n=top_n,
                    language=language,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=test_source,
                    target=test_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        _clean,
                        _remove_ttags,
                        _target_sentence_tokenization,
                    ],
                    word_tokenize=nltk.word_tokenize,
                    top_n=top_n,
                    language=language,
                ),
            )
    else:
        if validation:
            train_source, validation_source, train_summary, validation_summary = train_test_split(
                train_source,
                train_summary,
                train_size=0.9,
                test_size=0.1,
                random_state=123)
            return (
                SummarizationDataset(
                    source_file=None,
                    source=train_source,
                    target=train_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        tokenize.sent_tokenize,
                    ],
                    top_n=top_n,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=validation_source,
                    target=validation_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        tokenize.sent_tokenize,
                    ],
                    top_n=top_n,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=test_source,
                    target=test_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        tokenize.sent_tokenize,
                    ],
                    top_n=top_n,
                ),
            )
        else:
            return (
                SummarizationDataset(
                    source_file=None,
                    source=train_source,
                    target=train_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        tokenize.sent_tokenize,
                    ],
                    top_n=top_n,
                ),
                SummarizationDataset(
                    source_file=None,
                    source=test_source,
                    target=test_summary,
                    source_preprocessing=[tokenize.sent_tokenize],
                    target_preprocessing=[
                        tokenize.sent_tokenize,
                    ],
                    top_n=top_n,
                ),
            )
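A hedged usage sketch; the CSV path is an assumption and must point to a file whose first column is the article text and whose second column is the summary.

# Hypothetical call with a local CSV; prepares extractive-style targets in German.
train_ds, validation_ds, test_ds = BundesSummarizationDataset(
    top_n=500,
    validation=True,
    prepare_extractive=True,
    language="german",
    CSV_PATH="./bundes_data.csv",
)
print(len(train_ds), len(validation_ds), len(test_ds))  # len() support is assumed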