def _setup_datasets(url, top_n=-1, local_cache_path=".data"):
    FILE_NAME = "cnndm.tar.gz"
    maybe_download(url, FILE_NAME, local_cache_path)
    dataset_tar = os.path.join(local_cache_path, FILE_NAME)
    extracted_files = extract_archive(dataset_tar)

    # Locate the source/target files for the train and test splits among the
    # extracted archive members.
    for fname in extracted_files:
        if fname.endswith("train.txt.src"):
            train_source_file = fname
        if fname.endswith("train.txt.tgt.tagged"):
            train_target_file = fname
        if fname.endswith("test.txt.src"):
            test_source_file = fname
        if fname.endswith("test.txt.tgt.tagged"):
            test_target_file = fname

    return (
        SummarizationDataset(
            train_source_file,
            train_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
        SummarizationDataset(
            test_source_file,
            test_target_file,
            [_clean, tokenize.sent_tokenize],
            [_clean, _remove_ttags, _target_sentence_tokenization],
            nltk.word_tokenize,
            top_n,
        ),
    )
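# A minimal usage sketch for the helper above; illustrative only and not part
# of the original module. The example function name and the URL are
# hypothetical placeholders for whatever the calling code actually passes in.
def _example_setup_datasets_usage():
    """Sketch: download and extract the CNN/DM archive from a caller-supplied
    URL and build train/test SummarizationDataset objects from it."""
    cnndm_url = "https://example.com/cnndm.tar.gz"  # hypothetical URL
    train_ds, test_ds = _setup_datasets(cnndm_url, top_n=1000, local_cache_path=".data")
    return train_ds, test_ds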
def test_dataset_for_bertsumabs(tmp_module):
    source = source_data()
    target = target_data()

    source_file = os.path.join(tmp_module, "source.txt")
    target_file = os.path.join(tmp_module, "target.txt")
    with open(source_file, "w") as f:
        for i in source:
            f.write(" ".join(i))
            f.write("\n")
    with open(target_file, "w") as f:
        for i in target:
            f.write(" ".join(i))
            f.write("\n")

    train_dataset = SummarizationDataset(
        source_file=source_file,
        target_file=target_file,
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
    )
    test_dataset = SummarizationDataset(
        source_file=source_file,
        target_file=target_file,
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
    )

    processor = BertSumAbsProcessor(cache_dir=tmp_module)
    batch = processor.collate(train_dataset, 512, "cuda:0")
    assert len(batch.src) == 3
    return train_dataset, test_dataset
def data(tmp_module):
    source = source_data()
    target = target_data()

    train_dataset = SummarizationDataset(
        None,
        source=[source],
        target=[target],
        source_preprocessing=[tokenize.sent_tokenize],
        target_preprocessing=[tokenize.sent_tokenize],
        word_tokenize=nltk.word_tokenize,
    )
    test_dataset = SummarizationDataset(
        None,
        source=[source],
        source_preprocessing=[tokenize.sent_tokenize],
        word_tokenize=nltk.word_tokenize,
    )

    processor = ExtSumProcessor(
        model_name=MODEL_NAME,
        cache_dir=tmp_module,
        max_nsents=200,
        max_src_ntokens=2000,
        min_nsents=0,
        min_src_ntokens=1,
    )
    ext_sum_train = processor.preprocess(train_dataset, oracle_mode="greedy")
    ext_sum_test = processor.preprocess(test_dataset, oracle_mode="greedy")
    return processor, ext_sum_train, ext_sum_test
def data_to_file(tmp_module):
    source = source_data()
    target = target_data()

    source_file = os.path.join(tmp_module, "source.txt")
    target_file = os.path.join(tmp_module, "target.txt")
    with open(source_file, "w") as f:
        f.write(source)
    with open(target_file, "w") as f:
        f.write(target)

    train_dataset = SummarizationDataset(
        source_file,
        target_file,
        [tokenize.sent_tokenize],
        [tokenize.sent_tokenize],
        nltk.word_tokenize,
    )
    test_dataset = SummarizationDataset(
        source_file,
        target_file,
        [tokenize.sent_tokenize],
        [tokenize.sent_tokenize],
        nltk.word_tokenize,
    )

    processor = ExtSumProcessor(
        model_name=MODEL_NAME,
        cache_dir=tmp_module,
        max_nsents=200,
        max_src_ntokens=2000,
        min_nsents=0,
        min_src_ntokens=1,
    )
    ext_sum_train = processor.preprocess(
        train_dataset, train_dataset.get_target(), oracle_mode="greedy"
    )
    ext_sum_test = processor.preprocess(
        test_dataset, test_dataset.get_target(), oracle_mode="greedy"
    )

    save_path = os.path.join(tmp_module, "processed")
    train_files = ExtSumProcessedData.save_data(
        ext_sum_train, is_test=False, save_path=save_path, chunk_size=2000
    )
    test_files = ExtSumProcessedData.save_data(
        ext_sum_test, is_test=True, save_path=save_path, chunk_size=2000
    )
    print(train_files)
    print(test_files)
    assert os.path.exists(train_files[0])
    assert os.path.exists(test_files[0])
    return save_path
def CNNDMSummarizationDatasetOrg(local_path=".", top_n=-1, return_iterable=False, return_dev_data=False):
    """
    Downloads a version of the CNN/DailyMail dataset with minimal processing
    from https://github.com/microsoft/unilm/tree/master/unilm-v1. This version
    of the CNN/DM dataset was originally downloaded from
    https://github.com/harvardnlp/sent-summary and preprocessed following
    https://github.com/abisee/cnn-dailymail.

    Args:
        local_path (str): Path to store the downloaded data. If the data file
            doesn't exist in this path, it's downloaded and unzipped.
        top_n (int): Number of lines to read. Defaults to -1, in which case the
            entire dataset is read.
        return_iterable (bool): If False, returns SummarizationDataset.
            If True, returns IterableSummarizationDataset. Defaults to False.
        return_dev_data (bool): If False, returns train and test data splits.
            If True, returns train, test, and dev data splits. Defaults to False.

    Returns:
        tuple: Tuple containing train, test (, and dev) datasets.
    """
    # Download and unzip the data
    FILE_ID = "1jiDbDbAsqy_5BM79SmX6aSu5DQVCAZq1"
    FILE_NAME = "cnndm_data.zip"
    output_dir = os.path.join(local_path, "cnndm_data")
    os.makedirs(output_dir, exist_ok=True)

    # This folder contains a version of the dataset with minimal processing.
    org_data_dir = os.path.join(output_dir, "org_data")

    expected_data_files = set([
        "train.src",
        "org_data",
        "dev.src",
        "test.tgt",
        "train.tgt",
        "dev.tgt",
        "test.src",
    ])
    expected_org_data_files = set([
        "training.summary",
        "test.article",
        "dev.article",
        "training.article",
        "dev.summary",
        "test.summary",
    ])

    maybe_download_googledrive(google_file_id=FILE_ID, file_name=FILE_NAME, work_directory=local_path)
    if (
        set(os.listdir(output_dir)) != expected_data_files
        or set(os.listdir(org_data_dir)) != expected_org_data_files
    ):
        extract_zip(
            file_path=os.path.join(local_path, FILE_NAME),
            dest_path=output_dir,
        )

    train_source_file = os.path.join(org_data_dir, "training.article")
    train_target_file = os.path.join(org_data_dir, "training.summary")
    test_source_file = os.path.join(org_data_dir, "test.article")
    test_target_file = os.path.join(org_data_dir, "test.summary")
    dev_source_file = os.path.join(org_data_dir, "dev.article")
    dev_target_file = os.path.join(org_data_dir, "dev.summary")

    source_preprocessing = [detokenize]
    target_preprocessing = [detokenize]

    # The iterable and map-style variants take the same arguments; only the
    # dataset class differs.
    dataset_class = IterableSummarizationDataset if return_iterable else SummarizationDataset

    train_dataset = dataset_class(
        source_file=train_source_file,
        target_file=train_target_file,
        source_preprocessing=source_preprocessing,
        target_preprocessing=target_preprocessing,
        top_n=top_n,
    )
    test_dataset = dataset_class(
        source_file=test_source_file,
        target_file=test_target_file,
        source_preprocessing=source_preprocessing,
        target_preprocessing=target_preprocessing,
        top_n=top_n,
    )

    if return_dev_data:
        dev_dataset = dataset_class(
            source_file=dev_source_file,
            target_file=dev_target_file,
            source_preprocessing=source_preprocessing,
            target_preprocessing=target_preprocessing,
            top_n=top_n,
        )
        return train_dataset, test_dataset, dev_dataset

    return train_dataset, test_dataset
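# A minimal usage sketch for CNNDMSummarizationDatasetOrg above; illustrative
# only and not part of the original module. The example function name is
# introduced here, and the argument values are arbitrary.
def _example_cnndm_org_usage(local_path="."):
    """Sketch: load a small slice of the minimally processed CNN/DM data,
    including the dev split."""
    train_ds, test_ds, dev_ds = CNNDMSummarizationDatasetOrg(
        local_path=local_path,
        top_n=100,              # read only the first 100 lines
        return_iterable=False,  # plain SummarizationDataset objects
        return_dev_data=True,   # also return the dev split
    )
    return train_ds, test_ds, dev_ds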
def SwissSummarizationDataset(top_n=-1, validation=False):
    """Load the Swiss summarization dataset hosted on drive.switch.ch and split
    it into train/test (and optionally validation) datasets."""
    URLS = [
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_train.csv",
        "https://drive.switch.ch/index.php/s/YoyW9S8yml7wVhN/download?path=%2F&files=data_test.csv",
    ]
    LOCAL_CACHE_PATH = ".data"
    FILE_NAME = "data_train.csv"

    # Only the training CSV is downloaded; the test (and optional validation)
    # splits are carved out of it below.
    maybe_download(URLS[0], FILE_NAME, LOCAL_CACHE_PATH)
    dataset_path = os.path.join(LOCAL_CACHE_PATH, FILE_NAME)

    train = pandas.read_csv(dataset_path).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [item[0] for item in train]
    summary = [item[1] for item in train]

    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123
    )

    def make_dataset(src, tgt):
        return SummarizationDataset(
            source_file=None,
            source=src,
            target=tgt,
            source_preprocessing=[tokenize.sent_tokenize],
            target_preprocessing=[tokenize.sent_tokenize],
            top_n=top_n,
        )

    if validation:
        train_source, validation_source, train_summary, validation_summary = train_test_split(
            train_source, train_summary, train_size=0.9, test_size=0.1, random_state=123
        )
        return (
            make_dataset(train_source, train_summary),
            make_dataset(validation_source, validation_summary),
            make_dataset(test_source, test_summary),
        )

    return (
        make_dataset(train_source, train_summary),
        make_dataset(test_source, test_summary),
    )
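# A minimal usage sketch for SwissSummarizationDataset above; illustrative only
# and not part of the original module. The example function name is introduced
# here, and the argument values are arbitrary.
def _example_swiss_dataset_usage():
    """Sketch: request the optional validation split, which makes the loader
    return three datasets instead of two."""
    train_ds, validation_ds, test_ds = SwissSummarizationDataset(top_n=500, validation=True)
    return train_ds, validation_ds, test_ds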
def test_S2SAbsSumProcessor(s2s_test_data, tmp):
    expected_output_length = 4

    # prepare files for testing
    train_source_file = os.path.join(tmp, "train.src")
    train_target_file = os.path.join(tmp, "train.tgt")
    test_source_file = os.path.join(tmp, "test.src")
    train_json_file = os.path.join(tmp, "train.json")
    test_json_file = os.path.join(tmp, "test.json")

    with open(train_source_file, "w") as src_file, open(train_target_file, "w") as tgt_file:
        for item in s2s_test_data["train_ds"]:
            src_file.write(item["src"] + "\n")
            tgt_file.write(item["tgt"] + "\n")

    with open(test_source_file, "w") as src_file:
        for item in s2s_test_data["test_ds"]:
            src_file.write(item["src"] + "\n")

    train_iterable_sum_ds = IterableSummarizationDataset(
        source_file=train_source_file, target_file=train_target_file
    )
    test_iterable_sum_ds = IterableSummarizationDataset(source_file=test_source_file)

    train_sum_ds = SummarizationDataset(
        source_file=train_source_file, target_file=train_target_file
    )
    test_sum_ds = SummarizationDataset(source_file=test_source_file)

    train_sum_ds.save_to_jsonl(train_json_file)
    test_sum_ds.save_to_jsonl(test_json_file)

    processor = S2SAbsSumProcessor(cache_dir=tmp)

    train_json_output = processor.s2s_dataset_from_json_or_file(
        input_data=s2s_test_data["train_ds"], train_mode=True
    )
    test_json_output = processor.s2s_dataset_from_json_or_file(
        input_data=s2s_test_data["test_ds"], train_mode=False
    )
    assert len(train_json_output) == expected_output_length
    assert len(test_json_output) == expected_output_length

    train_file_output = processor.s2s_dataset_from_json_or_file(
        input_data=train_json_file, train_mode=True
    )
    test_file_output = processor.s2s_dataset_from_json_or_file(
        input_data=test_json_file, train_mode=False
    )
    assert len(train_file_output) == expected_output_length
    assert len(test_file_output) == expected_output_length

    train_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(
        sum_ds=train_iterable_sum_ds, train_mode=True
    )
    test_iterable_sum_ds_output = processor.s2s_dataset_from_iterable_sum_ds(
        sum_ds=test_iterable_sum_ds, train_mode=False
    )
    assert len(train_iterable_sum_ds_output) == expected_output_length
    assert len(test_iterable_sum_ds_output) == expected_output_length

    train_sum_ds_output = processor.s2s_dataset_from_sum_ds(sum_ds=train_sum_ds, train_mode=True)
    test_sum_ds_output = processor.s2s_dataset_from_sum_ds(sum_ds=test_sum_ds, train_mode=False)
    assert len(train_sum_ds_output) == expected_output_length
    assert len(test_sum_ds_output) == expected_output_length
def BundesSummarizationDataset(top_n=-1, validation=False, prepare_extractive=True, language="german", CSV_PATH=None):
    """Load the Bundes dataset provided by faktual."""
    if CSV_PATH is None:
        CSV_PATH = "/home/ubuntu/mnt/data/bundes_dataset/csv/bundes_data.csv"
        # FILE_NAME = "bundes_data.csv"

    train = pandas.read_csv(CSV_PATH).values.tolist()
    if top_n != -1:
        train = train[0:top_n]
    source = [str(item[0]) for item in train]
    summary = [str(item[1]) for item in train]
    print("source[0]: ", source[0])
    print("summary[0]: ", summary[0])

    train_source, test_source, train_summary, test_summary = train_test_split(
        source, summary, train_size=0.95, test_size=0.05, random_state=123
    )

    if prepare_extractive:
        # Extractive variant: targets are cleaned, tag-stripped, and
        # sentence-tokenized, and a word tokenizer is supplied.
        def make_dataset(src, tgt):
            return SummarizationDataset(
                source_file=None,
                source=src,
                target=tgt,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[
                    _clean,
                    _remove_ttags,
                    _target_sentence_tokenization,
                ],
                word_tokenize=nltk.word_tokenize,
                top_n=top_n,
                language=language,
            )
    else:
        # Abstractive variant: sentence-tokenize both source and target.
        def make_dataset(src, tgt):
            return SummarizationDataset(
                source_file=None,
                source=src,
                target=tgt,
                source_preprocessing=[tokenize.sent_tokenize],
                target_preprocessing=[tokenize.sent_tokenize],
                top_n=top_n,
            )

    if validation:
        train_source, validation_source, train_summary, validation_summary = train_test_split(
            train_source, train_summary, train_size=0.9, test_size=0.1, random_state=123
        )
        return (
            make_dataset(train_source, train_summary),
            make_dataset(validation_source, validation_summary),
            make_dataset(test_source, test_summary),
        )

    return (
        make_dataset(train_source, train_summary),
        make_dataset(test_source, test_summary),
    )
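# A minimal usage sketch for BundesSummarizationDataset above; illustrative
# only and not part of the original module. The example function name and the
# CSV path are hypothetical placeholders.
def _example_bundes_dataset_usage():
    """Sketch: load the extractive variant with a validation split from a
    caller-supplied CSV file."""
    train_ds, validation_ds, test_ds = BundesSummarizationDataset(
        top_n=1000,
        validation=True,
        prepare_extractive=True,
        language="german",
        CSV_PATH="/path/to/bundes_data.csv",  # hypothetical path
    )
    return train_ds, validation_ds, test_ds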