def run(self, args):
    self._notify_about_license()

    # Download the dataset so each individual setup does not need
    # to repeat this. The dataset comes from
    # http://multiling.iit.demokritos.gr/file/view/353/tac-2011-multiling-pilot-dataset-all-files-source-texts-human-and-system-summaries-evaluation-data
    data_path = f'{args.output_dir}/PublishedCorpusFileSet.zip'
    download_url_to_file('http://multiling.iit.demokritos.gr/file/download/353', data_path)

    task.setup(data_path, args.output_dir)
    metrics.setup(data_path, args.output_dir)
def download_system_outputs(output_dir: str, force: bool) -> str:
    expanded_dir = f'{output_dir}/expanded/'
    for system_id in range(0, 24):
        # Download each system's output archive
        url = f'https://storage.googleapis.com/sfr-summarization-repo-research/M{system_id}.tar.gz'
        file_path = f'{output_dir}/M{system_id}.tar.gz'
        download_url_to_file(url, file_path, force)

        # Remove any previously expanded copy if a refresh is forced,
        # then extract the archive if it has not been expanded yet
        model_expanded_dir = f'{expanded_dir}/M{system_id}'
        if os.path.exists(model_expanded_dir) and force:
            shutil.rmtree(model_expanded_dir)
        if not os.path.exists(model_expanded_dir):
            with tarfile.open(file_path, 'r') as tar:
                tar.extractall(expanded_dir)
    return expanded_dir
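# Note: `download_url_to_file` is called throughout this section but is not
# defined in it. The sketch below is a minimal, assumed implementation
# inferred from its call sites (a positional or keyword `force` argument,
# existing files reused unless forced); the repository's actual helper may
# differ.
import os
import urllib.request


def download_url_to_file(url: str, file_path: str, force: bool = False) -> None:
    # Reuse an existing file unless a re-download is forced
    if os.path.exists(file_path) and not force:
        return
    # Ensure the target directory exists before downloading
    dirname = os.path.dirname(file_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    urllib.request.urlretrieve(url, file_path)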
def setup(output_dir: str, force: bool = False) -> None:
    download_raw_data(output_dir, force)
    abs_raw_data, ext_raw_data = load_all_data(output_dir)

    # Download the REALSumm score dictionaries for the abstractive and
    # extractive systems
    download_url_to_file(
        'https://github.com/neulab/REALSumm/raw/master/scores_dicts/abs.pkl',
        f'{output_dir}/raw/abs.pkl',
        force=force)
    download_url_to_file(
        'https://github.com/neulab/REALSumm/raw/master/scores_dicts/ext.pkl',
        f'{output_dir}/raw/ext.pkl',
        force=force)
    with open(f'{output_dir}/raw/abs.pkl', 'rb') as f:
        abstractive = pickle.load(f)
    with open(f'{output_dir}/raw/ext.pkl', 'rb') as f:
        extractive = pickle.load(f)

    # Keep only the summaries which were assigned scores
    abs_filtered = filter_to_scored_summaries(abs_raw_data, abstractive)
    ext_filtered = filter_to_scored_summaries(ext_raw_data, extractive)

    abs_instances, abs_metrics_list = convert_to_sacrerouge_instances_and_metrics(
        abstractive, abs_filtered, 'abs')
    ext_instances, ext_metrics_list = convert_to_sacrerouge_instances_and_metrics(
        extractive, ext_filtered, 'ext')

    # Save the scored summaries and their metrics, separately and combined
    save_to_jsonl(abs_instances, f'{output_dir}/summaries-abs.jsonl')
    save_to_jsonl(ext_instances, f'{output_dir}/summaries-ext.jsonl')
    save_to_jsonl(abs_instances + ext_instances, f'{output_dir}/summaries-mix.jsonl')
    save_to_jsonl(abs_metrics_list, f'{output_dir}/metrics-abs.jsonl')
    save_to_jsonl(ext_metrics_list, f'{output_dir}/metrics-ext.jsonl')
    save_to_jsonl(abs_metrics_list + ext_metrics_list, f'{output_dir}/metrics-mix.jsonl')

    # Save all of the system outputs, including the unscored ones
    all_abs = collect_all_outputs(abs_raw_data)
    all_ext = collect_all_outputs(ext_raw_data)
    all_mix = collect_all_outputs({**abs_raw_data, **ext_raw_data})
    save_to_jsonl(all_abs, f'{output_dir}/all-summaries-abs.jsonl.gz')
    save_to_jsonl(all_ext, f'{output_dir}/all-summaries-ext.jsonl.gz')
    save_to_jsonl(all_mix, f'{output_dir}/all-summaries-mix.jsonl.gz')

    print_stats(all_mix)
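# Note: `save_to_jsonl` is also assumed rather than shown. A minimal sketch,
# inferred from its call sites above (it receives both ".jsonl" and
# ".jsonl.gz" paths): write one JSON object per line, gzipping the output
# when the path ends in ".gz". The real helper may differ.
import gzip
import json


def save_to_jsonl(items: list, file_path: str) -> None:
    # Choose the opener based on the file extension
    open_fn = gzip.open if file_path.endswith('.gz') else open
    with open_fn(file_path, 'wt') as out:
        for item in items:
            out.write(json.dumps(item) + '\n')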
def _download_ids(output_dir: str, force: bool) -> None:
    # Download the Multi-News train/val/test ID lists
    for split in ['train', 'val', 'test']:
        download_url_to_file(
            f'https://raw.githubusercontent.com/Alex-Fabbri/Multi-News/master/data/ids/{split}.id',
            f'{output_dir}/{split}.id',
            force=force)
def _download_raw_data(output_dir: str, force: bool) -> None:
    # Download the annotation data files from the BLANC repository
    for file_name in ['CNN_DailyMail_555.json', 'DailyNews_300.json', 'DailyNews_300_aspects.json']:
        download_url_to_file(
            f'https://github.com/PrimerAI/blanc/raw/master/data/{file_name}',
            f'{output_dir}/raw/{file_name}',
            force=force)
def setup(ldc2008t19_tgz: str, output_dir: str, force: bool) -> None:
    # Download the splits from https://github.com/jiacheng-xu/DiscoBERT/tree/release/data_preparation/urls_nyt
    for split in ['train', 'valid', 'test']:
        download_url_to_file(
            f'https://github.com/jiacheng-xu/DiscoBERT/raw/release/data_preparation/urls_nyt/mapping_{split}.txt',
            f'{output_dir}/raw/mapping_{split}.txt',
            force=force)

    # Load the document IDs which belong to each split
    train_ids = set(map(int, open(f'{output_dir}/raw/mapping_train.txt', 'r').read().splitlines()))
    valid_ids = set(map(int, open(f'{output_dir}/raw/mapping_valid.txt', 'r').read().splitlines()))
    test_ids = set(map(int, open(f'{output_dir}/raw/mapping_test.txt', 'r').read().splitlines()))
    all_ids = train_ids | valid_ids | test_ids

    train, valid, test = [], [], []
    with tarfile.open(ldc2008t19_tgz, 'r') as tar:
        # Collect the inner archives up front so the tqdm progress bar has a total
        members = []
        for member in tar.getmembers():
            if not member.name.startswith('nyt_corpus/data') or not member.name.endswith('.tgz'):
                continue
            members.append(member)

        for member in tqdm(members):
            # Each member is itself a .tgz of XML files, so open it in memory
            tar_bytes = tar.extractfile(member).read()
            with tarfile.open(fileobj=BytesIO(tar_bytes)) as inner_tar:
                for inner_member in inner_tar.getmembers():
                    if not inner_member.name.endswith('.xml'):
                        continue
                    # e.g., "01/26/1459527.xml" -> 1459527
                    file_id = int(inner_member.name.split('/')[-1][:-4])
                    if file_id in all_ids:
                        xml_bytes = inner_tar.extractfile(inner_member).read()
                        instance = _extract_data(xml_bytes)
                        if file_id in train_ids:
                            train.append(instance)
                        elif file_id in valid_ids:
                            valid.append(instance)
                        elif file_id in test_ids:
                            test.append(instance)

    # Sanity check against the expected split sizes
    assert len(train) == 137778, f'Train has {len(train)} instances, expected 137778'
    assert len(valid) == 17222, f'Valid has {len(valid)} instances, expected 17222'
    assert len(test) == 17223, f'Test has {len(test)} instances, expected 17223'

    _save(train, f'{output_dir}/train.jsonl.gz')
    _save(valid, f'{output_dir}/valid.jsonl.gz')
    _save(test, f'{output_dir}/test.jsonl.gz')
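# Note: `_extract_data` is called above but its implementation is not shown
# here. The sketch below is a hypothetical version based on the standard NYT
# Annotated Corpus XML layout (abstract under <abstract>, body paragraphs in
# the "full_text" block); the real function and its return format may differ.
from xml.etree import ElementTree


def _extract_data(xml_bytes: bytes) -> dict:
    root = ElementTree.fromstring(xml_bytes)
    # The human-written abstract serves as the reference summary
    abstract = root.findtext('.//abstract/p')
    # The article body is stored as paragraphs in the "full_text" block
    paragraphs = [
        p.text for p in root.findall('.//block[@class="full_text"]/p') if p.text
    ]
    return {'summary': abstract, 'document': paragraphs}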
def download_human_judgments(output_dir: str, force: bool) -> str:
    url = 'https://storage.googleapis.com/sfr-summarization-repo-research/model_annotations.aligned.jsonl'
    file_path = f'{output_dir}/model_annotations.aligned.jsonl'
    download_url_to_file(url, file_path, force)
    return file_path
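# A hypothetical usage sketch (not part of the original source):
# `download_system_outputs` and `download_human_judgments` together fetch
# what appear to be the SummEval system outputs and their aligned human
# judgments into one directory. The `output_dir` value is an assumed example
# path.
if __name__ == '__main__':
    output_dir = 'datasets/summeval'  # assumed example path
    expanded_dir = download_system_outputs(output_dir, force=False)
    judgments_path = download_human_judgments(output_dir, force=False)
    print(f'System outputs expanded to {expanded_dir}')
    print(f'Human judgments saved to {judgments_path}')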