Example #1
    def run(self, args):
        self._notify_about_license()

        # Download the dataset so each individual setup does not need
        # to repeat this. The dataset comes from
        # http://multiling.iit.demokritos.gr/file/view/353/tac-2011-multiling-pilot-dataset-all-files-source-texts-human-and-system-summaries-evaluation-data
        data_path = f'{args.output_dir}/PublishedCorpusFileSet.zip'
        download_url_to_file(
            'http://multiling.iit.demokritos.gr/file/download/353', data_path)

        task.setup(data_path, args.output_dir)
        metrics.setup(data_path, args.output_dir)
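Every example here depends on the same download_url_to_file(url, file_path, force) helper. As a point of reference, here is a minimal sketch of what such a helper might look like, assuming it skips files that already exist unless force is set (the real implementation may differ):

import os

import requests


def download_url_to_file(url: str, file_path: str, force: bool = False) -> None:
    # Sketch only: skip the download if the file already exists and force is False.
    if os.path.exists(file_path) and not force:
        return

    # Create the parent directory if the path contains one.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Stream the response to disk in 1 MiB chunks.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(file_path, 'wb') as out:
        for chunk in response.iter_content(chunk_size=1 << 20):
            out.write(chunk)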
Example #2
def download_system_outputs(output_dir: str, force: bool) -> str:
    expanded_dir = f'{output_dir}/expanded'
    # Each of the 24 systems' outputs is stored in its own tarball.
    for system_id in range(24):
        url = f'https://storage.googleapis.com/sfr-summarization-repo-research/M{system_id}.tar.gz'
        file_path = f'{output_dir}/M{system_id}.tar.gz'
        download_url_to_file(url, file_path, force)

        model_expanded_dir = f'{expanded_dir}/M{system_id}'
        if os.path.exists(model_expanded_dir) and force:
            shutil.rmtree(model_expanded_dir)

        if not os.path.exists(model_expanded_dir):
            with tarfile.open(file_path, 'r') as tar:
                tar.extractall(expanded_dir)
    return expanded_dir
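For reference, a hypothetical call site (the directory name is an assumption):

expanded = download_system_outputs('summeval_output', force=False)
# Each tarball is assumed to expand to an M{id} directory.
print(os.listdir(expanded))  # expect subdirectories M0 through M23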
Example #3
def setup(output_dir: str, force: bool = False) -> None:
    download_raw_data(output_dir, force)
    abs_raw_data, ext_raw_data = load_all_data(output_dir)

    download_url_to_file(
        'https://github.com/neulab/REALSumm/raw/master/scores_dicts/abs.pkl',
        f'{output_dir}/raw/abs.pkl',
        force=force)
    download_url_to_file(
        'https://github.com/neulab/REALSumm/raw/master/scores_dicts/ext.pkl',
        f'{output_dir}/raw/ext.pkl',
        force=force)

    with open(f'{output_dir}/raw/abs.pkl', 'rb') as f:
        abstractive = pickle.load(f)
    with open(f'{output_dir}/raw/ext.pkl', 'rb') as f:
        extractive = pickle.load(f)

    abs_filtered = filter_to_scored_summaries(abs_raw_data, abstractive)
    ext_filtered = filter_to_scored_summaries(ext_raw_data, extractive)

    abs_instances, abs_metrics_list = convert_to_sacrerouge_instances_and_metrics(
        abstractive, abs_filtered, 'abs')
    ext_instances, ext_metrics_list = convert_to_sacrerouge_instances_and_metrics(
        extractive, ext_filtered, 'ext')

    save_to_jsonl(abs_instances, f'{output_dir}/summaries-abs.jsonl')
    save_to_jsonl(ext_instances, f'{output_dir}/summaries-ext.jsonl')
    save_to_jsonl(abs_instances + ext_instances,
                  f'{output_dir}/summaries-mix.jsonl')

    save_to_jsonl(abs_metrics_list, f'{output_dir}/metrics-abs.jsonl')
    save_to_jsonl(ext_metrics_list, f'{output_dir}/metrics-ext.jsonl')
    save_to_jsonl(abs_metrics_list + ext_metrics_list,
                  f'{output_dir}/metrics-mix.jsonl')

    all_abs = collect_all_outputs(abs_raw_data)
    all_ext = collect_all_outputs(ext_raw_data)
    all_mix = collect_all_outputs({**abs_raw_data, **ext_raw_data})

    save_to_jsonl(all_abs, f'{output_dir}/all-summaries-abs.jsonl.gz')
    save_to_jsonl(all_ext, f'{output_dir}/all-summaries-ext.jsonl.gz')
    save_to_jsonl(all_mix, f'{output_dir}/all-summaries-mix.jsonl.gz')

    print_stats(all_mix)
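save_to_jsonl is called with both .jsonl and .jsonl.gz paths above, which suggests it chooses plain or gzipped output from the file extension. A minimal sketch under that assumption:

import gzip
import json
from typing import List


def save_to_jsonl(instances: List[dict], file_path: str) -> None:
    # Sketch only: pick gzip or plain-text output based on the extension.
    open_fn = gzip.open if file_path.endswith('.gz') else open
    with open_fn(file_path, 'wt') as out:
        for instance in instances:
            out.write(json.dumps(instance) + '\n')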
Example #4
def _download_ids(output_dir: str, force: bool) -> None:
    # The train/val/test document IDs are published in the Multi-News
    # GitHub repository.
    for split in ['train', 'val', 'test']:
        download_url_to_file(
            f'https://raw.githubusercontent.com/Alex-Fabbri/Multi-News/master/data/ids/{split}.id',
            f'{output_dir}/{split}.id',
            force=force)
Example #5
def _download_raw_data(output_dir: str, force: bool) -> None:
    # The raw data files are hosted in the BLANC GitHub repository.
    for file_name in [
            'CNN_DailyMail_555.json', 'DailyNews_300.json',
            'DailyNews_300_aspects.json'
    ]:
        download_url_to_file(
            f'https://github.com/PrimerAI/blanc/raw/master/data/{file_name}',
            f'{output_dir}/raw/{file_name}',
            force=force)
Example #6
def setup(ldc2008t19_tgz: str, output_dir: str, force: bool) -> None:
    # Download the splits from https://github.com/jiacheng-xu/DiscoBERT/tree/release/data_preparation/urls_nyt
    for split in ['train', 'valid', 'test']:
        download_url_to_file(
            f'https://github.com/jiacheng-xu/DiscoBERT/raw/release/data_preparation/urls_nyt/mapping_{split}.txt',
            f'{output_dir}/raw/mapping_{split}.txt',
            force=force)

    ids = {}
    for split in ['train', 'valid', 'test']:
        with open(f'{output_dir}/raw/mapping_{split}.txt', 'r') as f:
            ids[split] = set(map(int, f.read().splitlines()))
    train_ids, valid_ids, test_ids = ids['train'], ids['valid'], ids['test']
    all_ids = train_ids | valid_ids | test_ids

    train, valid, test = [], [], []
    with tarfile.open(ldc2008t19_tgz, 'r') as tar:
        # Collect the relevant inner tarballs up front so tqdm can report an
        # accurate total.
        members = []
        for member in tar.getmembers():
            if not member.name.startswith(
                    'nyt_corpus/data') or not member.name.endswith('.tgz'):
                continue
            members.append(member)

        for member in tqdm(members):
            tar_bytes = tar.extractfile(member).read()
            with tarfile.open(fileobj=BytesIO(tar_bytes)) as inner_tar:
                for inner_member in inner_tar.getmembers():
                    if not inner_member.name.endswith('.xml'):
                        continue

                    # "01/26/1459527.xml"
                    file_id = int(inner_member.name.split('/')[-1][:-4])
                    if file_id in all_ids:
                        xml_bytes = inner_tar.extractfile(inner_member).read()
                        instance = _extract_data(xml_bytes)

                        if file_id in train_ids:
                            train.append(instance)
                        elif file_id in valid_ids:
                            valid.append(instance)
                        elif file_id in test_ids:
                            test.append(instance)

    assert len(train) == 137778, \
        f'Train has {len(train)} instances, expected 137778'
    assert len(valid) == 17222, \
        f'Valid has {len(valid)} instances, expected 17222'
    assert len(test) == 17223, \
        f'Test has {len(test)} instances, expected 17223'

    _save(train, f'{output_dir}/train.jsonl.gz')
    _save(valid, f'{output_dir}/valid.jsonl.gz')
    _save(test, f'{output_dir}/test.jsonl.gz')
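_extract_data is not shown. The NYT Annotated Corpus stores each article as NITF XML, so a rough, hypothetical sketch might pull the abstract and full text like this (the field names and return format are assumptions, not the real function):

import xml.etree.ElementTree as ET


def _extract_data(xml_bytes: bytes) -> dict:
    # Hypothetical sketch: parse the NITF XML and collect the abstract
    # paragraphs and the "full_text" body paragraphs.
    root = ET.fromstring(xml_bytes)

    abstract = root.find('.//abstract')
    summary = ' '.join(p.text or '' for p in abstract.iter('p')) \
        if abstract is not None else None

    document = [
        p.text or ''
        for block in root.iter('block') if block.get('class') == 'full_text'
        for p in block.iter('p')
    ]
    return {'summary': summary, 'document': document}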
Example #7
def download_human_judgments(output_dir: str, force: bool) -> str:
    url = 'https://storage.googleapis.com/sfr-summarization-repo-research/model_annotations.aligned.jsonl'
    file_path = f'{output_dir}/model_annotations.aligned.jsonl'
    download_url_to_file(url, file_path, force)
    return file_path
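The returned file is in JSON Lines format, so each annotation can be read as one JSON object per line; a small usage example (the directory name is an assumption):

import json

judgments_path = download_human_judgments('summeval_output', force=False)
with open(judgments_path, 'r') as f:
    annotations = [json.loads(line) for line in f]
print(f'Loaded {len(annotations)} annotated summaries')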