Example #1
def mix_datasets(datasets, props=None, new_dataset=None):
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        props = [1 / len(datasets)] * len(datasets)
    assert len(props) == len(datasets)
    assert all([get_dataset_dir(dataset).exists() for dataset in datasets])
    # Sort in unison according to dataset names
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([
            f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)
        ])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [
                get_data_filepath(dataset, phase, language)
                for dataset in datasets
            ]
            # If one of the input files does not exist, we remove it and its prop and renormalize
            input_files, current_props = zip(
                *[(input_file, prop)
                  for input_file, prop in zip(input_files, props)
                  if input_file.exists()])
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, therefore the files should be mixed in the same order
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset
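A minimal usage sketch for mix_datasets; the dataset names and proportions are illustrative assumptions, and both datasets must already exist under get_dataset_dir():

# Hypothetical usage (assumes 'wikilarge' and 'asset' are already prepared locally).
mixed = mix_datasets(['wikilarge', 'asset'], props=[0.8, 0.2])
# Datasets are sorted by name before naming, e.g. 'mix-asset_0.20-wikilarge_0.80'.
print(get_dataset_dir(mixed))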
Example #2
def create_preprocessed_dataset(dataset, preprocessors, n_jobs=1):
    for preprocessor in preprocessors:
        # Fit preprocessor on input dataset
        preprocessor.fit(get_data_filepath(dataset, 'train', 'complex'),
                         get_data_filepath(dataset, 'train', 'simple'))
        dataset = create_preprocessed_dataset_one_preprocessor(
            dataset, preprocessor, n_jobs)
    return dataset
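The loop above only relies on each preprocessor exposing fit(complex_filepath, simple_filepath); everything else is delegated to create_preprocessed_dataset_one_preprocessor, which is not shown. A purely hypothetical preprocessor illustrating that interface:

class NoopPreprocessor:
    '''Hypothetical preprocessor: does nothing, just shows the fit() interface assumed above.'''

    def fit(self, complex_filepath, simple_filepath):
        # A real preprocessor would learn statistics or a vocabulary from these files.
        pass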
Example #3
def check_dataset(dataset):
    # Sanity check with evaluation dataset
    if has_lines_in_common(get_data_filepath(dataset, 'train', 'complex'),
                           get_data_filepath('asset', 'valid', 'complex')):
        warnings.warn(
            'WARNING: Dataset has validation samples in training set!')
    if has_lines_in_common(get_data_filepath(dataset, 'train', 'complex'),
                           get_data_filepath('asset', 'test', 'complex')):
        warnings.warn('WARNING: Dataset has test samples in training set!')
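has_lines_in_common is not shown in these examples; one plausible implementation, sketched here as an assumption, is a simple set intersection over the two files' lines:

def has_lines_in_common(filepath1, filepath2):
    # Hypothetical helper: True if the two files share at least one identical line.
    with open(filepath1) as f1, open(filepath2) as f2:
        return len(set(f1) & set(f2)) > 0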
Example #4
def apply_line_function_to_dataset(line_function,
                                   dataset,
                                   new_dataset,
                                   languages=LANGUAGES):
    '''Provided function signature: line_function(line) -> line'''
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        for phase, language in product(PHASES, languages):
            source_filepath = get_data_filepath(dataset, phase, language)
            target_filepath = get_data_filepath(new_dataset, phase, language)
            if not source_filepath.exists():
                continue
            apply_line_function_to_file(line_function, source_filepath,
                                        target_filepath)
    return new_dataset
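A hedged usage sketch; the transform and dataset names are illustrative only:

# Hypothetical usage: derive a lowercased copy of an existing dataset.
lowercased = apply_line_function_to_dataset(lambda line: line.lower(),
                                            'wikilarge', 'wikilarge_lowercased')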
Example #5
def mlm_fairseq_preprocess(dataset):
    '''Too specific for ts.fairseq.base.fairseq_preprocess'''
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            vocab_path = get_data_filepath(dataset, 'vocab', 'fr')
            assert vocab_path.exists()
            trainpref = get_data_filepath(dataset, 'train', 'fr')
            validpref = get_data_filepath(dataset, 'valid', 'fr')
            testpref = get_data_filepath(dataset, 'test', 'fr')
            command = f'fairseq-preprocess --only-source --trainpref {trainpref} --validpref {validpref} --testpref {testpref} --destdir {preprocessed_dir} --workers 64 --srcdict {vocab_path}'  # noqa
            print(command)
            run_command(command)
    return preprocessed_dir
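Judging from the glob pattern in Example #10, data files appear to live directly under the dataset directory as {phase}.{language}; the monolingual layout this function expects would then look roughly like this (an assumption, not taken from the original code):

# get_dataset_dir(dataset)/
#     train.fr  valid.fr  test.fr    # one sentence per line
#     vocab.fr                       # fairseq dictionary passed via --srcdict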
Example #6
def combine_simplifications_in_dataset(simplification_pairs, dataset):
    with create_directory_or_skip(get_dataset_dir(dataset)):
        assert len(simplification_pairs) > 30000, f'Not enough pairs: {len(simplification_pairs)}'
        indexes = np.random.permutation(len(simplification_pairs))
        for phase, start_index, end_index in [
            ('test', 10000, 20000),
            ('valid', 20000, 30000),
            ('train', 30000, len(indexes)),
        ]:
            with write_lines_in_parallel(
                [get_data_filepath(dataset, phase, 'complex'), get_data_filepath(dataset, phase, 'simple')]
            ) as files:
                for idx in tqdm(indexes[start_index:end_index]):
                    files.write(simplification_pairs[idx])
    return get_dataset_dir(dataset)
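An illustrative call with synthetic pairs; the exact pair format consumed by write_lines_in_parallel (here a (complex, simple) tuple) is an assumption:

# Hypothetical usage: needs more than 30,000 pairs to pass the assertion above.
pairs = [(f'complex sentence {i}', f'simple sentence {i}') for i in range(40000)]
dataset_dir = combine_simplifications_in_dataset(pairs, 'my_synthetic_dataset')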
Example #7
def finetune_and_predict_on_dataset(finetuning_dataset, exp_dir, **kwargs):
    kwargs['train_kwargs']['ngpus'] = 1
    prefix = 'finetune'
    if kwargs.get('fast_parametrization_search', False):
        prefix += '_fast'
    pred_filepaths = [
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_valid.pred',
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_test.pred',
    ]
    if all([path.exists() for path in pred_filepaths]):
        return
    for phase, pred_filepath in zip(['valid', 'test'], pred_filepaths):
        orig_sents_path = get_data_filepath(finetuning_dataset, phase,
                                            'complex')
        refs_sents_paths = list(
            get_dataset_dir(finetuning_dataset).glob(f'{phase}.simple*'))
        kwargs['evaluate_kwargs'] = {
            'test_set': 'custom',
            'orig_sents_path': orig_sents_path,
            'refs_sents_paths': refs_sents_paths,
        }
        if phase == 'valid':
            # Finetune preprocessors_kwargs only on valid
            kwargs['preprocessors_kwargs'] = find_best_parametrization(
                exp_dir, **kwargs)
        shutil.copyfile(
            fairseq_get_simplifier(exp_dir, **kwargs)(orig_sents_path),
            pred_filepath)
Example #8
def get_transformer_kwargs(dataset,
                           language,
                           use_access,
                           use_short_name=False):
    kwargs = {
        'dataset': dataset,
        'parametrization_budget': 128,
        'predict_files': get_predict_files(language),
        'train_kwargs': {
            'ngpus': 8,
            'arch': 'bart_large',
            'max_tokens': 4096,
            'truncate_source': True,
            'layernorm_embedding': True,
            'share_all_embeddings': True,
            'share_decoder_input_output_embed': True,
            'required_batch_size_multiple': 1,
            'criterion': 'label_smoothed_cross_entropy',
            'lr': 3e-04,
            'label_smoothing': 0.1,
            'dropout': 0.1,
            'attention_dropout': 0.1,
            'weight_decay': 0.01,
            'optimizer': 'adam',
            'adam_betas': '(0.9, 0.999)',
            'adam_eps': 1e-08,
            'clip_norm': 0.1,
        },
        'preprocessors_kwargs': {
            'SentencePiecePreprocessor': {
                'vocab_size': 32000,
                'input_filepaths': [
                    get_data_filepath(dataset, 'train', 'complex'),
                    get_data_filepath(dataset, 'train', 'simple'),
                ],
            }
            # 'SentencePiecePreprocessor': {'vocab_size': 32000, 'input_filepaths':  [get_dataset_dir('enwiki') / 'all_sentences']}
        },
        'evaluate_kwargs': get_evaluate_kwargs(language),
    }
    if use_access:
        kwargs['preprocessors_kwargs'] = add_dicts(
            get_access_preprocessors_kwargs(language,
                                            use_short_name=use_short_name),
            kwargs['preprocessors_kwargs'])
    return kwargs
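An illustrative call; the argument values are assumptions, and only keys defined above are touched:

kwargs = get_transformer_kwargs('wikilarge', language='en', use_access=False)
kwargs['train_kwargs']['ngpus'] = 1  # e.g. scale down for a single-GPU run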
Example #9
def fairseq_evaluate_and_save(exp_dir, **kwargs):
    scores = fairseq_evaluate(exp_dir, **kwargs)
    print(f'scores={scores}')
    report_path = exp_dir / 'easse_report.html'
    shutil.move(get_easse_report_from_exp_dir(exp_dir, **kwargs), report_path)
    print(f'report_path={report_path}')
    predict_files = kwargs.get('predict_files', [
        get_data_filepath('asset', 'valid', 'complex'),
        get_data_filepath('asset', 'test', 'complex')
    ])
    for source_path in predict_files:
        pred_path = get_predictions(source_path, exp_dir, **kwargs)
        shutil.copyfile(source_path, exp_dir / source_path.name)
        new_pred_path = exp_dir / source_path.with_suffix('.pred').name
        shutil.move(pred_path, new_pred_path)
        print(f'source_path={source_path}')
        print(f'pred_path={new_pred_path}')
    return scores
Example #10
def get_scores_on_dataset(pred_path, dataset, phase):
    orig_sents_path = get_data_filepath(dataset, phase, 'complex')
    refs_sents_paths = list(get_dataset_dir(dataset).glob(f'{phase}.simple*'))
    return evaluate_system_output(
        'custom',
        sys_sents_path=pred_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
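A hedged usage sketch; the prediction file path is hypothetical and must contain one system output sentence per line, aligned with the dataset's complex file:

scores = get_scores_on_dataset('asset_valid.pred', 'asset', 'valid')
print(scores)  # SARI, BLEU, FKGL and SARI-by-operation scores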
Example #11
def fairseq_preprocess(dataset,
                       dict_path=None,
                       source_lang='complex',
                       target_lang='simple'):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: symlink {phase}.{source_lang} and {phase}.{target_lang} to the existing
            # complex/simple files so fairseq-preprocess finds them under the requested names
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES,
                                                  [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase,
                                                     new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(
                            get_data_filepath(dataset, phase, language))
            trainpref = str(get_data_filepath(dataset, 'train',
                                              'dummy')).replace('.dummy', '')
            validpref = str(get_data_filepath(dataset, 'valid',
                                              'dummy')).replace('.dummy', '')
            testpref = str(get_data_filepath(dataset, 'test',
                                             'dummy')).replace('.dummy', '')
            args = f'''
                --source-lang {source_lang} --target-lang {target_lang} --trainpref {trainpref} --validpref {validpref} --testpref {testpref}
                --destdir {preprocessed_dir} --bpe sentencepiece
                --joined-dictionary --workers 32
            '''
            if dict_path is not None:
                args = f'{args} --srcdict {dict_path}'
            args = remove_multiple_whitespaces(args.replace('\n',
                                                            ' ')).strip(' ')
            print(f'fairseq-preprocess {args}')
            args = shlex.split(args)
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir
Example #12
def prepare_asset():
    print('ASSET')
    dataset = 'asset'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        for phase in ('valid', 'test'):
            for i in range(10):
                for (old_language_name,
                     new_language_name) in [('orig', 'complex'),
                                            (f'simp.{i}', f'simple.{i}')]:
                    url = f'https://raw.githubusercontent.com/facebookresearch/asset/master/dataset/asset.{phase}.{old_language_name}'
                    old_path = download(url)
                    new_path = get_data_filepath(dataset, phase,
                                                 new_language_name)
                    shutil.copyfile(old_path, new_path)
                    add_newline_at_end_of_file(new_path)
    print('Done.')
Example #13
def get_predict_files(language):
    return {
        'en': [
            get_data_filepath('asset', 'valid', 'complex'),
            get_data_filepath('asset', 'test', 'complex')
        ],
        'fr': [
            get_data_filepath('alector', 'valid', 'complex'),
            get_data_filepath('alector', 'test', 'complex')
        ],
        'es': [
            get_data_filepath('simplext_corpus', 'valid', 'complex'),
            get_data_filepath('simplext_corpus', 'test', 'complex'),
        ],
    }[language]
Example #14
def get_evaluate_kwargs(language, phase='valid'):
    return {
        ('en', 'valid'): {
            'test_set': 'asset_valid'
        },
        ('en', 'test'): {
            'test_set': 'asset_test'
        },
        ('fr', 'valid'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('alector', 'valid', 'complex'),
            'refs_sents_paths': [get_data_filepath('alector', 'valid', 'simple')],
        },
        ('fr', 'test'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('alector', 'test', 'complex'),
            'refs_sents_paths': [get_data_filepath('alector', 'test', 'simple')],
        },
        ('es', 'valid'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('simplext_corpus', 'valid', 'complex'),
            'refs_sents_paths': [get_data_filepath('simplext_corpus', 'valid', 'simple')],
        },
        ('es', 'test'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('simplext_corpus', 'test', 'complex'),
            'refs_sents_paths': [get_data_filepath('simplext_corpus', 'test', 'simple')],
        },
    }[(language, phase)]
Example #15
def prepare_wikilarge():
    print('WikiLarge')
    dataset = 'wikilarge'  # dataset = wikismall works as well
    with create_directory_or_skip(get_dataset_dir(dataset)):
        url = 'https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2'
        extracted_path = download_and_extract(url)[0]
        # Process
        print('Processing...')
        # Only rename files and put them in local directory architecture
        # FIXME: Wikilarge validations set only has 992 sentences
        for phase in PHASES:
            for (old_language_name, new_language_name) in [('src', 'complex'),
                                                           ('dst', 'simple')]:
                old_path_glob = os.path.join(
                    extracted_path, dataset,
                    f'*.ori.{phase}.{old_language_name}')
                globs = glob(old_path_glob)
                assert len(globs) == 1
                old_path = globs[0]
                new_path = get_data_filepath(dataset, phase, new_language_name)
                shutil.copyfile(old_path, new_path)
                shutil.move(replace_lrb_rrb_file(new_path), new_path)
                add_newline_at_end_of_file(new_path)
    print('Done.')
Example #16
def get_all_baseline_rows():
    paths = {
        ('asset', 'test'): ('en', TEST_SETS_PATHS[('asset_test', 'orig')],
                            TEST_SETS_PATHS[('asset_test', 'refs')]),
        ('asset', 'valid'): ('en', TEST_SETS_PATHS[('asset_valid', 'orig')],
                             TEST_SETS_PATHS[('asset_valid', 'refs')]),
        ('turkcorpus_detokenized', 'test'): (
            'en',
            TEST_SETS_PATHS[('turkcorpus_test', 'orig')],
            TEST_SETS_PATHS[('turkcorpus_test', 'refs')],
        ),
        ('turkcorpus_detokenized', 'valid'): (
            'en',
            TEST_SETS_PATHS[('turkcorpus_valid', 'orig')],
            TEST_SETS_PATHS[('turkcorpus_valid', 'refs')],
        ),
        ('alector', 'test'): (
            'fr',
            get_data_filepath('alector', 'test', 'complex'),
            [get_data_filepath('alector', 'test', 'simple')],
        ),
        ('alector', 'valid'): (
            'fr',
            get_data_filepath('alector', 'valid', 'complex'),
            [get_data_filepath('alector', 'valid', 'simple')],
        ),
        # Old dataset with problems
        ('simplext_corpus_all', 'test'): (
            'es',
            get_data_filepath('simplext_corpus_all', 'test', 'complex'),
            [get_data_filepath('simplext_corpus_all', 'test', 'simple')],
        ),
        ('simplext_corpus_all', 'valid'): (
            'es',
            get_data_filepath('simplext_corpus_all', 'valid', 'complex'),
            [get_data_filepath('simplext_corpus_all', 'valid', 'simple')],
        ),
        ('simplext_corpus_all_fixed', 'test'): (
            'es',
            get_data_filepath('simplext_corpus_all_fixed', 'test', 'complex'),
            [get_data_filepath('simplext_corpus_all_fixed', 'test', 'simple')],
        ),
        ('simplext_corpus_all_fixed', 'valid'): (
            'es',
            get_data_filepath('simplext_corpus_all_fixed', 'valid', 'complex'),
            [
                get_data_filepath('simplext_corpus_all_fixed', 'valid',
                                  'simple')
            ],
        ),
        ('simpitiki', 'test'): (
            'it',
            get_data_filepath('simpitiki', 'test', 'complex'),
            [get_data_filepath('simpitiki', 'test', 'simple')],
        ),
        ('simpitiki', 'valid'): (
            'it',
            get_data_filepath('simpitiki', 'valid', 'complex'),
            [get_data_filepath('simpitiki', 'valid', 'simple')],
        ),
    }
    rows = []
    for (dataset, phase), (language, orig_sents_path,
                           refs_sents_paths) in tqdm(paths.items()):
        dataset_rows = get_baseline_rows(orig_sents_path,
                                         tuple(refs_sents_paths), language)
        for row in dataset_rows:
            row['dataset'] = dataset
            row['phase'] = phase
        rows.extend(dataset_rows)
    return rows
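The returned rows are plain dicts carrying at least 'dataset' and 'phase' keys, so they can be aggregated directly; the pandas usage below is an assumption, not part of the original code:

import pandas as pd

rows = get_all_baseline_rows()
df = pd.DataFrame(rows)
print(df.groupby(['dataset', 'phase']).size())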