Example #1
def mix_datasets(datasets, props=None, new_dataset=None):
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        props = [1 / len(datasets)] * len(datasets)
    assert len(props) == len(datasets)
    assert all([get_dataset_dir(dataset).exists() for dataset in datasets])
    # Sort in unison according to dataset names
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([
            f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)
        ])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [
                get_data_filepath(dataset, phase, language)
                for dataset in datasets
            ]
            # If one of the input files does not exist, we remove it and its prop and renormalize
            input_files, current_props = zip(
                *[(input_file, prop)
                  for input_file, prop in zip(input_files, props)
                  if input_file.exists()])
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, so the files are mixed in the same order
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset
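A minimal usage sketch for mix_datasets; the dataset names and proportions below are hypothetical, and both dataset directories are assumed to already exist (the asserts above enforce this).

# Hypothetical: blend two prepared datasets, 80% wikilarge / 20% asset.
mixed = mix_datasets(['wikilarge', 'asset'], props=[0.8, 0.2])
print(mixed)  # 'mix-asset_0.20-wikilarge_0.80' (names are sorted before the new name is built)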
Example #2
def create_preprocessed_dataset_one_preprocessor(dataset, preprocessor,
                                                 n_jobs):
    new_dataset = get_preprocessed_dataset_name(dataset, preprocessor)
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print(
            f'Creating preprocessed dataset with {preprocessor}: {dataset} -> {new_dataset}'
        )
        new_dataset_dir = get_dataset_dir(new_dataset)
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase in PHASES:
            if not filepaths_dict[phase, 'complex'].exists() or not filepaths_dict[phase, 'simple'].exists():
                continue
            parallel_file_pair_preprocessor = get_parallel_file_pair_preprocessor(
                preprocessor.encode_file_pair,
                n_jobs=n_jobs,
            )
            parallel_file_pair_preprocessor(
                filepaths_dict[phase, 'complex'],
                filepaths_dict[phase, 'simple'],
                new_filepaths_dict[phase, 'complex'],
                new_filepaths_dict[phase, 'simple'],
            )
        previous_preprocessors = load_preprocessors(get_dataset_dir(dataset))
        if previous_preprocessors is not None:
            preprocessors = previous_preprocessors + [preprocessor]
        else:
            preprocessors = [preprocessor]
        dump_preprocessors(preprocessors, new_dataset_dir)
        with open(new_dataset_dir / 'original_dataset', 'w') as f:
            f.write(dataset + '\n')
        if hasattr(preprocessor, 'copy_sentencepiece_files_to_dir'):
            preprocessor.copy_sentencepiece_files_to_dir(new_dataset_dir)
    return new_dataset
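A hedged call sketch for the function above; my_preprocessor is a hypothetical object that only needs to expose the encode_file_pair method consumed by get_parallel_file_pair_preprocessor.

# my_preprocessor is hypothetical; any preprocessor exposing encode_file_pair works here.
preprocessed = create_preprocessed_dataset_one_preprocessor('wikilarge', my_preprocessor, n_jobs=4)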
Example #3
def combine_simplifications_in_dataset(simplification_pairs, dataset):
    with create_directory_or_skip(get_dataset_dir(dataset)):
        assert len(simplification_pairs) > 30000, f'Not enough pairs: {len(simplification_pairs)}'
        indexes = np.random.permutation(len(simplification_pairs))
        for phase, start_index, end_index in [
            ('test', 10000, 20000),
            ('valid', 20000, 30000),
            ('train', 30000, len(indexes)),
        ]:
            with write_lines_in_parallel(
                [get_data_filepath(dataset, phase, 'complex'), get_data_filepath(dataset, phase, 'simple')]
            ) as files:
                for idx in tqdm(indexes[start_index:end_index]):
                    files.write(simplification_pairs[idx])
    return get_dataset_dir(dataset)
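Note that the hard-coded offsets above leave the first 10,000 shuffled pairs unwritten, assign the next two blocks of 10,000 to test and valid, and send the remainder to train. A minimal call sketch; mined_pairs is a hypothetical list of more than 30,000 (complex_line, simple_line) tuples.

dataset_dir = combine_simplifications_in_dataset(mined_pairs, 'uts_mined_en')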
Example #4
def finetune_and_predict_on_dataset(finetuning_dataset, exp_dir, **kwargs):
    kwargs['train_kwargs']['ngpus'] = 1
    prefix = 'finetune'
    if kwargs.get('fast_parametrization_search', False):
        prefix += '_fast'
    pred_filepaths = [
        exp_dir /
        f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_valid.pred',
        exp_dir /
        f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_test.pred',
    ]
    if all([path.exists() for path in pred_filepaths]):
        return
    for phase, pred_filepath in zip(['valid', 'test'], pred_filepaths):
        orig_sents_path = get_data_filepath(finetuning_dataset, phase,
                                            'complex')
        refs_sents_paths = list(
            get_dataset_dir(finetuning_dataset).glob(f'{phase}.simple*'))
        kwargs['evaluate_kwargs'] = {
            'test_set': 'custom',
            'orig_sents_path': orig_sents_path,
            'refs_sents_paths': refs_sents_paths,
        }
        if phase == 'valid':
            # Finetune preprocessors_kwargs only on valid
            kwargs['preprocessors_kwargs'] = find_best_parametrization(
                exp_dir, **kwargs)
        shutil.copyfile(
            fairseq_get_simplifier(exp_dir, **kwargs)(orig_sents_path),
            pred_filepath)
Example #5
def get_scores_on_dataset(pred_path, dataset, phase):
    orig_sents_path = get_data_filepath(dataset, phase, 'complex')
    refs_sents_paths = list(get_dataset_dir(dataset).glob(f'{phase}.simple*'))
    return evaluate_system_output(
        'custom',
        sys_sents_path=pred_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
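A usage sketch; the prediction file path is hypothetical and the dataset is assumed to have been prepared with the {phase}.simple* reference files that the glob above expects.

scores = get_scores_on_dataset('system_output.valid.pred', 'asset', 'valid')
print(scores)  # metrics computed by evaluate_system_output: SARI, BLEU, FKGL, SARI by operation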
Example #6
def apply_line_function_to_dataset(line_function,
                                   dataset,
                                   new_dataset,
                                   languages=LANGUAGES):
    '''Provided function signature: line_function(line) -> line'''
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        for phase, language in product(PHASES, languages):
            source_filepath = get_data_filepath(dataset, phase, language)
            target_filepath = get_data_filepath(new_dataset, phase, language)
            if not source_filepath.exists():
                continue
            apply_line_function_to_file(line_function, source_filepath,
                                        target_filepath)
    return new_dataset
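A minimal sketch showing the expected line_function signature (str -> str); the target dataset name is hypothetical.

# Create a lowercased copy of every phase/language file that exists in the source dataset.
lowercased = apply_line_function_to_dataset(lambda line: line.lower(), 'wikilarge', 'wikilarge-lowercase')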
Example #7
def create_smaller_dataset(dataset, n_lines):
    new_dataset = f'{dataset}-lines{n_lines}'
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase, language in product(['train'], LANGUAGES):
            with open(new_filepaths_dict[(phase, language)],
                      'w') as output_file:
                for line in yield_lines(filepaths_dict[(phase, language)],
                                        n_lines=n_lines):
                    output_file.write(line + '\n')
        for phase, language in product(['valid', 'test'], LANGUAGES):
            shutil.copy(filepaths_dict[(phase, language)],
                        new_filepaths_dict[(phase, language)])
    return new_dataset
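A usage sketch; only the train files are truncated, while valid and test are copied verbatim.

small = create_smaller_dataset('wikilarge', 10000)
print(small)  # 'wikilarge-lines10000'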
Example #8
def mlm_fairseq_preprocess(dataset):
    '''Too specific for ts.fairseq.base.fairseq_preprocess'''
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            vocab_path = get_data_filepath(dataset, 'vocab', 'fr')
            assert vocab_path.exists()
            trainpref = get_data_filepath(dataset, 'train', 'fr')
            validpref = get_data_filepath(dataset, 'valid', 'fr')
            testpref = get_data_filepath(dataset, 'test', 'fr')
            command = f'fairseq-preprocess --only-source --trainpref {trainpref} --validpref {validpref} --testpref {testpref} --destdir {preprocessed_dir} --workers 64 --srcdict {vocab_path}'  # noqa
            print(command)
            run_command(command)
    return preprocessed_dir
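A hedged call sketch; the dataset name is hypothetical and must follow the layout the asserts above expect (monolingual train/valid/test '.fr' files plus a vocabulary file at get_data_filepath(dataset, 'vocab', 'fr')).

# Hypothetical monolingual French dataset prepared for masked-LM pretraining.
preprocessed_dir = mlm_fairseq_preprocess('ccnet_fr')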
Example #9
def prepare_asset():
    print('ASSET')
    dataset = 'asset'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        for phase in ('valid', 'test'):
            for i in range(10):
                for (old_language_name,
                     new_language_name) in [('orig', 'complex'),
                                            (f'simp.{i}', f'simple.{i}')]:
                    url = f'https://raw.githubusercontent.com/facebookresearch/asset/master/dataset/asset.{phase}.{old_language_name}'
                    old_path = download(url)
                    new_path = get_data_filepath(dataset, phase,
                                                 new_language_name)
                    shutil.copyfile(old_path, new_path)
                    add_newline_at_end_of_file(new_path)
    print('Done.')
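After preparation, the valid and test splits each hold one complex file and ten numbered simple references (ASSET ships 10 simplifications per sentence). A quick sanity-check sketch, reusing the glob convention from the other examples:

prepare_asset()
for path in sorted(get_dataset_dir('asset').glob('valid.simple*')):
    print(path.name)  # one file per ASSET reference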
Example #10
def fairseq_prepare_and_train(dataset, **kwargs):
    check_dataset(dataset)
    kwargs = check_and_resolve_args(kwargs)
    exp_dir = prepare_exp_dir()
    preprocessors_kwargs = kwargs.get('preprocessors_kwargs', {})
    preprocessors = get_preprocessors(preprocessors_kwargs)
    if len(preprocessors) > 0:
        dataset = create_preprocessed_dataset(dataset, preprocessors, n_jobs=8)
        dataset_dir = get_dataset_dir(dataset)
        shutil.copy(dataset_dir / 'preprocessors.pickle', exp_dir)
        if hasattr(preprocessors[-1], 'copy_sentencepiece_files_to_dir'):
            preprocessors[-1].copy_sentencepiece_files_to_dir(dataset_dir)
    model_symlink_path = exp_dir / 'model.pt'
    if not model_symlink_path.exists():
        model_symlink_path.symlink_to('checkpoints/checkpoint_best.pt')
    preprocessed_dir = fairseq_preprocess(
        dataset, **kwargs.get('preprocess_kwargs', {}))
    train_kwargs = kwargs.get('train_kwargs', {})
    fairseq_train(preprocessed_dir, exp_dir=exp_dir, **train_kwargs)
    return exp_dir
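A hedged end-to-end sketch; the dataset name is hypothetical, no preprocessors are configured, and train_kwargs only shows the 'ngpus' key that example #4 also sets.

exp_dir = fairseq_prepare_and_train('wikilarge', train_kwargs={'ngpus': 1})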
Example #11
def fairseq_preprocess(dataset,
                       dict_path=None,
                       source_lang='complex',
                       target_lang='simple'):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES,
                                                  [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase,
                                                     new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(
                            get_data_filepath(dataset, phase, language))
            trainpref = str(get_data_filepath(dataset, 'train',
                                              'dummy')).replace('.dummy', '')
            validpref = str(get_data_filepath(dataset, 'valid',
                                              'dummy')).replace('.dummy', '')
            testpref = str(get_data_filepath(dataset, 'test',
                                             'dummy')).replace('.dummy', '')
            args = f'''
                --source-lang {source_lang} --target-lang {target_lang} --trainpref {trainpref} --validpref {validpref} --testpref {testpref}
                --destdir {preprocessed_dir} --bpe sentencepiece
                --joined-dictionary --workers 32
            '''
            if dict_path is not None:
                args = f'{args} --srcdict {dict_path}'
            args = remove_multiple_whitespaces(args.replace('\n',
                                                            ' ')).strip(' ')
            print(f'fairseq-preprocess {args}')
            args = shlex.split(args)
            with mock_cli_args(args):
                preprocess.cli_main()
        return preprocessed_dir
Example #12
def prepare_wikilarge():
    print('WikiLarge')
    dataset = 'wikilarge'  # dataset = wikismall works as well
    with create_directory_or_skip(get_dataset_dir(dataset)):
        url = 'https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2'
        extracted_path = download_and_extract(url)[0]
        # Process
        print('Processing...')
        # Only rename files and put them in local directory architecture
        # FIXME: the WikiLarge validation set only has 992 sentences
        for phase in PHASES:
            for (old_language_name, new_language_name) in [('src', 'complex'),
                                                           ('dst', 'simple')]:
                old_path_glob = os.path.join(
                    extracted_path, dataset,
                    f'*.ori.{phase}.{old_language_name}')
                globs = glob(old_path_glob)
                assert len(globs) == 1
                old_path = globs[0]
                new_path = get_data_filepath(dataset, phase, new_language_name)
                shutil.copyfile(old_path, new_path)
                shutil.move(replace_lrb_rrb_file(new_path), new_path)
                add_newline_at_end_of_file(new_path)
    print('Done.')
Example #13
def get_original_dataset(dataset):
    filepath = get_dataset_dir(dataset) / 'original_dataset'
    if not filepath.exists():
        return None
    [original_dataset] = read_lines(filepath)
    return original_dataset
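Because example #2 writes an 'original_dataset' file into every preprocessed dataset, the raw dataset can be recovered by walking the chain; a small sketch with a hypothetical starting name.

dataset = 'wikilarge_preprocessed'  # hypothetical preprocessed dataset name
while (parent := get_original_dataset(dataset)) is not None:
    dataset = parent
print(dataset)  # the underlying raw dataset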
Example #14
def train_roberta(
    dataset,
    sample_break_mode='complete',
    batch_size=8192,
    max_sentences=16,
    max_tokens=12000,
    tokens_per_sample=512,
    checkpoints_dir=None,
    distributed_world_size=None,
    sentencepiece_model_path=None,
    arch='roberta_base',
    dropout=0.1,
    total_updates=500000,
    log_interval=100,
    peak_lr=0.0007,
    clip_norm=None,
    no_epoch_checkpoint=False,
    validate_interval=1,
    save_interval=1,
    save_interval_updates=5000,
    warmup_updates=10000,
):
    preprocessed_dir = mlm_fairseq_preprocess(dataset)
    if checkpoints_dir is None:
        checkpoints_dir = get_fairseq_exp_dir() / 'checkpoints'
    checkpoints_dir = Path(checkpoints_dir)
    checkpoints_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(
        get_dataset_dir(dataset) / 'sentencepiece.bpe.model', checkpoints_dir)
    shutil.copy(
        get_dataset_dir(dataset) / 'fairseq_preprocessed/dict.txt',
        checkpoints_dir)
    effective_batch_size = max_sentences * (distributed_world_size or 1)  # default to a single process when unset
    # assert batch_size % effective_batch_size == 0
    update_freq = int(round(batch_size / effective_batch_size, 0))
    print(f'batch_size={effective_batch_size * update_freq}')
    command = f'''
    eval "$(conda shell.bash hook)"  # Needed to use conda activate in subshells
    conda activate bert_fr

    fairseq-train {preprocessed_dir} \
        --save-dir {checkpoints_dir} \
        --task masked_lm --criterion masked_lm \
        --arch {arch} --sample-break-mode {sample_break_mode} --tokens-per-sample {tokens_per_sample} \
        --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
        --lr-scheduler polynomial_decay --lr {peak_lr} --warmup-updates {warmup_updates} --total-num-update {total_updates} \
        --dropout {dropout} --attention-dropout {dropout} --weight-decay 0.01 \
        --max-sentences {max_sentences} --update-freq {update_freq} \
        --max-update {total_updates} --log-format simple --log-interval {log_interval} \
        --skip-invalid-size-inputs-valid-test \
        --save-interval-updates {save_interval_updates} \
        --validate-interval {validate_interval} --save-interval {save_interval} \
        --tensorboard-logdir {TENSORBOARD_LOGS_DIR} --fast-stat-sync \
        --fp16 --seed 1
    '''  # noqa
    command = command.strip(' ').strip('\n')
    if distributed_world_size is not None:
        command += f' --distributed-world-size {distributed_world_size} --distributed-port 53005'  # noqa
    if sentencepiece_model_path is not None:
        command += f' --bpe sentencepiece --sentencepiece-vocab {sentencepiece_model_path} --mask-whole-words'
    if max_tokens is not None:
        command += f' --max-tokens {max_tokens}'
    if clip_norm is not None:
        command += f' --clip-norm {clip_norm}'
    if no_epoch_checkpoint:
        command += ' --no-epoch-checkpoints'
    command = re.sub(' +', ' ', command)  # Remove multiple whitespaces
    print(command)
    run_command(command)
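A hedged call sketch; the dataset name is hypothetical, the command assumes a conda environment named bert_fr, and the dataset directory must contain the sentencepiece.bpe.model file that the function copies into checkpoints_dir.

# Hypothetical 8-GPU pretraining run; other hyperparameters keep their defaults above.
train_roberta('ccnet_fr', distributed_world_size=8)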
Example #15
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import faiss

from muss.mining.preprocessing import create_base_index, get_index_name, get_sentences_paths
from muss.utils.helpers import yield_lines
from muss.laser import get_laser_embeddings
from muss.resources.paths import get_dataset_dir

# Create index
language = 'en'
n_train_sentences = 1000000
train_sentences = []
for sentences_path in get_sentences_paths(language='en'):
    for sentence in yield_lines(sentences_path):
        train_sentences.append(sentence)
        if len(train_sentences) == n_train_sentences:
            break
    if len(train_sentences) == n_train_sentences:
        break

get_embeddings = lambda sentences: get_laser_embeddings(
    sentences, max_tokens=3000, language=language)  # noqa: E731
output_dir = get_dataset_dir('uts') / f'base_indexes/laser_{language}'
output_dir.mkdir(parents=True, exist_ok=True)  # create the intermediate base_indexes directory as well
create_base_index(train_sentences, get_index_name(), get_embeddings,
                  faiss.METRIC_INNER_PRODUCT, output_dir)
Example #16
    compute_and_save_simplification_pairs,
    get_index_path,
    compute_and_save_embeddings,
    get_filter_string_representation,
    combine_simplifications_in_dataset,
    get_simplification_pairs_paths,
)
from muss.mining.filtering import SimplicityScorer

ccnet_dir = Path(
    input(
        'Please download the CCNet corpus from https://github.com/facebookresearch/cc_net and enter the path to the downloaded data: '
    ))
language = input('What language do you want to process? (en/fr/es): ')
cluster = 'local'
dataset_dir = get_dataset_dir('uts') / language
# For large jobs only
slurm_partition = 'dev,scavenge'
slurm_array_parallelism = 1024

# Split CCNet shards into subshards
with log_action('Splitting CCNet shards into smaller subshards'):
    # We need to split each shard even more for the LASER embeddings to fit in memory
    n_shards = {  # Number of shards to take for each language for ~1B sentences
        'en': 15,
        'fr': 25,
        'es': 13,  # We would need about 20 shards for 1B sentences, but there are only 13
    }[language]
    ccnet_filepaths = [
        ccnet_dir / f'{language}_head_{i:04d}.json.gz' for i in range(n_shards)