def mix_datasets(datasets, props=None, new_dataset=None):
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        props = [1 / len(datasets)] * len(datasets)
    assert len(props) == len(datasets)
    assert all([get_dataset_dir(dataset).exists() for dataset in datasets])
    # Sort in unison according to dataset names
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [get_data_filepath(dataset, phase, language) for dataset in datasets]
            # If one of the input files does not exist, remove it and its prop, then renormalize
            input_files, current_props = zip(
                *[(input_file, prop) for input_file, prop in zip(input_files, props) if input_file.exists()]
            )
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, therefore they should be mixed in the same order
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset
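# Usage sketch (hypothetical dataset names; both datasets must already exist on disk):
#   mixed = mix_datasets(['wikilarge', 'asset'], props=[0.7, 0.3])
#   # Datasets are sorted by name before naming, so this creates and returns
#   # 'mix-asset_0.30-wikilarge_0.70'; phases missing from a dataset are dropped and the
#   # remaining proportions are renormalized.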
def create_preprocessed_dataset_one_preprocessor(dataset, preprocessor, n_jobs):
    new_dataset = get_preprocessed_dataset_name(dataset, preprocessor)
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print(f'Creating preprocessed dataset with {preprocessor}: {dataset} -> {new_dataset}')
        new_dataset_dir = get_dataset_dir(new_dataset)
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase in PHASES:
            # Skip phases where either side of the parallel data is missing
            if not filepaths_dict[phase, 'complex'].exists() or not filepaths_dict[phase, 'simple'].exists():
                continue
            parallel_file_pair_preprocessor = get_parallel_file_pair_preprocessor(
                preprocessor.encode_file_pair,
                n_jobs=n_jobs,
            )
            parallel_file_pair_preprocessor(
                filepaths_dict[phase, 'complex'],
                filepaths_dict[phase, 'simple'],
                new_filepaths_dict[phase, 'complex'],
                new_filepaths_dict[phase, 'simple'],
            )
        previous_preprocessors = load_preprocessors(get_dataset_dir(dataset))
        if previous_preprocessors is not None:
            preprocessors = previous_preprocessors + [preprocessor]
        else:
            preprocessors = [preprocessor]
        dump_preprocessors(preprocessors, new_dataset_dir)
        with open(new_dataset_dir / 'original_dataset', 'w') as f:
            f.write(dataset + '\n')
        if hasattr(preprocessor, 'copy_sentencepiece_files_to_dir'):
            preprocessor.copy_sentencepiece_files_to_dir(new_dataset_dir)
    return new_dataset
def combine_simplifications_in_dataset(simplification_pairs, dataset):
    with create_directory_or_skip(get_dataset_dir(dataset)):
        assert len(simplification_pairs) > 30000, f'Not enough pairs: {len(simplification_pairs)}'
        indexes = np.random.permutation(len(simplification_pairs))
        for phase, start_index, end_index in [
            ('test', 10000, 20000),
            ('valid', 20000, 30000),
            ('train', 30000, len(indexes)),
        ]:
            with write_lines_in_parallel(
                [get_data_filepath(dataset, phase, 'complex'), get_data_filepath(dataset, phase, 'simple')]
            ) as files:
                for idx in tqdm(indexes[start_index:end_index]):
                    files.write(simplification_pairs[idx])
    return get_dataset_dir(dataset)
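# Usage sketch (hypothetical pairs; each pair is assumed to be a (complex_line, simple_line)
# tuple that write_lines_in_parallel can write):
#   pairs = [(f'complex sentence {i}', f'simple sentence {i}') for i in range(50000)]
#   dataset_dir = combine_simplifications_in_dataset(pairs, 'uts_mined_example')
# The pairs are split by position in the random permutation: indexes 10000-20000 go to test,
# 20000-30000 to valid, and everything from 30000 onward to train.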
def finetune_and_predict_on_dataset(finetuning_dataset, exp_dir, **kwargs):
    kwargs['train_kwargs']['ngpus'] = 1
    prefix = 'finetune'
    if kwargs.get('fast_parametrization_search', False):
        prefix += '_fast'
    pred_filepaths = [
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_valid.pred',
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_test.pred',
    ]
    if all([path.exists() for path in pred_filepaths]):
        return
    for phase, pred_filepath in zip(['valid', 'test'], pred_filepaths):
        orig_sents_path = get_data_filepath(finetuning_dataset, phase, 'complex')
        refs_sents_paths = list(get_dataset_dir(finetuning_dataset).glob(f'{phase}.simple*'))
        kwargs['evaluate_kwargs'] = {
            'test_set': 'custom',
            'orig_sents_path': orig_sents_path,
            'refs_sents_paths': refs_sents_paths,
        }
        if phase == 'valid':
            # Finetune preprocessors_kwargs only on valid
            kwargs['preprocessors_kwargs'] = find_best_parametrization(exp_dir, **kwargs)
        shutil.copyfile(fairseq_get_simplifier(exp_dir, **kwargs)(orig_sents_path), pred_filepath)
def get_scores_on_dataset(pred_path, dataset, phase):
    orig_sents_path = get_data_filepath(dataset, phase, 'complex')
    refs_sents_paths = list(get_dataset_dir(dataset).glob(f'{phase}.simple*'))
    return evaluate_system_output(
        'custom',
        sys_sents_path=pred_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
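# Usage sketch (hypothetical prediction path; evaluate_system_output is expected to return a
# dict mapping metric names to scores):
#   scores = get_scores_on_dataset('experiments/exp_1/asset_valid.pred', 'asset', 'valid')
#   print(scores['sari'], scores['bleu'], scores['fkgl'])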
def apply_line_function_to_dataset(line_function, dataset, new_dataset, languages=LANGUAGES):
    '''Provided function signature: line_function(line) -> line'''
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        for phase, language in product(PHASES, languages):
            source_filepath = get_data_filepath(dataset, phase, language)
            target_filepath = get_data_filepath(new_dataset, phase, language)
            if not source_filepath.exists():
                continue
            apply_line_function_to_file(line_function, source_filepath, target_filepath)
    return new_dataset
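# Usage sketch (hypothetical dataset names): lowercase every line of an existing dataset,
# following the line_function(line) -> line signature documented above.
#   lowercased_dataset = apply_line_function_to_dataset(
#       lambda line: line.lower(), 'wikilarge', 'wikilarge-lowercased'
#   )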
def create_smaller_dataset(dataset, n_lines):
    new_dataset = f'{dataset}-lines{n_lines}'
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        filepaths_dict = get_filepaths_dict(dataset)
        new_filepaths_dict = get_filepaths_dict(new_dataset)
        for phase, language in product(['train'], LANGUAGES):
            with open(new_filepaths_dict[(phase, language)], 'w') as output_file:
                for line in yield_lines(filepaths_dict[(phase, language)], n_lines=n_lines):
                    output_file.write(line + '\n')
        for phase, language in product(['valid', 'test'], LANGUAGES):
            shutil.copy(filepaths_dict[(phase, language)], new_filepaths_dict[(phase, language)])
    return new_dataset
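# Usage sketch (assuming 'wikilarge' exists locally): keep only the first 10000 training lines
# per language while copying the valid and test sets unchanged.
#   small_dataset = create_smaller_dataset('wikilarge', 10000)  # -> 'wikilarge-lines10000'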
def mlm_fairseq_preprocess(dataset):
    '''Too specific for ts.fairseq.base.fairseq_preprocess'''
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            vocab_path = get_data_filepath(dataset, 'vocab', 'fr')
            assert vocab_path.exists()
            trainpref = get_data_filepath(dataset, 'train', 'fr')
            validpref = get_data_filepath(dataset, 'valid', 'fr')
            testpref = get_data_filepath(dataset, 'test', 'fr')
            command = f'fairseq-preprocess --only-source --trainpref {trainpref} --validpref {validpref} --testpref {testpref} --destdir {preprocessed_dir} --workers 64 --srcdict {vocab_path}'  # noqa
            print(command)
            run_command(command)
    return preprocessed_dir
def prepare_asset():
    print('ASSET')
    dataset = 'asset'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        for phase in ('valid', 'test'):
            for i in range(10):
                for (old_language_name, new_language_name) in [('orig', 'complex'), (f'simp.{i}', f'simple.{i}')]:
                    url = f'https://raw.githubusercontent.com/facebookresearch/asset/master/dataset/asset.{phase}.{old_language_name}'
                    old_path = download(url)
                    new_path = get_data_filepath(dataset, phase, new_language_name)
                    shutil.copyfile(old_path, new_path)
                    add_newline_at_end_of_file(new_path)
    print('Done.')
def fairseq_prepare_and_train(dataset, **kwargs):
    check_dataset(dataset)
    kwargs = check_and_resolve_args(kwargs)
    exp_dir = prepare_exp_dir()
    preprocessors_kwargs = kwargs.get('preprocessors_kwargs', {})
    preprocessors = get_preprocessors(preprocessors_kwargs)
    if len(preprocessors) > 0:
        dataset = create_preprocessed_dataset(dataset, preprocessors, n_jobs=8)
        dataset_dir = get_dataset_dir(dataset)
        shutil.copy(dataset_dir / 'preprocessors.pickle', exp_dir)
        if hasattr(preprocessors[-1], 'copy_sentencepiece_files_to_dir'):
            preprocessors[-1].copy_sentencepiece_files_to_dir(dataset_dir)
    model_symlink_path = exp_dir / 'model.pt'
    if not model_symlink_path.exists():
        model_symlink_path.symlink_to('checkpoints/checkpoint_best.pt')
    preprocessed_dir = fairseq_preprocess(dataset, **kwargs.get('preprocess_kwargs', {}))
    train_kwargs = kwargs.get('train_kwargs', {})
    fairseq_train(preprocessed_dir, exp_dir=exp_dir, **train_kwargs)
    return exp_dir
def fairseq_preprocess(dataset, dict_path=None, source_lang='complex', target_lang='simple'):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: create symlinks so that the source and target language names map onto the
            # existing 'complex' and 'simple' files
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES, [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase, new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(get_data_filepath(dataset, phase, language))
            trainpref = str(get_data_filepath(dataset, 'train', 'dummy')).replace('.dummy', '')
            validpref = str(get_data_filepath(dataset, 'valid', 'dummy')).replace('.dummy', '')
            testpref = str(get_data_filepath(dataset, 'test', 'dummy')).replace('.dummy', '')
            args = f'''
            --source-lang {source_lang} --target-lang {target_lang}
            --trainpref {trainpref} --validpref {validpref} --testpref {testpref}
            --destdir {preprocessed_dir} --bpe sentencepiece
            --joined-dictionary --workers 32
            '''
            if dict_path is not None:
                args = f'{args} --srcdict {dict_path}'
            args = remove_multiple_whitespaces(args.replace('\n', ' ')).strip(' ')
            print(f'fairseq-preprocess {args}')
            args = shlex.split(args)
            with mock_cli_args(args):
                preprocess.cli_main()
    return preprocessed_dir
def prepare_wikilarge():
    print('WikiLarge')
    dataset = 'wikilarge'  # dataset = 'wikismall' works as well
    with create_directory_or_skip(get_dataset_dir(dataset)):
        url = 'https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2'
        extracted_path = download_and_extract(url)[0]
        # Process
        print('Processing...')
        # Only rename files and put them in the local directory architecture
        # FIXME: The WikiLarge validation set only has 992 sentences
        for phase in PHASES:
            for (old_language_name, new_language_name) in [('src', 'complex'), ('dst', 'simple')]:
                old_path_glob = os.path.join(extracted_path, dataset, f'*.ori.{phase}.{old_language_name}')
                globs = glob(old_path_glob)
                assert len(globs) == 1
                old_path = globs[0]
                new_path = get_data_filepath(dataset, phase, new_language_name)
                shutil.copyfile(old_path, new_path)
                shutil.move(replace_lrb_rrb_file(new_path), new_path)
                add_newline_at_end_of_file(new_path)
    print('Done.')
def get_original_dataset(dataset):
    filepath = get_dataset_dir(dataset) / 'original_dataset'
    if not filepath.exists():
        return None
    [original_dataset] = read_lines(filepath)
    return original_dataset
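# Usage sketch (hypothetical dataset names): for a dataset written by
# create_preprocessed_dataset_one_preprocessor, this reads back the 'original_dataset' file.
#   get_original_dataset('wikilarge_preprocessed')  # -> 'wikilarge'
#   get_original_dataset('wikilarge')  # -> None, no 'original_dataset' file was written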
def train_roberta(
    dataset,
    sample_break_mode='complete',
    batch_size=8192,
    max_sentences=16,
    max_tokens=12000,
    tokens_per_sample=512,
    checkpoints_dir=None,
    distributed_world_size=None,
    sentencepiece_model_path=None,
    arch='roberta_base',
    dropout=0.1,
    total_updates=500000,
    log_interval=100,
    peak_lr=0.0007,
    clip_norm=None,
    no_epoch_checkpoint=False,
    validate_interval=1,
    save_interval=1,
    save_interval_updates=5000,
    warmup_updates=10000,
):
    preprocessed_dir = mlm_fairseq_preprocess(dataset)
    if checkpoints_dir is None:
        checkpoints_dir = get_fairseq_exp_dir() / 'checkpoints'
    checkpoints_dir = Path(checkpoints_dir)
    checkpoints_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy(get_dataset_dir(dataset) / 'sentencepiece.bpe.model', checkpoints_dir)
    shutil.copy(get_dataset_dir(dataset) / 'fairseq_preprocessed/dict.txt', checkpoints_dir)
    # Guard against distributed_world_size=None so the single-GPU default does not crash
    effective_batch_size = max_sentences * (distributed_world_size or 1)
    # assert batch_size % effective_batch_size == 0
    update_freq = int(round(batch_size / effective_batch_size, 0))
    print(f'batch_size={effective_batch_size * update_freq}')
    command = f'''
eval "$(conda shell.bash hook)"  # Needed to use conda activate in subshells
conda activate bert_fr
fairseq-train {preprocessed_dir} \
    --save-dir {checkpoints_dir} \
    --task masked_lm --criterion masked_lm \
    --arch {arch} --sample-break-mode {sample_break_mode} --tokens-per-sample {tokens_per_sample} \
    --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 --clip-norm 0.0 \
    --lr-scheduler polynomial_decay --lr {peak_lr} --warmup-updates {warmup_updates} --total-num-update {total_updates} \
    --dropout {dropout} --attention-dropout {dropout} --weight-decay 0.01 \
    --max-sentences {max_sentences} --update-freq {update_freq} \
    --max-update {total_updates} --log-format simple --log-interval {log_interval} \
    --skip-invalid-size-inputs-valid-test \
    --save-interval-updates {save_interval_updates} \
    --validate-interval {validate_interval} --save-interval {save_interval} \
    --tensorboard-logdir {TENSORBOARD_LOGS_DIR} --fast-stat-sync \
    --fp16 --seed 1
'''  # noqa
    command = command.strip(' ').strip('\n')
    if distributed_world_size is not None:
        command += f' --distributed-world-size {distributed_world_size} --distributed-port 53005'  # noqa
    if sentencepiece_model_path is not None:
        command += f' --bpe sentencepiece --sentencepiece-vocab {sentencepiece_model_path} --mask-whole-words'
    if max_tokens is not None:
        command += f' --max-tokens {max_tokens}'
    if clip_norm is not None:
        command += f' --clip-norm {clip_norm}'
    if no_epoch_checkpoint:
        command += ' --no-epoch-checkpoints'
    command = re.sub(' +', ' ', command)  # Remove multiple whitespaces
    print(command)
    run_command(command)
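# Worked example of the gradient-accumulation arithmetic above (hypothetical values):
# with batch_size=8192, max_sentences=16 and distributed_world_size=8, the per-step batch is
# 16 * 8 = 128 sentences, so update_freq = round(8192 / 128) = 64 and the simulated batch
# size is 128 * 64 = 8192 sentences per optimizer update.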
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import faiss

from muss.mining.preprocessing import create_base_index, get_index_name, get_sentences_paths
from muss.utils.helpers import yield_lines
from muss.laser import get_laser_embeddings
from muss.resources.paths import get_dataset_dir

# Create index
language = 'en'
n_train_sentences = 1000000
train_sentences = []
for sentences_path in get_sentences_paths(language='en'):
    for sentence in yield_lines(sentences_path):
        train_sentences.append(sentence)
        if len(train_sentences) == n_train_sentences:
            break
    if len(train_sentences) == n_train_sentences:
        break

get_embeddings = lambda sentences: get_laser_embeddings(sentences, max_tokens=3000, language=language)  # noqa: E731
output_dir = get_dataset_dir('uts') / f'base_indexes/laser_{language}'
output_dir.mkdir(exist_ok=True)
create_base_index(train_sentences, get_index_name(), get_embeddings, faiss.METRIC_INNER_PRODUCT, output_dir)
    compute_and_save_simplification_pairs,
    get_index_path,
    compute_and_save_embeddings,
    get_filter_string_representation,
    combine_simplifications_in_dataset,
    get_simplification_pairs_paths,
)
from muss.mining.filtering import SimplicityScorer

ccnet_dir = Path(
    input(
        'Please download the CCNet corpus from https://github.com/facebookresearch/cc_net and enter the path to the downloaded data: '
    )
)
language = input('What language do you want to process? (en/fr/es): ')
cluster = 'local'
dataset_dir = get_dataset_dir('uts') / language
# For large jobs only
slurm_partition = 'dev,scavenge'
slurm_array_parallelism = 1024

# Split CCNet shards into subshards
with log_action('Splitting CCNet shards into smaller subshards'):
    # We need to split each shard even more for the LASER embeddings to fit in memory
    n_shards = {  # Number of shards to take for each language for ~1B sentences
        'en': 15,
        'fr': 25,
        'es': 13,  # We would need about 20 shards for 1B sentences, but there are only 13
    }[language]
    ccnet_filepaths = [ccnet_dir / f'{language}_head_{i:04d}.json.gz' for i in range(n_shards)]