def test_build_train_datasets_valid(data_opts_17):
    datasets = build_training_datasets(
        fieldset=build_fieldset(), **vars(data_opts_17)
    )
    assert len(datasets) == 2
    for dataset in datasets:
        assert type(dataset) == QEDataset
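# For reference, a minimal sketch of what a fixture like `data_opts_17`
# might provide. Every attribute name and path below is an assumption for
# illustration only; the real fixture lives in the suite's conftest.py.
import argparse

import pytest


@pytest.fixture
def data_opts_17_sketch():
    # A hypothetical option namespace, consumed via **vars(options) by
    # build_training_datasets.
    return argparse.Namespace(
        train_source='tests/data/WMT17/train.src',
        train_target='tests/data/WMT17/train.mt',
        train_target_tags='tests/data/WMT17/train.tags',
        valid_source='tests/data/WMT17/dev.src',
        valid_target='tests/data/WMT17/dev.mt',
        valid_target_tags='tests/data/WMT17/dev.tags',
    )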
def test_extend_vocabs(extend_vocab):
    options = extend_vocab
    OOV_SRC = 'oov_word_src'
    OOV_TGT = 'oov_word_tgt'

    # With the extra vocab files, the OOV words must end up in the vocabs.
    fieldset = Predictor.fieldset(wmt18_format=options.wmt18_format)
    vocabs_fieldset = extend_vocabs_fieldset.build_fieldset(fieldset)
    dataset, _ = build_training_datasets(
        fieldset, extend_vocabs_fieldset=vocabs_fieldset, **vars(options)
    )
    assert OOV_SRC in dataset.fields[constants.SOURCE].vocab.stoi
    assert OOV_TGT in dataset.fields[constants.TARGET].vocab.stoi

    # Without them, the same words must stay out of the vocabs.
    fieldset = Predictor.fieldset(wmt18_format=options.wmt18_format)
    options.extend_source_vocab = None
    options.extend_target_vocab = None
    dataset, _ = build_training_datasets(fieldset, **vars(options))
    assert OOV_SRC not in dataset.fields[constants.SOURCE].vocab.stoi
    assert OOV_TGT not in dataset.fields[constants.TARGET].vocab.stoi
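# The `extend_vocab` fixture is assumed to add, on top of the usual data
# options, the attributes this test reads and later resets. A hypothetical
# sketch (names mirror the test above; values are illustrative):
@pytest.fixture
def extend_vocab_sketch(data_opts_17_sketch):
    options = data_opts_17_sketch
    options.wmt18_format = False
    # Extra corpora that contain 'oov_word_src' / 'oov_word_tgt'.
    options.extend_source_vocab = 'tests/data/extend_vocab.src'
    options.extend_target_vocab = 'tests/data/extend_vocab.tgt'
    return options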
def test_build_train_datasets_no_valid(data_opts_no_validation, atol):
    datasets = build_training_datasets(
        fieldset=build_fieldset(), **vars(data_opts_no_validation)
    )
    assert len(datasets) == 2
    for dataset in datasets:
        assert type(dataset) == QEDataset

    train_size, dev_size = len(datasets[0]), len(datasets[1])
    np.testing.assert_allclose(
        train_size / (train_size + dev_size),
        data_opts_no_validation.split,
        atol=atol,
    )
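# To make the split assertion concrete: with split=0.9 over 1000 samples,
# the builder should hand back roughly 900 train / 100 dev. A standalone
# instance of the same check, with illustrative numbers:
train_size, dev_size = 896, 104
np.testing.assert_allclose(
    train_size / (train_size + dev_size), 0.9, atol=0.01
)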
def check_qe_dataset(options):
    train_dataset, dev_dataset = build_training_datasets(
        fieldset=build_fieldset(), **vars(options)
    )
    train_iter = build_bucket_iterator(
        train_dataset, batch_size=8, is_train=True, device=None
    )
    dev_iter = build_bucket_iterator(
        dev_dataset, batch_size=8, is_train=False, device=None
    )
    for batch_train, batch_dev in zip(train_iter, dev_iter):
        train_source = getattr(batch_train, constants.SOURCE)
        if isinstance(train_source, tuple):
            train_source, lengths = train_source
        train_prev_len = train_source.shape[1]
        # Buckets should be sorted in decreasing length order,
        # so we can use packed/padded sequences.
        for train_sample in train_source:
            train_mask = train_sample != constants.PAD_ID
            train_cur_len = train_mask.int().sum().item()
            assert train_cur_len <= train_prev_len
            train_prev_len = train_cur_len

    source_field = train_dataset.fields[constants.SOURCE]
    target_field = train_dataset.fields[constants.TARGET]
    target_tags_field = train_dataset.fields[constants.TARGET_TAGS]

    # Check that every token in both splits is in the vocab.
    for train_sample, dev_sample in zip(train_dataset, dev_dataset):
        for word in getattr(train_sample, constants.SOURCE):
            assert word in source_field.vocab.stoi
        for word in getattr(train_sample, constants.TARGET):
            assert word in target_field.vocab.stoi
        for tag in getattr(train_sample, constants.TARGET_TAGS):
            assert tag in target_tags_field.vocab.stoi
        for word in getattr(dev_sample, constants.SOURCE):
            assert word in source_field.vocab.stoi
        for word in getattr(dev_sample, constants.TARGET):
            assert word in target_field.vocab.stoi
        for tag in getattr(dev_sample, constants.TARGET_TAGS):
            assert tag in target_tags_field.vocab.stoi
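# `check_qe_dataset` is a helper rather than a test itself; a hypothetical
# test that exercises it with the fixture above might look like:
def test_qe_dataset_17(data_opts_17):
    check_qe_dataset(data_opts_17)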
def retrieve_datasets(fieldset, pipeline_options, model_options, output_dir):
    """Create `Dataset` objects for the training and validation sets.

    Parses files according to pipeline and model options.

    Args:
        fieldset: The fieldset to use for parsing the data.
        pipeline_options (Namespace): Generic training options, of which the
            following are used here:
            load_data (str): Input directory for loading preprocessed data
                files.
            load_model (str): Directory containing model.torch for loading a
                pre-created model.
            resume (bool): Whether to resume training from a previous run.
            load_vocab (str): Directory containing the vocab.torch file to
                be loaded.
        model_options (Namespace): Model-specific options.
        output_dir (str): Path to the directory where experiment files
            should be saved.

    Returns:
        datasets: Training and validation datasets.
    """
    if pipeline_options.load_data:
        datasets = utils.load_training_datasets(
            pipeline_options.load_data, fieldset
        )
    else:
        # Choose the vocabulary to load, in order of precedence.
        load_vocab = None
        if pipeline_options.resume:
            load_vocab = Path(output_dir, const.VOCAB_FILE)
        elif pipeline_options.load_model:
            load_vocab = pipeline_options.load_model
        elif getattr(model_options, "load_pred_source", None):
            load_vocab = model_options.load_pred_source
        elif getattr(model_options, "load_pred_target", None):
            load_vocab = model_options.load_pred_target
        elif pipeline_options.load_vocab:
            load_vocab = pipeline_options.load_vocab

        datasets = builders.build_training_datasets(
            fieldset, load_vocab=load_vocab, **vars(model_options)
        )
    return datasets
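# A hedged usage sketch of retrieve_datasets. Both namespaces below are
# hypothetical stand-ins; in the real pipeline they come from the parsed
# configuration, and model_options would also carry the corpus paths that
# build_training_datasets expects.
from argparse import Namespace

pipeline_options = Namespace(
    load_data=None,  # parse raw files instead of loading preprocessed ones
    load_model=None,
    resume=False,
    load_vocab=None,
)
model_options = Namespace()  # plus data options, e.g. train_source=...
train_dataset, valid_dataset = retrieve_datasets(
    fieldset=Predictor.fieldset(wmt18_format=False),
    pipeline_options=pipeline_options,
    model_options=model_options,
    output_dir='runs/example',
)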