Example #1
def test_build_train_datasets_valid(data_opts_17):
    datasets = build_training_datasets(
        fieldset=build_fieldset(), **vars(data_opts_17)
    )
    assert len(datasets) == 2
    for dataset in datasets:
        assert type(dataset) == QEDataset
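A minimal sketch of the pattern shared by these tests: the option fixtures behave like argparse Namespaces, and **vars(...) turns every attribute into a keyword argument of build_training_datasets (the attribute names below are hypothetical):

from argparse import Namespace

# vars() converts the Namespace into a plain dict, so each attribute
# becomes a keyword argument when splatted with **.
opts = Namespace(train_source='train.src', train_target='train.tgt')
assert vars(opts) == {'train_source': 'train.src', 'train_target': 'train.tgt'}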
Example #2
def test_extend_vocabs(extend_vocab):
    options = extend_vocab
    OOV_SRC = 'oov_word_src'
    OOV_TGT = 'oov_word_tgt'

    fieldset = Predictor.fieldset(wmt18_format=options.wmt18_format)
    vocabs_fieldset = extend_vocabs_fieldset.build_fieldset(fieldset)

    dataset, _ = build_training_datasets(
        fieldset, extend_vocabs_fieldset=vocabs_fieldset, **vars(options))
    assert OOV_SRC in dataset.fields[constants.SOURCE].vocab.stoi
    assert OOV_TGT in dataset.fields[constants.TARGET].vocab.stoi

    fieldset = Predictor.fieldset(wmt18_format=options.wmt18_format)
    options.extend_source_vocab = None
    options.extend_target_vocab = None
    dataset, _ = build_training_datasets(fieldset, **vars(options))
    assert OOV_SRC not in dataset.fields[constants.SOURCE].vocab.stoi
    assert OOV_TGT not in dataset.fields[constants.TARGET].vocab.stoi
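The assertions above go through the vocabulary's stoi (string-to-index) mapping; a minimal sketch of that mapping, assuming the legacy torchtext Vocab API these tests were written against:

from collections import Counter

from torchtext.vocab import Vocab  # legacy (pre-0.9) torchtext API

# A vocabulary built over a token counter exposes `stoi`, a dict from token
# to index; extending the vocab means the OOV tokens get an entry here.
vocab = Vocab(Counter({'oov_word_src': 1}))
assert 'oov_word_src' in vocab.stoi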
Example #3
def test_build_train_datasets_no_valid(data_opts_no_validation, atol):
    datasets = build_training_datasets(
        fieldset=build_fieldset(), **vars(data_opts_no_validation)
    )
    assert len(datasets) == 2
    for dataset in datasets:
        assert type(dataset) == QEDataset
    train_size, dev_size = len(datasets[0]), len(datasets[1])
    np.testing.assert_allclose(
        train_size / (train_size + dev_size),
        data_opts_no_validation.split,
        atol=atol,
    )
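The tolerance is needed because an integer split can never hit the requested ratio exactly; a small arithmetic illustration with made-up sizes:

import numpy as np

# Splitting e.g. 103 samples into 92 train / 11 dev yields a realised ratio
# of ~0.893, which only matches a requested split of 0.9 up to a tolerance.
train_size, dev_size = 92, 11
np.testing.assert_allclose(train_size / (train_size + dev_size), 0.9, atol=0.01)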
Example #4
def check_qe_dataset(options):
    train_dataset, dev_dataset = build_training_datasets(
        fieldset=build_fieldset(), **vars(options)
    )

    train_iter = build_bucket_iterator(
        train_dataset, batch_size=8, is_train=True, device=None
    )

    dev_iter = build_bucket_iterator(
        dev_dataset, batch_size=8, is_train=False, device=None
    )

    for batch_train, batch_dev in zip(train_iter, dev_iter):
        train_source = getattr(batch_train, constants.SOURCE)
        if isinstance(train_source, tuple):
            train_source, lengths = train_source
        # torchtext batches are time-major; transpose to (batch, seq_len)
        train_source = train_source.t()
        train_prev_len = train_source.shape[1]
        # buckets should be sorted in decreasing length order
        # so we can use pack/padded sequences
        for train_sample in train_source:
            train_mask = train_sample != constants.PAD_ID
            train_cur_len = train_mask.int().sum().item()
            assert train_cur_len <= train_prev_len
            train_prev_len = train_cur_len

    source_field = train_dataset.fields[constants.SOURCE]
    target_field = train_dataset.fields[constants.TARGET]
    target_tags_field = train_dataset.fields[constants.TARGET_TAGS]

    # check if each token is in the vocab
    for train_sample, dev_sample in zip(train_dataset, dev_dataset):
        for word in getattr(train_sample, constants.SOURCE):
            assert word in source_field.vocab.stoi
        for word in getattr(train_sample, constants.TARGET):
            assert word in target_field.vocab.stoi
        for tag in getattr(train_sample, constants.TARGET_TAGS):
            assert tag in target_tags_field.vocab.stoi

        for word in getattr(dev_sample, constants.SOURCE):
            assert word in source_field.vocab.stoi
        for word in getattr(dev_sample, constants.TARGET):
            assert word in target_field.vocab.stoi
        for tag in getattr(dev_sample, constants.TARGET_TAGS):
            assert tag in target_tags_field.vocab.stoi
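The decreasing-length check above exists because padded batches are later fed to RNNs through pack_padded_sequence, which by default (enforce_sorted=True) expects the batch ordered by decreasing length; a minimal sketch:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# A (batch, seq_len) padded batch, already sorted by decreasing length,
# as the bucket iterator is expected to deliver it.
padded = torch.tensor([[1, 2, 3], [4, 5, 0]])  # 0 is the padding index
lengths = torch.tensor([3, 2])
packed = pack_padded_sequence(padded, lengths, batch_first=True)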
Example #5
def retrieve_datasets(fieldset, pipeline_options, model_options, output_dir):
    """
    Creates `Dataset` objects for the training and validation sets.

    Parses files according to pipeline and model options.

    Args:
        fieldset: Fields to be used for reading the data files.
        pipeline_options (Namespace): Generic training options:
            load_data (str): Input directory for loading preprocessed data
                files.
            load_model (str): Directory containing a model.torch file for
                loading a pre-created model.
            resume (bool): Whether to resume training from a previous run.
            load_vocab (str): Directory containing the vocab.torch file to be
                loaded.
        model_options (Namespace): Model specific options.
        output_dir (str): Path to directory where experiment files should be
            saved.

    Returns:
        datasets (Dataset): Training and validation datasets
    """
    if pipeline_options.load_data:
        datasets = utils.load_training_datasets(pipeline_options.load_data,
                                                fieldset)
    else:
        load_vocab = None

        if pipeline_options.resume:
            load_vocab = Path(output_dir, const.VOCAB_FILE)
        elif pipeline_options.load_model:
            load_vocab = pipeline_options.load_model
        elif model_options.__dict__.get("load_pred_source"):
            load_vocab = model_options.load_pred_source
        elif model_options.__dict__.get("load_pred_target"):
            load_vocab = model_options.load_pred_target
        elif pipeline_options.load_vocab:
            load_vocab = pipeline_options.load_vocab

        datasets = builders.build_training_datasets(fieldset,
                                                    load_vocab=load_vocab,
                                                    **vars(model_options))
    return datasets
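A hypothetical call sketch for retrieve_datasets, assuming the data options inside model_options are the ones consumed by builders.build_training_datasets (the attribute names and paths below are made up):

from argparse import Namespace

# No preprocessed data, no checkpoint to resume, no pretrained predictor:
# the vocabulary is built from scratch and the remaining model options are
# forwarded to builders.build_training_datasets.
pipeline_options = Namespace(
    load_data=None, load_model=None, resume=False, load_vocab=None
)
model_options = Namespace(
    train_source='data/train.src',  # hypothetical data options
    train_target='data/train.tgt',
)
train_dataset, valid_dataset = retrieve_datasets(
    build_fieldset(), pipeline_options, model_options, output_dir='runs/0'
)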