Example #1
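All of the snippets on this page come from the NMT-Keras test suite and share a handful of helpers (load_tests_params, build_dataset, train_model, sample_ensemble, score_corpus, clean_dirs) plus os, argparse and the Keras backend K. A plausible import block is sketched below; the exact module paths are an assumption and should be checked against the NMT-Keras repository.

# Assumed imports for the snippets on this page. Module paths are a guess
# based on the NMT-Keras repository layout and may differ in your checkout.
import argparse
import os

from keras import backend as K

from data_engine.prepare_data import build_dataset                # dataset construction
from nmt_keras.training import train_model                        # training entry point
from nmt_keras.apply_model import sample_ensemble, score_corpus   # decoding and scoring
from tests.utils import load_tests_params, clean_dirs             # test helpers (path assumed)
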
def test_ConditionalLSTM_dot():
    params = load_tests_params()

    # Current test params: single-layered LSTM - ConditionalLSTM with dot attention
    params['BIDIRECTIONAL_ENCODER'] = True
    params['N_LAYERS_ENCODER'] = 1
    params['BIDIRECTIONAL_DEEP_ENCODER'] = True
    params['ENCODER_RNN_TYPE'] = 'LSTM'
    params['DECODER_RNN_TYPE'] = 'ConditionalLSTM'
    params['N_LAYERS_DECODER'] = 1
    params['ATTENTION_MODE'] = 'dot'

    params['REBUILD_DATASET'] = True
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + params['ATTENTION_MODE'] + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    # The ArgumentParser instance is used as a plain attribute container here:
    # the tests set the attributes that parse_args() would normally produce.
    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    parser.text = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.splits = ['val']
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.target = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
Example #2
def test_unk_replace_1():
    params = load_tests_params()

    params['REBUILD_DATASET'] = True
    params['INPUT_VOCABULARY_SIZE'] = 0
    params['OUTPUT_VOCABULARY_SIZE'] = 50

    params['POS_UNK'] = True
    params['HEURISTIC'] = 1
    params['ALIGN_FROM_RAW'] = True

    dataset = build_dataset(params)
    # params['MAPPING'] = DATA_ROOT_PATH + '/mapping.%s_%s.pkl' % (SRC_LAN, TRG_LAN)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    parser.text = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.splits = ['val']
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.target = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
Example #3
def load_transformer_test_params():
    params = load_tests_params()
    params['MODEL_TYPE'] = 'Transformer'
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['MULTIHEAD_ATTENTION_ACTIVATION'] = 'relu'
    params['MODEL_SIZE'] = 8
    params['FF_SIZE'] = params['MODEL_SIZE'] * 4
    params['N_HEADS'] = 2
    params['REBUILD_DATASET'] = True
    params['OPTIMIZED_SEARCH'] = True
    params['POS_UNK'] = False
    return params
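A short usage sketch of the helper above: it only pre-fills the shared parameter dictionary, so a Transformer test can start from it and override just the settings that differ (function names as in the other snippets on this page):

# Minimal usage sketch of load_transformer_test_params().
params = load_transformer_test_params()
params['TIED_EMBEDDINGS'] = True   # e.g. additionally tie the embedding matrices
dataset = build_dataset(params)    # rebuild the dataset with these parameters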
Example #4
def test_sampling_maxlikelihood():
    params = load_tests_params()

    params['REBUILD_DATASET'] = True
    params['INPUT_VOCABULARY_SIZE'] = 550
    params['OUTPUT_VOCABULARY_SIZE'] = 550

    params['POS_UNK'] = True
    params['HEURISTIC'] = 0
    params['ALIGN_FROM_RAW'] = True

    # Sampling params: Show some samples during training.
    params['SAMPLE_ON_SETS'] = ['train', 'val']
    params['N_SAMPLES'] = 10
    params['START_SAMPLING_ON_EPOCH'] = 0
    params['SAMPLE_EACH_UPDATES'] = 50
    params['SAMPLING'] = 'max_likelihood'

    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models',
                                        params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    print("Done")
Example #5
def test_transformer():
    params = load_tests_params()

    # Current test params: Transformer
    params['MODEL_TYPE'] = 'Transformer'
    params['TIED_EMBEDDINGS'] = True
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['MULTIHEAD_ATTENTION_ACTIVATION'] = 'relu'
    params['MODEL_SIZE'] = 8
    params['FF_SIZE'] = params['MODEL_SIZE'] * 4
    params['N_HEADS'] = 2
    params['REBUILD_DATASET'] = True
    params['OPTIMIZED_SEARCH'] = True
    params['POS_UNK'] = False
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]

    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_model_size_' + str(params['MODEL_SIZE']) + \
        '_ff_size_' + str(params['FF_SIZE']) + \
        '_num_heads_' + str(params['N_HEADS']) + \
        '_encoder_blocks_' + str(params['N_LAYERS_ENCODER']) + \
        '_decoder_blocks_' + str(params['N_LAYERS_DECODER']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models',
                                        params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'], 'Dataset_' + params['DATASET_NAME'] +
        '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    parser.text = os.path.join(params['DATA_ROOT_PATH'],
                               params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.splits = ['val']
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = os.path.join(
        params['DATA_ROOT_PATH'],
        params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.target = os.path.join(
        params['DATA_ROOT_PATH'],
        params['TEXT_FILES']['val'] + params['TRG_LAN'])
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
    clean_dirs(params)
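Only this last test cleans up after itself. clean_dirs ships with the test suite; a minimal stand-in, assuming it merely deletes the model directory the test wrote, might look like this (the real helper may remove more, e.g. the stored dataset):

import shutil

# Hypothetical stand-in for the test suite's clean_dirs helper, assuming it
# only removes the model store directory created during the test.
def clean_dirs(params):
    shutil.rmtree(params['STORE_PATH'], ignore_errors=True)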