Example #1
def test_ConditionalLSTM_dot():
    params = load_tests_params()

    # Current test params: Single layered LSTM - ConditionalLSTM
    params['BIDIRECTIONAL_ENCODER'] = True
    params['N_LAYERS_ENCODER'] = 1
    params['BIDIRECTIONAL_DEEP_ENCODER'] = True
    params['ENCODER_RNN_TYPE'] = 'LSTM'
    params['DECODER_RNN_TYPE'] = 'ConditionalLSTM'
    params['N_LAYERS_DECODER'] = 1
    params['ATTENTION_MODE'] = 'dot'

    params['REBUILD_DATASET'] = True
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + params['ATTENTION_MODE'] + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    parser.text = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.splits = ['val']
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.target = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
Example #2
def test_transformer():
    params = load_tests_params()

    # Current test params: Transformer
    params['MODEL_TYPE'] = 'Transformer'
    params['TIED_EMBEDDINGS'] = True
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['MULTIHEAD_ATTENTION_ACTIVATION'] = 'relu'
    params['MODEL_SIZE'] = 8
    params['FF_SIZE'] = params['MODEL_SIZE'] * 4
    params['N_HEADS'] = 2
    params['REBUILD_DATASET'] = True
    params['OPTIMIZED_SEARCH'] = False
    params['POS_UNK'] = False
    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_model_size_' + str(params['MODEL_SIZE']) + \
        '_ff_size_' + str(params['FF_SIZE']) + \
        '_num_heads_' + str(params['N_HEADS']) + \
        '_encoder_blocks_' + str(params['N_LAYERS_ENCODER']) + \
        '_decoder_blocks_' + str(params['N_LAYERS_DECODER']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])

    params['STORE_PATH'] = K.backend() + '_test_train_models/' + params['MODEL_NAME'] + '/'

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print ("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print ("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl'

    parser.text = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN']
    parser.splits = ['val']
    parser.config = params['STORE_PATH'] + '/config.pkl'
    parser.models = [params['STORE_PATH'] + '/epoch_' + str(1)]
    parser.verbose = 0
    parser.dest = None
    parser.source = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['SRC_LAN']
    parser.target = params['DATA_ROOT_PATH'] + '/' + params['TEXT_FILES']['val'] + params['TRG_LAN']
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print ("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print ("Done")

    print ("Scoring corpus")
    score_corpus(parser, params)
    print ("Done")
Example #3
def test_unk_replace_1():
    params = load_tests_params()

    params['REBUILD_DATASET'] = True
    params['INPUT_VOCABULARY_SIZE'] = 0
    params['OUTPUT_VOCABULARY_SIZE'] = 50

    params['POS_UNK'] = True
    params['HEURISTIC'] = 1
    params['ALIGN_FROM_RAW'] = True

    dataset = build_dataset(params)
    # params['MAPPING'] = DATA_ROOT_PATH + '/mapping.%s_%s.pkl' % (SRC_LAN, TRG_LAN)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models', params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    params['RELOAD'] = 1
    print("Done")

    parser = argparse.ArgumentParser('Parser for unit testing')
    parser.dataset = os.path.join(
        params['DATASET_STORE_PATH'],
        'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')

    parser.text = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.splits = ['val']
    parser.config = os.path.join(params['STORE_PATH'], 'config.pkl')
    parser.models = [os.path.join(params['STORE_PATH'], 'epoch_' + str(1))]
    parser.verbose = 0
    parser.dest = None
    parser.source = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['SRC_LAN'])
    parser.target = os.path.join(params['DATA_ROOT_PATH'], params['TEXT_FILES']['val'] + params['TRG_LAN'])
    parser.weights = []
    parser.glossary = None

    for n_best in [True, False]:
        parser.n_best = n_best
        print("Sampling with n_best = %s " % str(n_best))
        sample_ensemble(parser, params)
        print("Done")

    print("Scoring corpus")
    score_corpus(parser, params)
    print("Done")
Example #4
def resume_training(latest_epoch, use_gpu):

    params = load_parameters()
    params['MODEL_TYPE'] = 'AttentionRNNEncoderDecoder'
    params['USE_CUDNN'] = use_gpu
    params['N_GPUS'] = 2
    params['MAX_EPOCH'] = latest_epoch + 1000
    params['BATCH_SIZE'] = 128
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 512
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 512
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 512
    params['ATTENTION_SIZE'] = 512
    params['ENCODER_HIDDEN_SIZE'] = 512
    params['DECODER_HIDDEN_SIZE'] = 512
    params['ENCODER_RNN_TYPE'] = "LSTM"
    params['DECODER_RNN_TYPE'] = "ConditionalLSTM"
    params['METRICS'] = ['coco']
    params['KERAS_METRICS'] = ['perplexity']
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['RELOAD'] = latest_epoch
    params['BEAM_SIZE'] = 1
    params['BEAM_SEARCH'] = True
    params['PLOT_EVALUATION'] = True
    params['MAX_PLOT_Y'] = 1.
    params['MODE'] = 'training'
    params['TENSORBOARD'] = True

    result = pyfiglet.figlet_format("RESUME TRAINING".format(mode),
                                    font="digital")
    print(result)
    train_model(params,
                load_dataset=os.getcwd() +
                "/dataset/Dataset_tutorial_dataset.pkl")
Example #5
def test_sampling_maxlikelihood():
    params = load_tests_params()

    params['REBUILD_DATASET'] = True
    params['INPUT_VOCABULARY_SIZE'] = 550
    params['OUTPUT_VOCABULARY_SIZE'] = 550

    params['POS_UNK'] = True
    params['HEURISTIC'] = 0
    params['ALIGN_FROM_RAW'] = True

    # Sampling params: Show some samples during training.
    params['SAMPLE_ON_SETS'] = ['train', 'val']
    params['N_SAMPLES'] = 10
    params['START_SAMPLING_ON_EPOCH'] = 0
    params['SAMPLE_EACH_UPDATES'] = 50
    params['SAMPLING'] = 'max_likelihood'

    dataset = build_dataset(params)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    params['MODEL_NAME'] = \
        params['TASK_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '_' + params['MODEL_TYPE'] + \
        '_src_emb_' + str(params['SOURCE_TEXT_EMBEDDING_SIZE']) + \
        '_bidir_' + str(params['BIDIRECTIONAL_ENCODER']) + \
        '_enc_' + params['ENCODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_ENCODER']) + '_' + str(
            params['ENCODER_HIDDEN_SIZE']) + \
        '_dec_' + params['DECODER_RNN_TYPE'] + '_*' + str(params['N_LAYERS_DECODER']) + '_' + str(
            params['DECODER_HIDDEN_SIZE']) + \
        '_deepout_' + '_'.join([layer[0] for layer in params['DEEP_OUTPUT_LAYERS']]) + \
        '_trg_emb_' + str(params['TARGET_TEXT_EMBEDDING_SIZE']) + \
        '_' + params['OPTIMIZER'] + '_' + str(params['LR'])
    params['STORE_PATH'] = os.path.join(K.backend() + '_test_train_models',
                                        params['MODEL_NAME'])

    # Test several NMT-Keras utilities: train, sample, sample_ensemble, score_corpus...
    print("Training model")
    train_model(params)
    print("Done")
Example #6
def resume_training(latest_epoch, use_gpu):
    params = load_parameters()
    params['RELOAD'] = latest_epoch
    params['MODEL_TYPE'] = 'Transformer'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['N_HEADS'] = 100
    params['POS_UNK'] = False  # current Transformer model requires this
    params['ATTEND_ON_OUTPUT'] = True  # current Transformer model requires this
    params['MODEL_SIZE'] = 100
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 100
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 100
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 100
    params['ENCODER_HIDDEN_SIZE'] = 100
    params['DECODER_HIDDEN_SIZE'] = 100
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 0.8
    params['MAX_INPUT_TEXT_LEN'] = 128
    params['MAX_OUTPUT_TEXT_LEN'] = 128
    params['STOP_METRIC'] = 'perplexity'
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 128
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 100
    params['PLOT_EVALUATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    train_model(params,
                load_dataset=MODEL_PATH +
                "/dataset/Dataset_tutorial_dataset.pkl")
Example #7
def resume_training(latest_epoch, use_gpu):
    params = load_parameters()
    params['RELOAD'] = latest_epoch
    params['MODEL_TYPE'] = 'AttentionRNNEncoderDecoder'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 32
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 32
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 32
    params['ATTENTION_SIZE'] = 32
    params['ENCODER_HIDDEN_SIZE'] = 32
    params['DECODER_HIDDEN_SIZE'] = 32
    params['N_LAYERS_ENCODER'] = 4
    params['N_LAYERS_DECODER'] = 4
    params['APPLY_DETOKENIZATION'] = True
    params['MAX_INPUT_TEXT_LEN'] = 24
    params['MAX_OUTPUT_TEXT_LEN'] = 24
    params['STOP_METRIC'] = 'perplexity'
    params['POS_UNK'] = True
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 256
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 300
    params['PLOT_EVALUATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    params['LR'] = 0.1
    train_model(params,
                load_dataset=MODEL_PATH +
                "/dataset/Dataset_tutorial_dataset.pkl")
Example #8
    parameters = load_parameters()
    if args.config is not None:
        parameters = update_parameters(parameters, pkl2dict(args.config))
    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print(
                    'Overwritten arguments must have the form key=Value.\nCurrently they are: %s'
                    % str(args.changes))
                exit(1)
            try:
                parameters[k] = ast.literal_eval(v)
            except ValueError:
                parameters[k] = v
    except ValueError:
        print('Error processing arguments: (', k, ",", v, ")")
        exit(2)

    parameters = check_params(parameters)
    if parameters['MODE'] == 'training':
        logger.info('Running training.')
        train_model(parameters, args.dataset)
    elif parameters['MODE'] == 'sampling':
        logger.error(
            'Deprecated function. For sampling from a trained model, please run sample_ensemble.py.'
        )
        exit(2)
    logger.info('Done!')
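The snippet above lets any configuration key be overridden from the command line as `KEY=value`, parsing the value with `ast.literal_eval` and falling back to a plain string. A small illustration of that parsing rule (illustrative values only):

import ast

def parse_override(arg):
    # 'BATCH_SIZE=64'  -> ('BATCH_SIZE', 64)      (a Python literal, parsed to int)
    # 'OPTIMIZER=adam' -> ('OPTIMIZER', 'adam')   (not a literal, kept as a string)
    k, v = arg.split('=')
    try:
        return k, ast.literal_eval(v)
    except ValueError:
        return k, v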
Example #9
def start_training(use_gpu):

    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)
    ds.setOutput(DATA_PATH + "train_y.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=30,
                 max_words=30000,
                 min_occ=0)

    ds.setOutput(DATA_PATH + "val_y.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=30,
                 max_words=0)

    ds.setInput(DATA_PATH + "train_x.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=30,
                max_words=30000,
                min_occ=0)

    ds.setInput(DATA_PATH + "val_x.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=30,
                min_occ=0)

    ds.setInput(DATA_PATH + "train_y.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=30,
                max_words=30000)

    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)

    for split, input_text_filename in zip(
        ['train', 'val'],
        [DATA_PATH + "train_x.txt", DATA_PATH + "val_x.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)
    """We also need to match the references with the inputs. Since we only have one reference per input sample, we set `repeat=1`."""

    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])
    """Finally, we can save our dataset instance for using in other experiments:"""

    saveDataset(ds, MODEL_PATH + "/dataset")
    """## 2. Creating and training a Neural Translation Model
    Now, we'll create and train a Neural Machine Translation (NMT) model. Since there is a significant number of hyperparameters, we'll use the default ones, specified in the `config.py` file. Note that almost every hardcoded parameter is automatically set from config if we run `main.py`.

    We'll create an `'AttentionRNNEncoderDecoder'` (an LSTM encoder-decoder with attention mechanism). Refer to the [`model_zoo.py`](https://github.com/lvapeab/nmt-keras/blob/master/nmt_keras/model_zoo.py) file for other models (e.g. the Transformer).

    So first, let's import the model and the hyperparameters. We'll also load the dataset we stored in the previous section (not necessary as it is in memory, but as a demonstration):
    """

    params = load_parameters()
    dataset = loadDataset(MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")
    """Since the number of words in the dataset may be unknown beforehand, we must update the params information according to the dataset instance:"""

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']
    params['MODEL_TYPE'] = 'Transformer'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['N_HEADS'] = 100
    params['POS_UNK'] = False  # current Transformer model requires this
    params['ATTEND_ON_OUTPUT'] = True  # current Transformer model requires this
    params['MODEL_SIZE'] = 100
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 100
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 100
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 100
    params['ENCODER_HIDDEN_SIZE'] = 100
    params['DECODER_HIDDEN_SIZE'] = 100
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 0.8
    params['MAX_INPUT_TEXT_LEN'] = 128
    params['MAX_OUTPUT_TEXT_LEN'] = 128
    params['STOP_METRIC'] = 'perplexity'
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 128
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 100
    params['PLOT_EVALUATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True
    train_model(params,
                load_dataset=MODEL_PATH +
                "/dataset/Dataset_tutorial_dataset.pkl")
Example #10
def start_training(use_gpu):

    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)
    ds.setOutput(PATH + "train_correct.txt",
                 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=100,
                 max_words=55000,
                 min_occ=1)

    ds.setOutput(PATH + "validation_correct.txt",
                 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=100,
                 max_words=0)

    ds.setInput(PATH + "train_error.txt",
                'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=100,
                max_words=55000,
                min_occ=1)

    ds.setInput(PATH + "validation_error.txt",
                'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=100,
                min_occ=1)
    """...and for the 'state_below' data. Note that: 1) The offset flat is set to 1, which means that the text will be shifted to the right 1 position. 2) During sampling time, we won't have this input. Hence, we 'hack' the dataset model by inserting an artificial input, of type 'ghost' for the validation split."""

    ds.setInput(PATH + "train_correct.txt",
                'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=100,
                max_words=55000)
    ds.setInput(None, 'val', type='ghost', id='state_below', required=False)
    """We can also keep the literal source words (for replacing unknown words)."""

    for split, input_text_filename in zip(
        ['train', 'val'],
        [PATH + "train_error.txt", PATH + "validation_error.txt"]):
        ds.setRawInput(input_text_filename,
                       split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)
    """We also need to match the references with the inputs. Since we only have one reference per input sample, we set `repeat=1`."""

    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])
    """Finally, we can save our dataset instance for using in other experiments:"""

    saveDataset(ds, PATH + "dataset")
    """## 2. Creating and training a Neural Translation Model
    Now, we'll create and train a Neural Machine Translation (NMT) model. Since there is a significant number of hyperparameters, we'll use the default ones, specified in the `config.py` file. Note that almost every hardcoded parameter is automatically set from config if we run `main.py`.

    We'll create an `'AttentionRNNEncoderDecoder'` (an LSTM encoder-decoder with attention mechanism). Refer to the [`model_zoo.py`](https://github.com/lvapeab/nmt-keras/blob/master/nmt_keras/model_zoo.py) file for other models (e.g. the Transformer).

    So first, let's import the model and the hyperparameters. We'll also load the dataset we stored in the previous section (not necessary as it is in memory, but as a demonstration):
    """

    params = load_parameters()
    dataset = loadDataset(PATH + "dataset/Dataset_tutorial_dataset.pkl")
    """Since the number of words in the dataset may be unknown beforehand, we must update the params information according to the dataset instance:"""

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']
    params['USE_CUDNN'] = use_gpu
    params['N_GPUS'] = 2
    params['MAX_EPOCH'] = 1000
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['BATCH_SIZE'] = 128
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 512
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 512
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 512
    params['ATTENTION_SIZE'] = 512
    params['ENCODER_HIDDEN_SIZE'] = 512
    params['DECODER_HIDDEN_SIZE'] = 512
    params['ENCODER_RNN_TYPE'] = "LSTM"
    params['DECODER_RNN_TYPE'] = "ConditionalLSTM"
    params['METRICS'] = ['coco']
    params['KERAS_METRICS'] = ['perplexity']
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['BEAM_SIZE'] = 1
    params['BEAM_SEARCH'] = True
    params['PLOT_EVALUATION'] = True
    params['MAX_PLOT_Y'] = 1.
    params['MODE'] = 'training'
    params['TENSORBOARD'] = True

    result = pyfiglet.figlet_format("START TRAINING FROM SCRATCH".format(mode),
                                    font="digital")
    print(result)
    train_model(params,
                load_dataset=os.getcwd() +
                "/dataset/Dataset_tutorial_dataset.pkl")