import pytest

# The helpers below are assumed to be importable from the project under test
# (e.g., in nmt-keras: load_parameters from config, build_dataset and
# keep_n_captions from data_engine.prepare_data).


def test_keep_n_captions():
    params = load_parameters()
    params['REBUILD_DATASET'] = True
    params['DATASET_STORE_PATH'] = './'
    ds = build_dataset(params)
    len_splits = {'train': 9900, 'val': 100, 'test': 2996}
    for splits in [[], None, ['val'], ['val', 'test']]:
        keep_n_captions(ds, 1, n=1, set_names=splits)
        if splits is not None:
            for split in splits:
                len_split = len_splits[split]
                assert eval('ds.len_' + split) == len_split
                assert eval('all(ds.loaded_' + split + ')')
                assert len(eval('ds.X_' + split + str([params['INPUTS_IDS_DATASET'][0]]))) == len_split
                assert len(eval('ds.Y_' + split + str([params['OUTPUTS_IDS_DATASET'][0]]))) == len_split


if __name__ == '__main__':
    pytest.main([__file__])
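# Side note: the eval-based assertions above can also be written with getattr,
# which avoids building Python source strings at runtime. A minimal sketch of
# the same per-split checks (the helper name _check_split is ours, not part of
# the original test suite):
def _check_split(ds, params, split, expected_len):
    assert getattr(ds, 'len_' + split) == expected_len
    assert all(getattr(ds, 'loaded_' + split))
    assert len(getattr(ds, 'X_' + split)[params['INPUTS_IDS_DATASET'][0]]) == expected_len
    assert len(getattr(ds, 'Y_' + split)[params['OUTPUTS_IDS_DATASET'][0]]) == expected_len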
from keras_wrapper.extra import evaluation

# Store the control hypotheses on disk (list2file is assumed to be imported
# from keras_wrapper.extra.read_write) and register the test references.
Control_path = 'Control_M7.pred'
list2file(Control_path, Control_predictions)

dataset.setOutput('data/Ross_test.reply', 'test',
                  type='text',
                  id='target_text',
                  pad_on_batch=True,
                  tokenization='tokenize_basic',
                  sample_weights=True,
                  max_text_len=30,
                  max_words=0)
keep_n_captions(dataset, repeat=1, n=1, set_names=['test'])

# Evaluate the control predictions against the test references with the 'coco' metrics package.
metric = 'coco'
extra_vars = dict()
extra_vars['tokenize_f'] = eval('dataset.' + 'tokenize_basic')
extra_vars['language'] = params['TRG_LAN']
extra_vars['test'] = dict()
extra_vars['test']['references'] = dataset.extra_variables['test']['target_text']

Control_metrics = evaluation.select[metric](pred_list=Control_predictions,
                                            verbose=1,
                                            extra_vars=extra_vars,
                                            split='test')
print("Control:")
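# A minimal follow-up sketch for inspecting the result (assumption: the 'coco'
# evaluator returns a dict mapping metric names such as 'Bleu_4' or 'CIDEr' to
# numeric scores; the loop below just prints them under the "Control:" header).
for metric_name, score in sorted(Control_metrics.items()):
    print('  {}: {}'.format(metric_name, score))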
            pad_on_batch=True, required=False)  # closing arguments of a preceding setInput call (not shown)

ds.setOutput('data/Cornell_train_reply.en', 'train',
             type='text',
             id='target_text',
             tokenization='tokenize_basic',
             build_vocabulary=True,
             pad_on_batch=True,
             sample_weights=True,
             max_text_len=30,
             max_words=30000,
             min_occ=0)
ds.setOutput('data/Cornell_valid_reply.en', 'val',
             type='text',
             id='target_text',
             pad_on_batch=True,
             tokenization='tokenize_basic',
             sample_weights=True,
             max_text_len=30,
             max_words=0)
# ds.merge_vocabularies(['source_text', 'target_text'])

keep_n_captions(ds, repeat=1, n=1, set_names=['val'])
saveDataset(ds, 'query_to_reply')
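# A minimal reload sketch (assumptions: the Dataset instance was created with
# the name 'tutorial_dataset', so saveDataset wrote
# 'query_to_reply/Dataset_tutorial_dataset.pkl', and loadDataset lives in
# keras_wrapper.dataset, as in the training scripts below).
from keras_wrapper.dataset import loadDataset

reloaded_ds = loadDataset('query_to_reply/Dataset_tutorial_dataset.pkl')
print(reloaded_ds.vocabulary_len['source_text'],
      reloaded_ds.vocabulary_len['target_text'])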
def start_training(use_gpu):

    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    ds.setOutput(DATA_PATH + "train_y.txt", 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=30,
                 max_words=30000,
                 min_occ=0)
    ds.setOutput(DATA_PATH + "val_y.txt", 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=30,
                 max_words=0)

    ds.setInput(DATA_PATH + "train_x.txt", 'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=30,
                max_words=30000,
                min_occ=0)
    ds.setInput(DATA_PATH + "val_x.txt", 'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=30,
                min_occ=0)

    ds.setInput(DATA_PATH + "train_y.txt", 'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=30,
                max_words=30000)
    ds.setInput(None, 'val',
                type='ghost',
                id='state_below',
                required=False)

    for split, input_text_filename in zip(
            ['train', 'val'],
            [DATA_PATH + "train_x.txt", DATA_PATH + "val_x.txt"]):
        ds.setRawInput(input_text_filename, split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    """We also need to match the references with the inputs. Since we only have
    one reference per input sample, we set `repeat=1`."""

    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    """Finally, we can save our dataset instance for use in other experiments:"""

    saveDataset(ds, MODEL_PATH + "/dataset")

    """## 2. Creating and training a Neural Translation Model

    Now, we'll create and train a Neural Machine Translation (NMT) model. Since
    there is a significant number of hyperparameters, we'll use the default ones,
    specified in the `config.py` file. Note that almost every hardcoded parameter
    is automatically set from the config if we run `main.py`.

    The default model is an `'AttentionRNNEncoderDecoder'` (an LSTM
    encoder-decoder with an attention mechanism); in this example we'll switch to
    the `'Transformer'` model instead. Refer to the
    [`model_zoo.py`](https://github.com/lvapeab/nmt-keras/blob/master/nmt_keras/model_zoo.py)
    file for the available models. So first, let's import the model and the
    hyperparameters.
    We'll also load the dataset we stored in the previous section (not strictly
    necessary, since it is already in memory, but as a demonstration):"""

    params = load_parameters()
    dataset = loadDataset(MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")

    """Since the number of words in the dataset may be unknown beforehand, we
    must update the params information according to the dataset instance:"""

    # Vocabulary sizes are taken from the dataset instance, as noted above.
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']

    params['MODEL_TYPE'] = 'Transformer'
    params['USE_CUDNN'] = use_gpu
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = MODEL_PATH
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['N_HEADS'] = 100
    params['POS_UNK'] = False  # current Transformer model requires this
    params['ATTEND_ON_OUTPUT'] = True  # current Transformer model requires this
    params['MODEL_SIZE'] = 100
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 100
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 100
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 100
    params['ENCODER_HIDDEN_SIZE'] = 100
    params['DECODER_HIDDEN_SIZE'] = 100
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 0.8
    params['MAX_INPUT_TEXT_LEN'] = 128
    params['MAX_OUTPUT_TEXT_LEN'] = 128
    params['STOP_METRIC'] = 'perplexity'
    params['BEAM_SIZE'] = 20
    params['N_GPUS'] = 2
    params['START_EVAL_ON_EPOCH'] = 1
    params['BATCH_SIZE'] = 128
    params['EVAL_EACH'] = 1
    params['MAX_EPOCH'] = 100
    params['PLOT_EVALUATION'] = True
    params['MODE'] = 'training'
    params['BEAM_SEARCH'] = True
    params['TENSORBOARD'] = True

    train_model(params,
                load_dataset=MODEL_PATH + "/dataset/Dataset_tutorial_dataset.pkl")
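# A minimal, hypothetical entry point for the script above (the command-line
# flag name and its default are our own choices, not part of the original
# tutorial): run with --gpu to enable the cuDNN/GPU code path.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Train the tutorial NMT model.")
    parser.add_argument("--gpu", action="store_true",
                        help="train using the GPU (cuDNN) implementation")
    args = parser.parse_args()
    start_training(use_gpu=args.gpu)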
def start_training(use_gpu):

    ds = Dataset('tutorial_dataset', 'tutorial', silence=False)

    ds.setOutput(PATH + "train_correct.txt", 'train',
                 type='text',
                 id='target_text',
                 tokenization='tokenize_basic',
                 build_vocabulary=True,
                 pad_on_batch=True,
                 sample_weights=True,
                 max_text_len=100,
                 max_words=55000,
                 min_occ=1)
    ds.setOutput(PATH + "validation_correct.txt", 'val',
                 type='text',
                 id='target_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 sample_weights=True,
                 max_text_len=100,
                 max_words=0)

    ds.setInput(PATH + "train_error.txt", 'train',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                build_vocabulary=True,
                fill='end',
                max_text_len=100,
                max_words=55000,
                min_occ=1)
    ds.setInput(PATH + "validation_error.txt", 'val',
                type='text',
                id='source_text',
                pad_on_batch=True,
                tokenization='tokenize_basic',
                fill='end',
                max_text_len=100,
                min_occ=1)

    """...and for the 'state_below' data. Note that:
    1) The offset flag is set to 1, which means that the text will be shifted
       one position to the right.
    2) At sampling time we won't have this input, so we 'hack' the dataset model
       by inserting an artificial input of type 'ghost' for the validation
       split."""

    ds.setInput(PATH + "train_correct.txt", 'train',
                type='text',
                id='state_below',
                required=False,
                tokenization='tokenize_basic',
                pad_on_batch=True,
                build_vocabulary='target_text',
                offset=1,
                fill='end',
                max_text_len=100,
                max_words=55000)
    ds.setInput(None, 'val',
                type='ghost',
                id='state_below',
                required=False)

    """We can also keep the literal source words (for replacing unknown words)."""

    for split, input_text_filename in zip(
            ['train', 'val'],
            [PATH + "train_error.txt", PATH + "validation_error.txt"]):
        ds.setRawInput(input_text_filename, split,
                       type='file-name',
                       id='raw_source_text',
                       overwrite_split=True)

    """We also need to match the references with the inputs. Since we only have
    one reference per input sample, we set `repeat=1`."""

    keep_n_captions(ds, repeat=1, n=1, set_names=['val'])

    """Finally, we can save our dataset instance for use in other experiments:"""

    saveDataset(ds, PATH + "dataset")

    """## 2. Creating and training a Neural Translation Model

    Now, we'll create and train a Neural Machine Translation (NMT) model. Since
    there is a significant number of hyperparameters, we'll use the default ones,
    specified in the `config.py` file. Note that almost every hardcoded parameter
    is automatically set from the config if we run `main.py`.

    We'll create an `'AttentionRNNEncoderDecoder'` (an LSTM encoder-decoder with
    an attention mechanism). Refer to the
    [`model_zoo.py`](https://github.com/lvapeab/nmt-keras/blob/master/nmt_keras/model_zoo.py)
    file for other models (e.g. Transformer). So first, let's import the model
    and the hyperparameters.
    We'll also load the dataset we stored in the previous section (not strictly
    necessary, since it is already in memory, but as a demonstration):"""

    params = load_parameters()
    dataset = loadDataset(PATH + "dataset/Dataset_tutorial_dataset.pkl")

    """Since the number of words in the dataset may be unknown beforehand, we
    must update the params information according to the dataset instance:"""

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['source_text']
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len['target_text']

    params['USE_CUDNN'] = use_gpu
    params['N_GPUS'] = 2
    params['MAX_EPOCH'] = 1000
    params['EARLY_STOP'] = True
    params['PATIENCE'] = 10
    params['SAVE_EACH_EVALUATION'] = True
    params['STORE_PATH'] = PATH + "model/"
    params['BATCH_SIZE'] = 128
    params['ATTENTION_MODE'] = "add"
    params['N_LAYERS_ENCODER'] = 2
    params['N_LAYERS_DECODER'] = 2
    params['SOURCE_TEXT_EMBEDDING_SIZE'] = 512
    params['TARGET_TEXT_EMBEDDING_SIZE'] = 512
    params['SKIP_VECTORS_HIDDEN_SIZE'] = 512
    params['ATTENTION_SIZE'] = 512
    params['ENCODER_HIDDEN_SIZE'] = 512
    params['DECODER_HIDDEN_SIZE'] = 512
    params['ENCODER_RNN_TYPE'] = "LSTM"
    params['DECODER_RNN_TYPE'] = "ConditionalLSTM"
    params['METRICS'] = ['coco']
    params['KERAS_METRICS'] = ['perplexity']
    params['APPLY_DETOKENIZATION'] = True
    params['LENGTH_PENALTY'] = True
    params['LENGTH_NORM_FACTOR'] = 1.0
    params['BEAM_SIZE'] = 1
    params['BEAM_SEARCH'] = True
    params['PLOT_EVALUATION'] = True
    params['MAX_PLOT_Y'] = 1.
    params['MODE'] = 'training'
    params['TENSORBOARD'] = True

    result = pyfiglet.figlet_format("START TRAINING FROM SCRATCH", font="digital")
    print(result)

    train_model(params,
                load_dataset=PATH + "dataset/Dataset_tutorial_dataset.pkl")
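# For reference, a hedged sketch of the imports the training scripts above rely
# on. The module paths follow the usual nmt-keras / multimodal-keras-wrapper
# layout and are assumptions, not copied from the original files:
import pyfiglet

from config import load_parameters                    # default hyperparameters
from data_engine.prepare_data import keep_n_captions  # reference bookkeeping
from keras_wrapper.dataset import Dataset, saveDataset, loadDataset
from nmt_keras.training import train_model            # training entry point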