def test_load_dataset():
    params = load_parameters()
    ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    # Check the number of vocabularies, not the dict_keys object itself
    assert len(ds.vocabulary.keys()) >= 3
    for voc in ds.vocabulary:
        assert len(ds.vocabulary[voc].keys()) == 2
def test_load_dataset(self):
    params = load_parameters()
    ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    self.assertIsInstance(ds, Dataset)
    self.assertIsInstance(ds.vocabulary, dict)
    # Compare the number of vocabularies, not the dict_keys object itself
    self.assertGreaterEqual(len(ds.vocabulary.keys()), 3)
    for voc in ds.vocabulary:
        self.assertEqual(len(ds.vocabulary[voc].keys()), 2)
def build_dataset(params): if params['REBUILD_DATASET']: # We build a new dataset instance if(params['VERBOSE'] > 0): silence=False logging.info('Building ' + params['DATASET_NAME'] + ' dataset') else: silence=True base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] ds = Dataset(name, base_path, silence=silence) ##### INPUT DATA # Let's load the associated images (inputs) num_cap = 1 # We only extract one feature vector per image list_train = base_path + '/' + params['IMG_FILES']['train'][0] list_val = base_path + '/' + params['IMG_FILES']['val'][0] list_test = base_path + '/' + params['IMG_FILES']['test'][0] ds.setInput(list_train, 'train', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], repeat_set=num_cap) ds.setInput(list_val, 'val', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], repeat_set=num_cap) ds.setInput(list_test, 'test', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], repeat_set=num_cap) ### IMAGES' associated IDs ds.setInput(base_path + '/' + params['IMG_FILES']['train'][1], 'train', type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=num_cap) ds.setInput(base_path + '/' + params['IMG_FILES']['val'][1], 'val', type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=num_cap) ds.setInput(base_path + '/' + params['IMG_FILES']['test'][1], 'test', type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=num_cap) # Train mean ds.setTrainMean(params['MEAN_IMAGE'], params['INPUTS_IDS_DATASET'][0]) ###### OUTPUT DATA: None # Process dataset for keeping only one caption per image and storing the rest in a dict() with the following format: # ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN] #keep_n_captions(ds, repeat=[1, 1], n=1, set_names=['val','test']) # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, params['DATASET_STORE_PATH']) else: # We can easily recover it with a single line ds = loadDataset(params['DATASET_STORE_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl') return ds
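# Usage sketch (not part of the original script): build the dataset once, then reload the
# stored pickle in later runs. Assumes the same `params` dict used by build_dataset(),
# e.g. obtained from config.load_parameters().
from config import load_parameters
from keras_wrapper.dataset import loadDataset

params = load_parameters()
ds = build_dataset(params)  # builds and saves, or reloads, depending on params['REBUILD_DATASET']

# In a later session, the stored Dataset can be recovered directly:
ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')
print(ds.ids_inputs)  # the image and image-id input identifiers registered above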
def loadMSVD(): logging.info('Loading MSVD dataset') # Build basic dataset structure # we assign it a name and the path were the images are stored base_path = '/media/HDD_2TB/DATASETS/MSVD/' name = 'MSVD_VideoDescription' ds = Dataset(name, base_path) max_text_len = 35 # Let's load the train, val and test splits of the descriptions (outputs) # the files include a description per line. In this dataset a variable number # of descriptions per video are provided. ds.setOutput(base_path + 'train_descriptions.txt', 'train', type='text', id='descriptions', tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len) ds.setOutput(base_path + 'val_descriptions.txt', 'val', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) ds.setOutput(base_path + 'test_descriptions.txt', 'test', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) # Let's load the associated videos (inputs) # we must take into account that in this dataset we have a different number of sentences per video, # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list # containing the number of captions in each video. num_captions_train = np.load(base_path + 'train_descriptions_counts.npy') num_captions_val = np.load(base_path + 'val_descriptions_counts.npy') num_captions_test = np.load(base_path + 'test_descriptions_counts.npy') ds.setInput([base_path + 'train_imgs_list.txt', base_path + 'train_imgs_counts.txt'], 'train', type='video', id='videos', repeat_set=num_captions_train) ds.setInput([base_path + 'val_imgs_list.txt', base_path + 'val_imgs_counts.txt'], 'val', type='video', id='videos', repeat_set=num_captions_val) ds.setInput([base_path + 'test_imgs_list.txt', base_path + 'test_imgs_counts.txt'], 'test', type='video', id='videos', repeat_set=num_captions_test) # Now let's set the dataset mean image for preprocessing the data ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], id='videos') # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, 'Datasets') # We can easily recover it with a single line ds = loadDataset('Datasets/Dataset_' + name + '.pkl') # Lets recover the first batch of data [X, Y] = ds.getXY('train', 10) logging.info('Sample data loaded correctly.')
def loadMSVD(): logging.info('Loading MSVD dataset') # Build basic dataset structure # we assign it a name and the path were the images are stored base_path = '/media/HDD_2TB/DATASETS/MSVD/' name = 'MSVD_VideoDescription' ds = Dataset(name, base_path) max_text_len = 35 # Let's load the train, val and test splits of the descriptions (outputs) # the files include a description per line. In this dataset a variable number # of descriptions per video are provided. ds.setOutput(base_path + 'train_descriptions.txt', 'train', type='text', id='descriptions', tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len) ds.setOutput(base_path + 'val_descriptions.txt', 'val', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) ds.setOutput(base_path + 'test_descriptions.txt', 'test', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) # Let's load the associated videos (inputs) # we must take into account that in this dataset we have a different number of sentences per video, # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list # containing the number of captions in each video. num_captions_train = np.load(base_path + 'train_descriptions_counts.npy') num_captions_val = np.load(base_path + 'val_descriptions_counts.npy') num_captions_test = np.load(base_path + 'test_descriptions_counts.npy') ds.setInput([base_path + 'train_imgs_list.txt', base_path + 'train_imgs_counts.txt'], 'train', type='video', id='videos', repeat_set=num_captions_train) ds.setInput([base_path + 'val_imgs_list.txt', base_path + 'val_imgs_counts.txt'], 'val', type='video', id='videos', repeat_set=num_captions_val) ds.setInput([base_path + 'test_imgs_list.txt', base_path + 'test_imgs_counts.txt'], 'test', type='video', id='videos', repeat_set=num_captions_test) # Now let's set the dataset mean image for preprocessing the data ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='videos') # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, 'Datasets') # We can easily recover it with a single line ds = loadDataset('Datasets/Dataset_' + name + '.pkl') # Lets recover the first batch of data [X, Y] = ds.getXY('train', 10) logging.info('Sample data loaded correctly.')
def test_load_dataset():
    params = load_parameters()
    ds = loadDataset(os.path.join('datasets',
                                  'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl'))
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    assert len(list(ds.vocabulary)) >= 3
    for voc in ds.vocabulary:
        assert len(list(ds.vocabulary[voc])) == 2
def score_corpus(args, params): print "Using an ensemble of %d models" % len(args.models) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) if args.source is not None: dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) for s in args.splits: # Apply model predictions params_prediction = {'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'predict_on_sets': [s]} if params['BEAM_SEARCH']: params_prediction['beam_size'] = params['BEAM_SIZE'] params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2) beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print scores
def loadFood101(): logging.info('Loading Food101 dataset') logging.info( 'INFO: in order to load this dataset it must be placed in ../data/Food101/images/ after downloading it form https://www.vision.ee.ethz.ch/datasets_extra/food-101/' ) base_path = '../data/Food101/' name = 'Food101' ds = Dataset(name, base_path + 'images') # Insert inputs (images) ds.setInput(base_path + 'meta/train_split.txt', 'train', type='image', id='images', img_size_crop=[227, 227, 3]) ds.setInput(base_path + 'meta/val_split.txt', 'val', type='image', id='images') ds.setInput(base_path + 'meta/test.txt', 'test', type='image', id='images') # Insert outputs (labels) ds.setOutput(base_path + 'meta/train_labels.txt', 'train', type='categorical', id='labels') ds.setOutput(base_path + 'meta/val_labels.txt', 'val', type='categorical', id='labels') ds.setOutput(base_path + 'meta/test_labels.txt', 'test', type='categorical', id='labels') # Set list of classes (strings) ds.setClasses(base_path + 'meta/classes.txt', 'labels') # Now let's set the dataset mean image for preprocessing the data ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images') # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, 'Datasets') # We can easily recover it with a single line ds = loadDataset('Datasets/Dataset_' + name + '.pkl') # Lets recover the first batch of data [X, Y] = ds.getXY('train', 10) logging.info('Sample data loaded correctly.')
def __init__(self):
    self.session = tf.Session()
    self.graph = tf.get_default_graph()
    with self.graph.as_default():
        with self.session.as_default():
            dataset = loadDataset("dataset/Dataset_tutorial_dataset.pkl")
            nmt_model = loadModel("", epoch_num)
            params = nmt_model.params

            # Define the inputs and outputs mapping from the Dataset instance to the model
            inputMapping = dict()
            for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = nmt_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
            nmt_model.setInputsMapping(inputMapping)

            outputMapping = dict()
            for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = nmt_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
            nmt_model.setOutputsMapping(outputMapping)

            params_prediction = {
                'language': 'en',
                'tokenize_f': eval('dataset.' + 'tokenize_basic'),
                'beam_size': 2,
                'optimized_search': True,
                'model_inputs': params['INPUTS_IDS_MODEL'],
                'model_outputs': params['OUTPUTS_IDS_MODEL'],
                'dataset_inputs': params['INPUTS_IDS_DATASET'],
                'dataset_outputs': params['OUTPUTS_IDS_DATASET'],
                'n_parallel_loaders': 1,
                'maxlen': 50,
                'normalize': True,
                'pos_unk': True,
                'heuristic': 0,
                'state_below_maxlen': -1,
                'length_norm_factor': 1.0,
                'length_penalty': True,
                'predict_on_sets': ['test'],
                'verbose': 0,
            }

            self.params = params
            self.dataset = dataset
            self.nmt_model = nmt_model
            self.params_prediction = params_prediction
def loadFlickr8k(): logging.info('Loading Flickr8k dataset') # Build basic dataset structure # we assign it a name and the path were the images are stored base_path = '/media/HDD_2TB/DATASETS/Flickr8k/' name = 'Flickr8k_ImageDescription' ds = Dataset(name, base_path + 'Flicker8k_Dataset') max_text_len = 35 # Let's load the train, val and test splits of the descriptions (outputs) # the files include a description per line # and a set of 5 consecutive descriptions correspond to a single input image ds.setOutput(base_path + 'text/train_descriptions.txt', 'train', type='text', id='descriptions', tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len) ds.setOutput(base_path + 'text/val_descriptions.txt', 'val', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) ds.setOutput(base_path + 'text/test_descriptions.txt', 'test', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) # Let's load the associated images (inputs) # we must take into account that in this dataset we have 5 sentences per image, # for this reason we introduce the parameter 'repeat_set'=5 ds.setInput(base_path + 'text/Flickr_8k.trainImages.txt', 'train', type='image', id='images', repeat_set=5) ds.setInput(base_path + 'text/Flickr_8k.devImages.txt', 'val', type='image', id='images', repeat_set=5) ds.setInput(base_path + 'text/Flickr_8k.testImages.txt', 'test', type='image', id='images', repeat_set=5) # Now let's set the dataset mean image for preprocessing the data ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], id='images') # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, 'Datasets') # We can easily recover it with a single line ds = loadDataset('Datasets/Dataset_' + name + '.pkl') # Lets recover the first batch of data [X, Y] = ds.getXY('train', 10) logging.info('Sample data loaded correctly.')
def loadFlickr8k(): logging.info('Loading Flickr8k dataset') # Build basic dataset structure # we assign it a name and the path were the images are stored base_path = '/media/HDD_2TB/DATASETS/Flickr8k/' name = 'Flickr8k_ImageDescription' ds = Dataset(name, base_path + 'Flicker8k_Dataset') max_text_len = 35 # Let's load the train, val and test splits of the descriptions (outputs) # the files include a description per line # and a set of 5 consecutive descriptions correspond to a single input image ds.setOutput(base_path + 'text/train_descriptions.txt', 'train', type='text', id='descriptions', tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len) ds.setOutput(base_path + 'text/val_descriptions.txt', 'val', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) ds.setOutput(base_path + 'text/test_descriptions.txt', 'test', type='text', id='descriptions', tokenization='tokenize_basic', max_text_len=max_text_len) # Let's load the associated images (inputs) # we must take into account that in this dataset we have 5 sentences per image, # for this reason we introduce the parameter 'repeat_set'=5 ds.setInput(base_path + 'text/Flickr_8k.trainImages.txt', 'train', type='image', id='images', repeat_set=5) ds.setInput(base_path + 'text/Flickr_8k.devImages.txt', 'val', type='image', id='images', repeat_set=5) ds.setInput(base_path + 'text/Flickr_8k.testImages.txt', 'test', type='image', id='images', repeat_set=5) # Now let's set the dataset mean image for preprocessing the data ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images') # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, 'Datasets') # We can easily recover it with a single line ds = loadDataset('Datasets/Dataset_' + name + '.pkl') # Lets recover the first batch of data [X, Y] = ds.getXY('train', 10) logging.info('Sample data loaded correctly. %d input samples. %d output samples' % (len(X), len(Y)))
def evaluate_from_file(args):
    """
    Evaluate translation hypotheses from a file or a list of files of references.
    :param args: Evaluation parameters
    :return: None
    """
    language = args.language
    hypotheses_file = codecs.open(args.hypotheses, 'r', encoding='utf-8')
    ds = loadDataset(args.dataset)
    references = ds.extra_variables[args.split][list(ds.extra_variables[args.split].keys())[0]]
    step_size = args.step_size
    ref, hypothesis = load_textfiles(references, hypotheses_file)
    if step_size < 1:
        score = CocoScore(ref, hypothesis, metrics_list=args.metrics, language=language)
        print("Scores: ")
        max_score_name_len = max([len(x) for x in list(score)])
        for score_name in sorted(list(score)):
            print("\t {0:{1}}".format(score_name, max_score_name_len) + ": %.5f" % score[score_name])
    else:
        n = 0
        while True:
            n += step_size
            indices = range(min(n, len(ref)))
            partial_refs = {}
            partial_hyps = {}
            for i in indices:
                partial_refs[i] = ref[i]
                partial_hyps[i] = hypothesis[i]
            score = CocoScore(partial_refs, partial_hyps, metrics_list=args.metrics, language=language)
            print(str(min(n, len(ref))) + " \tScore: ", score)
            if n > len(ref):
                break
    return
def get_model_predictions(asts_path):
    print("os.getcwd()", os.getcwd())
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    print("cur_dir", cur_dir)
    # if not os.path.isdir(os.path.join(os.getcwd(), 'keras')):
    #     print(subprocess.run(f'git clone https://github.com/MarcBS/keras.git', shell=True,
    #                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    #
    # nmt_keras_dir = os.path.join(os.getcwd, 'nmt-keras')
    # if not os.path.isdir(os.path.join(os.getcwd(), 'nmt-keras')):
    #     print(subprocess.run(f'git clone https://github.com/lvapeab/nmt-keras && cd "nmt-keras" && pipenv install -e .', shell=True,
    #                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    #     # print(subprocess.run(f'cd {nmt_keras_dir} && pipenv install -e .', shell=True,
    #     #                      stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    # print("ran cmds!!!")
    # sys.path.insert(0, os.path.join(os.getcwd(), 'nmt-keras'))
    # print("sys path!!!", sys.path)

    dataset = loadDataset(f'{cur_dir}/assets/epoch_{MODEL_EPOCH}_model_wrapper.pkl')

    with open(f'{cur_dir}/assets/params.json', 'r') as params_file:
        params = json.load(params_file)

    dataset.setInput(asts_path,
                     'test',
                     type='text',
                     id='source_text',
                     pad_on_batch=True,
                     tokenization=params['tokenize_x'],
                     fill='end',
                     max_text_len=params['x_max_text_len'],
                     min_occ=0)
    dataset.setInput(None,
                     'test',
                     type='ghost',
                     id='state_below',
                     required=False)
    dataset.setRawInput(asts_path,
                        'test',
                        type='file-name',
                        id='raw_source_text',
                        overwrite_split=True)

    nmt_model = loadModel(f'{cur_dir}/assets', MODEL_EPOCH)
    prediction_params = get_prediction_params()
    predictions = nmt_model.predictBeamSearchNet(dataset, prediction_params)['test']

    vocab = dataset.vocabulary['target_text']['idx2words']
    samples = predictions['samples']  # Get word indices from the samples.
    predictions = decode_predictions_beam_search(samples, vocab, verbose=params['VERBOSE'])
    return predictions
from keras_wrapper.model_ensemble import BeamSearchEnsemble
import os

"""## 3. Decoding with a trained Neural Machine Translation Model

Now, we'll load from disk the model we just trained and we'll apply it for translating new text.
In this case, we want to translate the 'test' split from our dataset.

Since we want to translate a new data split ('test'), we must add it to the dataset instance,
just as we did before (in the first tutorial). If we also had the references of the test split
and we wanted to evaluate the translations, we could add them to the dataset as well.
Note that this is not mandatory and we could just predict without evaluating.
"""

MODEL_PATH1 = os.path.join(os.getcwd(), 'models/empathy_100_hidden')
MODEL_PATH2 = os.path.join(os.getcwd(), 'models/persona_chat_lstm')
epoch_choice1 = 6
epoch_choice2 = 8

dataset1 = loadDataset(os.path.join(MODEL_PATH1, "dataset/Dataset_tutorial_dataset.pkl"))
dataset2 = loadDataset(os.path.join(MODEL_PATH1, "dataset/Dataset_tutorial_dataset.pkl"))
dataset2 = update_dataset_from_file(dataset2, args.text, params, splits=args.splits, remove_outputs=True)

# Load models
nmt_model1 = loadModel(MODEL_PATH1, epoch_choice1)
nmt_model2 = loadModel(MODEL_PATH2, epoch_choice2)
params = nmt_model1.params

# Define the inputs and outputs mapping from our Dataset instance to our model
inputMapping = dict()
for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
    pos_source = dataset.ids_inputs.index(id_in)
def build_dataset(params): if params['REBUILD_DATASET']: # We build a new dataset instance if(params['VERBOSE'] > 0): silence=False logging.info('Building ' + params['DATASET_NAME'] + ' dataset') else: silence=True base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] ds = Dataset(name, base_path, silence=silence) max_text_len = params['MAX_INPUT_TEXT_LEN'] ##### INPUT DATA ### QUESTIONS ds.setInput(base_path+'/'+params['QST_FILES']['train'][0], 'train', type='text', id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'], build_vocabulary=True, fill=params['FILL'], max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'], repeat_set=params['REPEAT_QST']) ds.setInput(base_path+'/'+params['QST_FILES']['val'][0], 'val', type='text', id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'], max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'], repeat_set=params['REPEAT_QST']) ds.setInput(base_path+'/'+params['QST_FILES']['test'][0], 'test', type='text', id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'], max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'], repeat_set=params['REPEAT_QST']) ### QUESTIONS' associated IDs ds.setInput(base_path+'/'+params['QST_FILES']['train'][1], 'train', type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids', repeat_set=params['REPEAT_QST']) ds.setInput(base_path+'/'+params['QST_FILES']['val'][1], 'val', type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids', repeat_set=params['REPEAT_QST']) ds.setInput(base_path+'/'+params['QST_FILES']['test'][1], 'test', type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids', repeat_set=params['REPEAT_QST']) ### IMAGES ds.setInput(base_path+'/'+params['IMG_FILES']['train'][0], 'train', type='image-features', id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'], repeat_set=params['REPEAT_IMG']) ds.setInput(base_path+'/'+params['IMG_FILES']['val'][0], 'val', type='image-features', id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'], repeat_set=params['REPEAT_IMG']) ds.setInput(base_path+'/'+params['IMG_FILES']['test'][0], 'test', type='image-features', id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'], repeat_set=params['REPEAT_IMG']) ### IMAGES' associated IDs ds.setInput(base_path+'/'+params['IMG_FILES']['train'][1], 'train', type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids', repeat_set=params['REPEAT_IMG']) ds.setInput(base_path+'/'+params['IMG_FILES']['val'][1], 'val', type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids', repeat_set=params['REPEAT_IMG']) ds.setInput(base_path+'/'+params['IMG_FILES']['test'][1], 'test', type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids', repeat_set=params['REPEAT_IMG']) ##### OUTPUT DATA ### ANSWERS ds.setOutput(base_path+'/'+params['ANS_FILES']['train'][0], 'train', type='text', id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'], build_vocabulary=True, fill=params['FILL'], max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE']) ds.setOutput(base_path+'/'+params['ANS_FILES']['val'][0], 'val', type='text', id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'], max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE']) if 'test' in params['ANS_FILES']: 
            ds.setOutput(base_path+'/'+params['ANS_FILES']['test'][0], 'test',
                         type='text', id=params['OUTPUTS_IDS_DATASET'][0],
                         tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                         max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'])

        # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
        ds.extra_variables['train'] = dict()
        ds.extra_variables['val'] = dict()
        ds.extra_variables['test'] = dict()

        ds.extra_variables['train']['quesFile'] = base_path+'/'+params['QST_FILES']['train'][2]
        ds.extra_variables['val']['quesFile'] = base_path+'/'+params['QST_FILES']['val'][2]
        ds.extra_variables['test']['quesFile'] = base_path+'/'+params['QST_FILES']['test'][2]

        ds.extra_variables['train']['annFile'] = base_path+'/'+params['ANS_FILES']['train'][1]
        ds.extra_variables['val']['annFile'] = base_path+'/'+params['ANS_FILES']['val'][1]
        if 'test' in params['ANS_FILES']:
            ds.extra_variables['test']['annFile'] = base_path+'/'+params['ANS_FILES']['test'][1]

        # Remove all samples of the train set not belonging to the top classes chosen
        if params['KEEP_TOP_ANSWERS']:
            ds.keepTopOutputs('train', params['OUTPUTS_IDS_DATASET'][0], params['OUTPUT_VOCABULARY_SIZE'])
        # Filter top K answers per question-image pair
        if params['FILTER_ANSWERS']:
            filter_k_frequent_answers(ds, params)

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATA_ROOT_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATA_ROOT_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl')

    return ds
def classifyFood101(): from keras_wrapper.cnn_model import CNN_Model, loadModel, saveModel logging.info('Defining CNN model and training it.') # Load food classification dataset dataset_name = 'Food101' ds = loadDataset('Datasets/Dataset_' + dataset_name + '.pkl') # The network we are going to use needs an image input size of [224,224,3] # for this reason we have to communicate this to the dataset instance in charge of loading the data ds.img_size_crop['images'] = [224, 224, 3] # Create VGG model and load weights model_name = 'VGG_16_FunctionalAPI' net = CNN_Model( type='VGG_16_FunctionalAPI', model_name=model_name, input_shape=[224, 224, 3], weights_path='/media/HDD_2TB/CNN_MODELS/VGG/vgg16_weights.h5', seq_to_functional=True ) # we are setting the weights of a Sequential model into a FunctionalAPI one # Reformat net output layer for the number of classes in our dataset n_classes = len(ds.classes['labels']) vis_input = net.model.get_layer('vis_input').output # input layer drop = net.model.get_layer('last_dropout').output # layer before final FC output = Dense(n_classes, activation='softmax', name='output')(drop) # redefine FC-softmax layer net.model = Model(input=vis_input, output=output) # define inputs and outputs # Compile net.setOptimizer(lr=0.001, metrics=['accuracy']) # Define the inputs and outputs mapping from our Dataset instance to our CNN_Model instance # set input and output mappings from dataset to network pos_images = ds.types_inputs.index('image') pos_labels = ds.types_outputs.index('categorical') # the first input of our dataset (pos_images) will also be the first input of our model (named vis_input) inputMapping = {'vis_input': pos_images} net.setInputsMapping(inputMapping) # the first output of our dataset (pos_labels) will also be the first output of our model (named output) outputMapping = {'output': pos_labels} net.setOutputsMapping(outputMapping, acc_output='output') # Save model saveModel(net, 0) # Load model net = loadModel('Models/' + model_name, 0) # the model must be compiled again when loaded net.setOptimizer(lr=0.001, metrics=['accuracy']) # Apply short training (1 epoch) # training_params = {'n_epochs': 1, 'batch_size': 50, # 'lr_decay': 2, 'lr_gamma': 0.8, # 'epochs_for_save': 1, 'verbose': 1, 'eval_on_sets': ['val']} # net.trainNet(ds, training_params) # Test network on test set test_params = {'batch_size': 50} # net.testNet(ds, test_params) # Predict network on all sets test_params['predict_on_sets'] = ['val'] predictions = net.predictNet(ds, test_params) logging.info("Predicted %d samples." % (len(predictions))) logging.info("Done")
def score_corpus(args, params): """ Use one or several translation models for scoring source--target pairs- :param argparse.Namespace args: Arguments given to the method: * dataset: Dataset instance with data. * source: Text file with source sentences. * target: Text file with target sentences. * splits: Splits to sample. Should be already included in the dataset object. * dest: Output file to save scores. * weights: Weight given to each model in the ensemble. You should provide the same number of weights than models. By default, it applies the same weight to each model (1/N). * verbose: Be verbose or not. * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py. * models: Path to the models. :param dict params: parameters of the translation model. """ from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.dataset import loadDataset from keras_wrapper.cnn_model import loadModel from keras_wrapper.model_ensemble import BeamSearchEnsemble logging.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits, output_text_filename=args.target, compute_state_below=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Apply scoring extra_vars = dict() extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD']) model_weights = args.weights if model_weights is not None and model_weights != []: assert len(model_weights) == len( models ), 'You should give a weight to each model. You gave %d models and %d weights.' 
% ( len(models), len(model_weights)) model_weights = map(float, model_weights) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction = { 'max_batch_size': params['BATCH_SIZE'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'predict_on_sets': [s] } if params['BEAM_SEARCH']: params_prediction['beam_size'] = params['BEAM_SIZE'] params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST'] params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params[ 'OUTPUTS_IDS_DATASET'] params_prediction['normalize_probs'] = params.get( 'NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get( 'COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get( 'LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction[ 'output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction[ 'output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print(scores)
def build_dataset(params): if params['REBUILD_DATASET']: base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] ds = Dataset(name, base_path, silence=False) # INPUT DATA ds.setInput(base_path + '/' + params['DISHES_FILES']['train'], 'train', type='text', id=params['INPUTS_IDS_DATASET'][1], build_vocabulary=True, tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'], pad_on_batch=True, max_text_len=params['MAX_OUTPUT_TEXT_LEN'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ds.setInput(base_path + '/' + params['DISHES_FILES']['val'], 'val', type='text', id=params['INPUTS_IDS_DATASET'][1], build_vocabulary=True, pad_on_batch=True, tokenization=params['TOKENIZATION_METHOD'], max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ds.setInput(base_path + '/' + params['DISHES_FILES']['test'], 'test', type='text', id=params['INPUTS_IDS_DATASET'][1], build_vocabulary=True, pad_on_batch=True, tokenization=params['TOKENIZATION_METHOD'], max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], min_occ=params['MIN_OCCURRENCES_VOCAB']) # INPUT DATA ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['train'], 'train', type='image-features', id=params['INPUTS_IDS_DATASET'][0], feat_len=params['IMG_FEAT_SIZE']) ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['val'], 'val', type='image-features', id=params['INPUTS_IDS_DATASET'][0], feat_len=params['IMG_FEAT_SIZE']) ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['test'], 'test', type='image-features', id=params['INPUTS_IDS_DATASET'][0], feat_len=params['IMG_FEAT_SIZE']) # INPUT DATA ds.setInput(base_path + '/' + params['CNN_FILES']['train'], 'train', type='image-features', id=params['INPUTS_IDS_DATASET'][2], feat_len=params['CNN_SIZE']) ds.setInput(base_path + '/' + params['CNN_FILES']['val'], 'val', type='image-features', id=params['INPUTS_IDS_DATASET'][2], feat_len=params['CNN_SIZE']) ds.setInput(base_path + '/' + params['CNN_FILES']['test'], 'test', type='image-features', id=params['INPUTS_IDS_DATASET'][2], feat_len=params['CNN_SIZE']) # OUTPUT DATA if "sample_weight" not in params or params['sample_weight']: ds.setOutput(base_path + '/' + params['OUT_FILES']['train'], 'train', type='real', id=params['OUTPUTS_IDS_DATASET'][0], sample_weights=np.load(Path.DATA_FOLDER + "/data/weights.npy")) else: ds.setOutput(base_path + '/' + params['OUT_FILES']['train'], 'train', type='real', id=params['OUTPUTS_IDS_DATASET'][0]) ds.setOutput(base_path + '/' + params['OUT_FILES']['val'], 'val', type='real', id=params['OUTPUTS_IDS_DATASET'][0]) ds.setOutput(base_path + '/' + params['OUT_FILES']['test'], 'test', type='real', id=params['OUTPUTS_IDS_DATASET'][0]) # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, params['DATASET_STORE_PATH']) else: # We can easily recover it with a single line ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl') return ds
import sys
sys.path.append('../nmt-keras')
sys.path.append('../nmt-keras/nmt_keras')

import utils
from config import load_parameters
from data_engine.prepare_data import keep_n_captions
from keras_wrapper.cnn_model import loadModel
from keras_wrapper.dataset import loadDataset
from keras_wrapper.utils import decode_predictions_beam_search
from model_zoo import TranslationModel

params = load_parameters()
dataset = loadDataset('query_to_reply/Dataset_Cornell_base.pkl')

dataset.setInput('data/Ross_test.query',
                 'test',
                 type='text',
                 id='source_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 fill='end',
                 max_text_len=100,
                 min_occ=0)
dataset.setInput(None,
                 'test',
                 type='ghost',
                 id='state_below',
                 required=False)

## get model predictions
params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
def train_model(params, load_dataset=None): """ Training function. Sets the training parameters from params. Build or loads the model and launches the training. :param params: Dictionary of network hyperparameters. :param load_dataset: Load dataset from file or build it from the parameters. :return: None """ check_params(params) if params['RELOAD'] > 0: logging.info('Resuming training.') # Load data if load_dataset is None: if params['REBUILD_DATASET']: logging.info('Rebuilding dataset.') dataset = build_dataset(params) else: logging.info('Updating dataset.') dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl') params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \ int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train) for split, filename in params['TEXT_FILES'].iteritems(): dataset = update_dataset_from_file( dataset, params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'], params, splits=list([split]), output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'], remove_outputs=False, compute_state_below=True, recompute_references=True) dataset.name = params['DATASET_NAME'] + '_' + params[ 'SRC_LAN'] + params['TRG_LAN'] saveDataset(dataset, params['DATASET_STORE_PATH']) else: logging.info('Reloading and using dataset.') dataset = loadDataset(load_dataset) else: # Load data if load_dataset is None: dataset = build_dataset(params) else: dataset = loadDataset(load_dataset) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Build model set_optimizer = True if params['RELOAD'] == 0 else False clear_dirs = True if params['RELOAD'] == 0 else False # build new model nmt_model = TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'], vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=set_optimizer, clear_dirs=clear_dirs) # Define the inputs and outputs mapping from our Dataset instance to our model inputMapping = dict() for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): pos_source = dataset.ids_inputs.index(id_in) id_dest = nmt_model.ids_inputs[i] inputMapping[id_dest] = pos_source nmt_model.setInputsMapping(inputMapping) outputMapping = dict() for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']): pos_target = dataset.ids_outputs.index(id_out) id_dest = nmt_model.ids_outputs[i] outputMapping[id_dest] = pos_target nmt_model.setOutputsMapping(outputMapping) if params['RELOAD'] > 0: nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) nmt_model.setParams(params) nmt_model.setOptimizer() if params.get('EPOCH_OFFSET') is None: params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \ int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train) # Store configuration as pkl dict2pkl(params, params['STORE_PATH'] + '/config') # Callbacks callbacks = buildCallbacks(params, nmt_model, dataset) # Training total_start_time = timer() logger.debug('Starting training!') training_params = { 'n_epochs': params['MAX_EPOCH'], 'batch_size': params['BATCH_SIZE'], 'homogeneous_batches': params['HOMOGENEOUS_BATCHES'], 'maxlen': params['MAX_OUTPUT_TEXT_LEN'], 'joint_batches': params['JOINT_BATCHES'], 'lr_decay': params.get('LR_DECAY', None), # LR decay 
parameters 'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True), 'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0), 'lr_gamma': params.get('LR_GAMMA', 0.9), 'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'), 'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0), 'lr_half_life': params.get('LR_HALF_LIFE', 50000), 'epochs_for_save': params['EPOCHS_FOR_SAVE'], 'verbose': params['VERBOSE'], 'eval_on_sets': params['EVAL_ON_SETS_KERAS'], 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'extra_callbacks': callbacks, 'reload_epoch': params['RELOAD'], 'epoch_offset': params.get('EPOCH_OFFSET', 0), 'data_augmentation': params['DATA_AUGMENTATION'], 'patience': params.get('PATIENCE', 0), # early stopping parameters 'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None, 'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': params.get('EVAL_EACH', 1), 'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0), 'tensorboard': params.get('TENSORBOARD', False), 'tensorboard_params': { 'log_dir': params.get('LOG_DIR', 'tensorboard_logs'), 'histogram_freq': params.get('HISTOGRAM_FREQ', 0), 'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']), 'write_graph': params.get('WRITE_GRAPH', True), 'write_grads': params.get('WRITE_GRADS', False), 'write_images': params.get('WRITE_IMAGES', False), 'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0), 'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None), 'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None), 'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False), 'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None), } } nmt_model.trainNet(dataset, training_params) total_end_time = timer() time_difference = total_end_time - total_start_time logging.info('In total is {0:.2f}s = {1:.2f}m'.format( time_difference, time_difference / 60.0))
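# Usage sketch (hypothetical entry point, not part of the original module): train_model()
# either rebuilds the dataset from `params` or reuses a stored Dataset pickle passed via
# the `load_dataset` argument. Assumes load_parameters() comes from config, as elsewhere here.
if __name__ == '__main__':
    from config import load_parameters
    params = load_parameters()
    train_model(params)
    # Alternatively, reuse an already-built dataset (example path following the naming above):
    # train_model(params, load_dataset=params['DATASET_STORE_PATH'] + '/Dataset_' +
    #             params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')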
from keras_wrapper.utils import decode_predictions_beam_search
from config import load_parameters
from keras_wrapper.extra.read_write import list2file
import os
import re

"""## 3. Decoding with a trained Neural Machine Translation Model

Now, we'll load from disk the model we just trained and we'll apply it for translating new text.
In this case, we want to translate the 'test' split from our dataset.

Since we want to translate a new data split ('test'), we must add it to the dataset instance,
just as we did before (in the first tutorial); a short sketch of this step follows this excerpt.
If we also had the references of the test split and we wanted to evaluate the translations,
we could add them to the dataset as well. Note that this is not mandatory and we could just
predict without evaluating.
"""

DATA_PATH = os.path.join(os.getcwd(), 'data/PersonaChat/')
MODEL_PATH = os.path.join(os.getcwd(), 'models/persona_chat_context_lstm_13_de_layers')

dataset = loadDataset(os.path.join(MODEL_PATH, "dataset/Dataset_tutorial_dataset.pkl"))
epoch_choice = 17

# Load model
nmt_model = loadModel(MODEL_PATH, epoch_choice)

params = load_parameters()
params_prediction = {
    'language': 'en',
    'tokenize_f': eval('dataset.' + 'tokenize_basic'),
    'beam_size': 6,
    'optimized_search': True,
    'model_inputs': params['INPUTS_IDS_MODEL'],
    'model_outputs': params['OUTPUTS_IDS_MODEL'],
    'dataset_inputs': params['INPUTS_IDS_DATASET'],
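# Sketch of the step described in the note above (assumed file name and input ids): register
# the text to translate as the 'test' split of the loaded dataset before decoding.
# 'test_source.txt' is a placeholder path; 'source_text'/'state_below' are the input ids
# used by the tutorial dataset.
dataset.setInput(os.path.join(DATA_PATH, 'test_source.txt'),
                 'test',
                 type='text',
                 id='source_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 fill='end',
                 max_text_len=100,
                 min_occ=0)
# 'state_below' is only required at training time, so a ghost input is enough when predicting.
dataset.setInput(None,
                 'test',
                 type='ghost',
                 id='state_below',
                 required=False)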
def sample_ensemble(args, params): from data_engine.prepare_data import update_dataset_from_file from keras_wrapper.model_ensemble import BeamSearchEnsemble from keras_wrapper.cnn_model import loadModel from keras_wrapper.dataset import loadDataset from keras_wrapper.utils import decode_predictions_beam_search logging.info("Using an ensemble of %d models" % len(args.models)) models = [loadModel(m, -1, full_path=True) for m in args.models] dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # For converting predictions into sentences index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] if params.get('APPLY_DETOKENIZATION', False): detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD']) params_prediction = dict() params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20) params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1) params_prediction['beam_size'] = params.get('BEAM_SIZE', 6) params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100) params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False) params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) heuristic = params.get('HEURISTIC', 0) mapping = None if dataset.mapping == dict() else dataset.mapping model_weights = args.weights if model_weights is not None and model_weights != []: assert len(model_weights) == len( models ), 'You should give a weight to each model. You gave %d models and %d weights.' 
% ( len(models), len(model_weights)) model_weights = map(lambda x: float(x), model_weights) if len(model_weights) > 1: logger.info('Giving the following weights to each model: %s' % str(model_weights)) for s in args.splits: # Apply model predictions params_prediction['predict_on_sets'] = [s] beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, model_weights=model_weights, n_best=args.n_best, verbose=args.verbose) if args.n_best: predictions, n_best = beam_searcher.predictBeamSearchNet()[s] else: predictions = beam_searcher.predictBeamSearchNet()[s] n_best = None if params_prediction['pos_unk']: samples = predictions[0] alphas = predictions[1] sources = [ x.strip() for x in open(args.text, 'r').read().split('\n') ] sources = sources[:-1] if len(sources[-1]) == 0 else sources else: samples = predictions alphas = None heuristic = None sources = None predictions = decode_predictions_beam_search(samples, index2word_y, alphas=alphas, x_text=sources, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): predictions = map(detokenize_function, predictions) if args.n_best: n_best_predictions = [] for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(n_best): n_best_sample_score = [] for n_best_pred, n_best_score, n_best_alpha in zip( n_best_preds, n_best_scores, n_best_alphas): pred = decode_predictions_beam_search( [n_best_pred], index2word_y, alphas=[n_best_alpha] if params_prediction['pos_unk'] else None, x_text=[sources[i]] if params_prediction['pos_unk'] else None, heuristic=heuristic, mapping=mapping, verbose=args.verbose) # Apply detokenization function if needed if params.get('APPLY_DETOKENIZATION', False): pred = map(detokenize_function, pred) n_best_sample_score.append([i, pred, n_best_score]) n_best_predictions.append(n_best_sample_score) # Store result if args.dest is not None: filepath = args.dest # results file if params.get('SAMPLING_SAVE_MODE', 'list'): list2file(filepath, predictions) if args.n_best: nbest2file(filepath + '.nbest', n_best_predictions) else: raise Exception( 'Only "list" is allowed in "SAMPLING_SAVE_MODE"') else: list2stdout(predictions) if args.n_best: logging.info('Storing n-best sentences in ./' + s + '.nbest') nbest2file('./' + s + '.nbest', n_best_predictions) logging.info('Sampling finished')
def main(): args = parse_args() server_address = ('', args.port) httpd = BaseHTTPServer.HTTPServer(server_address, NMTHandler) if args.config is None: logging.info("Reading parameters from config.py") from config import load_parameters params = load_parameters() else: logging.info("Loading parameters from %s" % str(args.config)) params = pkl2dict(args.config) try: for arg in args.changes: try: k, v = arg.split('=') except ValueError: print 'Overwritten arguments must have the form key=Value. \n Currently are: %s' % str( args.changes) exit(1) try: params[k] = ast.literal_eval(v) except ValueError: params[k] = v except ValueError: print 'Error processing arguments: (', k, ",", v, ")" exit(2) dataset = loadDataset(args.dataset) # For converting predictions into sentences # Dataset backwards compatibility bpe_separator = dataset.BPE_separator if hasattr( dataset, "BPE_separator") and dataset.BPE_separator is not None else '@@' # Build BPE tokenizer if necessary if 'bpe' in params['TOKENIZATION_METHOD'].lower(): logger.info('Building BPE') if not dataset.BPE_built: dataset.build_bpe( params.get('BPE_CODES_PATH', params['DATA_ROOT_PATH'] + '/training_codes.joint'), bpe_separator) # Build tokenization function tokenize_f = eval('dataset.' + params.get('TOKENIZATION_METHOD', 'tokenize_none')) detokenize_function = eval( 'dataset.' + params.get('DETOKENIZATION_METHOD', 'detokenize_none')) dataset.build_moses_tokenizer(language=params['SRC_LAN']) dataset.build_moses_detokenizer(language=params['TRG_LAN']) tokenize_general = dataset.tokenize_moses detokenize_general = dataset.detokenize_moses params_prediction = dict() params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20) params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1) params_prediction['beam_size'] = params.get('BEAM_SIZE', 6) params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100) params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False) params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False) params_prediction['length_norm_factor'] = params.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = params.get('POS_UNK', False) params_prediction['heuristic'] = params.get('HEURISTIC', 0) params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) params_prediction['output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) params_prediction['output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) # Manage pos_unk strategies if params['POS_UNK']: mapping = None if dataset.mapping == dict() else dataset.mapping else: mapping = None 
if args.online: logging.info('Loading models from %s' % str(args.models)) model_instances = [ TranslationModel(params, model_type=params['MODEL_TYPE'], verbose=params['VERBOSE'], model_name=params['MODEL_NAME'] + '_' + str(i), vocabularies=dataset.vocabulary, store_path=params['STORE_PATH'], set_optimizer=False) for i in range(len(args.models)) ] models = [ updateModel(model, path, -1, full_path=True) for (model, path) in zip(model_instances, args.models) ] # Set additional inputs to models if using a custom loss function params['USE_CUSTOM_LOSS'] = True if 'PAS' in params[ 'OPTIMIZER'] else False if params['N_BEST_OPTIMIZER']: logging.info('Using N-best optimizer') models = build_online_models(models, params) online_trainer = OnlineTrainer(models, dataset, None, None, params_training, verbose=args.verbose) else: models = [loadModel(m, -1, full_path=True) for m in args.models] params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Get word2index and index2word dictionaries index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] word2index_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['words2idx'] index2word_x = dataset.vocabulary[params['INPUTS_IDS_DATASET'] [0]]['idx2words'] word2index_x = dataset.vocabulary[params['INPUTS_IDS_DATASET'] [0]]['words2idx'] excluded_words = None interactive_beam_searcher = NMTSampler(models, dataset, params_prediction, tokenize_f, detokenize_function, tokenize_general, detokenize_general, mapping=mapping, word2index_x=word2index_x, word2index_y=word2index_y, index2word_y=index2word_y, excluded_words=excluded_words, verbose=args.verbose) # Compile Theano sampling function by generating a fake sample # TODO: Find a better way of doing this print "Compiling sampler..." interactive_beam_searcher.generate_sample('i') httpd.sampler = interactive_beam_searcher print 'Server starting at localhost:' + str(args.port) httpd.serve_forever()
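# The `key=value` override loop in the entry point above is a reusable pattern. Below is a
# minimal, self-contained sketch of the same idea as a helper; the function name
# `apply_cli_overrides` is illustrative and not part of the original code.
import ast

def apply_cli_overrides(params, changes):
    """Apply a list of 'KEY=value' strings onto a parameter dict (sketch)."""
    for change in changes:
        try:
            k, v = change.split('=')
        except ValueError:
            raise ValueError('Overwritten arguments must have the form key=Value. Got: %s' % change)
        try:
            params[k] = ast.literal_eval(v)  # numbers, lists, booleans, ...
        except (ValueError, SyntaxError):
            params[k] = v  # plain string
    return params

# Example: apply_cli_overrides(params, ['BEAM_SIZE=12', 'SRC_LAN=en'])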
def build_dataset(params): if params['REBUILD_DATASET']: # We build a new dataset instance if (params['VERBOSE'] > 0): silence = False logging.info('Building ' + params['DATASET_NAME'] + ' dataset') else: silence = True base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] ds = Dataset(name, base_path, silence=silence) ##### OUTPUT DATA # Let's load the train, val and test splits of the descriptions (outputs) # the files include a description per line. In this dataset a variable number # of descriptions per video are provided. ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 'train', type='text', id=params['OUTPUTS_IDS_DATASET'][0], build_vocabulary=True, tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'], pad_on_batch=True, max_text_len=params['MAX_OUTPUT_TEXT_LEN'], sample_weights=params['SAMPLE_WEIGHTS'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['val'], 'val', type='text', id=params['OUTPUTS_IDS_DATASET'][0], build_vocabulary=True, pad_on_batch=True, tokenization=params['TOKENIZATION_METHOD'], sample_weights=params['SAMPLE_WEIGHTS'], max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['test'], 'test', type='text', id=params['OUTPUTS_IDS_DATASET'][0], build_vocabulary=True, pad_on_batch=True, tokenization=params['TOKENIZATION_METHOD'], sample_weights=params['SAMPLE_WEIGHTS'], max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ##### INPUT DATA # Let's load the associated videos (inputs) # we must take into account that in this dataset we have a different number of sentences per video, # for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list # containing the number of captions in each video. 
num_captions_train = np.load( base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train']) num_captions_val = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['val']) num_captions_test = np.load(base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['test']) for feat_type in params['FEATURE_NAMES']: for split, num_cap in zip( ['train', 'val', 'test'], [num_captions_train, num_captions_val, num_captions_test]): list_files = base_path + '/' + params['FRAMES_LIST_FILES'][ split] % feat_type counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][ split] % feat_type ds.setInput([list_files, counts_files], split, type=params['INPUT_DATA_TYPE'], id=params['INPUTS_IDS_DATASET'][0], repeat_set=num_cap, max_video_len=params['NUM_FRAMES'], feat_len=params['IMG_FEAT_SIZE']) if len(params['INPUTS_IDS_DATASET']) > 1: ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'], 'train', type='text', id=params['INPUTS_IDS_DATASET'][-1], required=False, tokenization=params['TOKENIZATION_METHOD'], pad_on_batch=True, build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], offset=1, fill=params['FILL'], max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'], min_occ=params['MIN_OCCURRENCES_VOCAB']) ds.setInput(None, 'val', type='ghost', id=params['INPUTS_IDS_DATASET'][-1], required=False) ds.setInput(None, 'test', type='ghost', id=params['INPUTS_IDS_DATASET'][-1], required=False) # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format: # ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN] keep_n_captions(ds, repeat=[num_captions_val, num_captions_test], n=1, set_names=['val', 'test']) # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, params['DATASET_STORE_PATH']) else: # We can easily recover it with a single line ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl') return ds
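# How `repeat_set` pairs a variable number of captions with each video above: the i-th video is
# repeated num_captions[i] times so that video inputs and caption outputs line up one-to-one.
# A tiny illustration with made-up counts (not data from the dataset):
import numpy as np

num_captions = np.array([2, 3, 1])                                     # captions per video
video_indices = np.repeat(np.arange(len(num_captions)), num_captions)
print(video_indices)                                                   # -> [0 0 1 1 1 2]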
def build_dataset(params):
    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH'] + '/'
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        ##### OUTPUT DATA
        # Let's load the train, val and test splits of the class labels (outputs)
        # the files include one categorical label per sample.
        print(params['CLASS_FILES'])
        for split in params['CLASS_FILES'].keys():
            ds.setOutput(params['CLASS_FILES'][split], split, type='categorical',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         sample_weights=params['SAMPLE_WEIGHTS'])

        # INPUT DATA
        # The vocabulary is built on the 'train' split only.
        for split in params['TEXT_FILES'].keys():
            if split == 'train':
                build_vocabulary = True
            else:
                build_vocabulary = False
            for i in range(len(params['INPUTS_IDS_DATASET'])):
                ds.setInput(params['TEXT_FILES'][split][i], split, type='text',
                            id=params['INPUTS_IDS_DATASET'][i],
                            pad_on_batch=params['PAD_ON_BATCH'],
                            tokenization=params['TOKENIZATION_METHOD'],
                            build_vocabulary=build_vocabulary,
                            fill=params['FILL'],
                            max_text_len=params['MAX_INPUT_TEXT_LEN'],
                            max_words=params['INPUT_VOCABULARY_SIZE'],
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])

        for i in range(len(params['INPUTS_IDS_DATASET'])):
            if 'semisupervised' in params['MODE']:
                ds.setInput(params['POOL_FILENAME'][i], 'test', type='text',
                            id=params['INPUTS_IDS_DATASET'][i],
                            pad_on_batch=params['PAD_ON_BATCH'],
                            tokenization=params['TOKENIZATION_METHOD'],
                            fill=params['FILL'],
                            max_text_len=params['MAX_INPUT_TEXT_LEN'],
                            max_words=params['INPUT_VOCABULARY_SIZE'],
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])

        keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')

    return ds
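# A minimal usage sketch for the classification build_dataset above, assuming load_parameters()
# returns a dict with the keys this function reads (REBUILD_DATASET, CLASS_FILES, TEXT_FILES, ...):
if __name__ == '__main__':
    params = load_parameters()
    ds = build_dataset(params)
    # Inspect the size of the text vocabulary built on the 'train' split
    print(ds.vocabulary_len[params['INPUTS_IDS_DATASET'][0]])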
def build_dataset(params): if params['REBUILD_DATASET']: # We build a new dataset instance if (params['VERBOSE'] > 0): silence = False logging.info('Building ' + params['DATASET_NAME'] + ' dataset') else: silence = True base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] ds = Dataset(name, base_path, silence=silence) ##### INPUT DATA # Let's load the images (inputs) ### IMAGES list_train = base_path + '/' + params['IMG_FILES']['train'][0] ds.setInput(list_train, 'train', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], use_RGB=params['RGB']) if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']: list_val = base_path + '/' + params['IMG_FILES']['val'][0] ds.setInput(list_val, 'val', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], use_RGB=params['RGB']) if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']: list_test = base_path + '/' + params['IMG_FILES']['test'][0] ds.setInput(list_test, 'test', type='raw-image', id=params['INPUTS_IDS_DATASET'][0], img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'], use_RGB=params['RGB']) # Train mean if params['MEAN_IMAGE']: # if params['NORMALIZE']: # params['MEAN_IMAGE'] = [m / 255. for m in params['MEAN_IMAGE']] ds.setTrainMean(params['MEAN_IMAGE'], params['INPUTS_IDS_DATASET'][0]) else: ds.calculateTrainMean(params['INPUTS_IDS_DATASET'][0]) ##### OUTPUT DATA if params['TYPE_OUT'] == '3DLabel': # Set list of classes (strings) ds.setClasses(base_path + '/' + params['CLASSES_PATH'], params['OUTPUTS_IDS_DATASET'][0]) elif params['TYPE_OUT'] == '3DSemanticLabel': # Set list of classes (strings) classes_names = [] with open(base_path + '/' + params['CLASSES_PATH'], 'r') as file: for line in file: line = line.rstrip('\n').split(',')[0] classes_names.append(line) ds.setClasses(classes_names, params['OUTPUTS_IDS_DATASET'][0]) ds.setSemanticClasses(base_path + '/' + params['CLASSES_PATH'], params['OUTPUTS_IDS_DATASET'][0]) ### 3DLabels or 3DSemanticLabels ds.setOutput(base_path + '/' + params['IMG_FILES']['train'][1], 'train', type=params['TYPE_OUT'], id=params['OUTPUTS_IDS_DATASET'][0], associated_id_in=params['INPUTS_IDS_DATASET'][0], num_poolings=params['NUM_MODEL_POOLINGS']) if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']: ds.setOutput(base_path + '/' + params['IMG_FILES']['val'][1], 'val', type=params['TYPE_OUT'], id=params['OUTPUTS_IDS_DATASET'][0], associated_id_in=params['INPUTS_IDS_DATASET'][0], num_poolings=params['NUM_MODEL_POOLINGS']) if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']: ds.setOutput(base_path + '/' + params['IMG_FILES']['test'][1], 'test', type=params['TYPE_OUT'], id=params['OUTPUTS_IDS_DATASET'][0], associated_id_in=params['INPUTS_IDS_DATASET'][0], num_poolings=params['NUM_MODEL_POOLINGS']) if params['DISCARD_CLASSES']: weights = np.ones((params['NUM_CLASSES'], )) for c in params['DISCARD_CLASSES']: weights[c] = 0.0 ds.extra_variables['class_weights_' + params['OUTPUTS_IDS_DATASET'][0]] = weights if params['WEIGHT_CLASSES']: weights = params['WEIGHT_CLASSES'] ds.extra_variables['class_weights_' + params['OUTPUTS_IDS_DATASET'][0]] = weights ### Single multi-label if params['APPLY_MULTILABEL_CLASSIFICATION']: n_classes = len(ds.classes[params['OUTPUTS_IDS_DATASET'][0]]) multilabel = convert3DLabels2multilabel( base_path + '/' + params['IMG_FILES']['train'][1], n_classes) ds.setOutput(multilabel, 'train', 
type='binary', id=params['OUTPUTS_IDS_DATASET'][1]) if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']: multilabel = convert3DLabels2multilabel( base_path + '/' + params['IMG_FILES']['val'][1], n_classes) ds.setOutput(multilabel, 'val', type='binary', id=params['OUTPUTS_IDS_DATASET'][1]) if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']: multilabel = convert3DLabels2multilabel( base_path + '/' + params['IMG_FILES']['test'][1], n_classes) ds.setOutput(multilabel, 'test', type='binary', id=params['OUTPUTS_IDS_DATASET'][1]) # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, params['DATASET_STORE_PATH']) else: # We can easily recover it with a single line ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl') return ds
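# The DISCARD_CLASSES / WEIGHT_CLASSES handling above boils down to building a per-class weight
# vector. A small stand-alone sketch of that logic (the helper name is illustrative):
import numpy as np

def build_class_weights(num_classes, discard_classes=(), class_weights=None):
    """Return one weight per class: explicit weights if given, else ones with discarded classes zeroed."""
    if class_weights is not None:
        return np.asarray(class_weights, dtype=float)
    weights = np.ones((num_classes,))
    for c in discard_classes:
        weights[c] = 0.0
    return weights

# Example: build_class_weights(5, discard_classes=[0]) -> array([0., 1., 1., 1., 1.])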
def apply_NMT_model(params, load_dataset=None): """ Sample from a previously trained model. :param params: Dictionary of network hyperparameters. :param load_dataset: Load dataset from file or build it from the parameters. :return: None """ # Load data if load_dataset is None: dataset = build_dataset(params) else: dataset = loadDataset(load_dataset) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # Load model nmt_model = loadModel(params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH']) # Evaluate training extra_vars = { 'language': params.get('TRG_LAN', 'en'), 'n_parallel_loaders': params['PARALLEL_LOADERS'], 'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD']), 'detokenize_f': eval('dataset.' + params['DETOKENIZATION_METHOD']), 'apply_detokenization': params['APPLY_DETOKENIZATION'], 'tokenize_hypotheses': params['TOKENIZE_HYPOTHESES'], 'tokenize_references': params['TOKENIZE_REFERENCES'], } input_text_id = params['INPUTS_IDS_DATASET'][0] vocab_x = dataset.vocabulary[input_text_id]['idx2words'] vocab_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words'] if params['BEAM_SEARCH']: extra_vars['beam_size'] = params.get('BEAM_SIZE', 6) extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT', -1) extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30) extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True) extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL'] extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL'] extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET'] extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET'] extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False) extra_vars['search_pruning'] = params.get('SEARCH_PRUNING', False) extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0) extra_vars['coverage_penalty'] = params.get('COVERAGE_PENALTY', False) extra_vars['length_penalty'] = params.get('LENGTH_PENALTY', False) extra_vars['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0) extra_vars['coverage_norm_factor'] = params.get( 'COVERAGE_NORM_FACTOR', 0.0) extra_vars['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \ else params.get('MAX_OUTPUT_TEXT_LEN', 50) extra_vars['pos_unk'] = params['POS_UNK'] extra_vars['output_max_length_depending_on_x'] = params.get( 'MAXLEN_GIVEN_X', True) extra_vars['output_max_length_depending_on_x_factor'] = params.get( 'MAXLEN_GIVEN_X_FACTOR', 3) extra_vars['output_min_length_depending_on_x'] = params.get( 'MINLEN_GIVEN_X', True) extra_vars['output_min_length_depending_on_x_factor'] = params.get( 'MINLEN_GIVEN_X_FACTOR', 2) extra_vars['attend_on_output'] = params.get( 'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower()) if params['POS_UNK']: extra_vars['heuristic'] = params['HEURISTIC'] if params['HEURISTIC'] > 0: extra_vars['mapping'] = dataset.mapping for s in params["EVAL_ON_SETS"]: extra_vars[s] = dict() extra_vars[s]['references'] = dataset.extra_variables[s][ params['OUTPUTS_IDS_DATASET'][0]] callback_metric = PrintPerformanceMetricOnEpochEndOrEachNUpdates( nmt_model, dataset, gt_id=params['OUTPUTS_IDS_DATASET'][0], metric_name=params['METRICS'], set_name=params['EVAL_ON_SETS'], batch_size=params['BATCH_SIZE'], each_n_epochs=params['EVAL_EACH'], extra_vars=extra_vars, reload_epoch=params['RELOAD'], is_text=True, input_text_id=input_text_id, 
save_path=nmt_model.model_path, index2word_y=vocab_y, index2word_x=vocab_x, sampling_type=params['SAMPLING'], beam_search=params['BEAM_SEARCH'], start_eval_on_epoch=params['START_EVAL_ON_EPOCH'], write_samples=True, write_type=params['SAMPLING_SAVE_MODE'], eval_on_epochs=params['EVAL_EACH_EPOCHS'], save_each_evaluation=False, verbose=params['VERBOSE']) callback_metric.evaluate( params['RELOAD'], counter_name='epoch' if params['EVAL_EACH_EPOCHS'] else 'update')
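# A minimal invocation sketch for apply_NMT_model, assuming load_parameters() comes from config.py
# as in the other entry points of this document. With load_dataset=None the dataset is (re)built
# from the parameters instead of being read from disk.
if __name__ == '__main__':
    params = load_parameters()
    apply_NMT_model(params, load_dataset=None)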
def build_dataset(params):
    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)
        max_text_len = params['MAX_INPUT_TEXT_LEN']

        ##### INPUT DATA
        ### QUESTIONS
        ds.setInput(base_path + '/' + params['QST_FILES']['train'][0], 'train', type='text',
                    id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                    build_vocabulary=True, fill=params['FILL'],
                    max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'],
                    repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['val'][0], 'val', type='text',
                    id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'], max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'], repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['test'][0], 'test', type='text',
                    id=params['INPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'], max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'], repeat_set=params['REPEAT_QST'])

        ### QUESTIONS' associated IDs
        ds.setInput(base_path + '/' + params['QST_FILES']['train'][1], 'train', type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['val'][1], 'val', type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['test'][1], 'test', type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids', repeat_set=params['REPEAT_QST'])

        ### IMAGES
        ds.setInput(base_path + '/' + params['IMG_FILES']['train'][0], 'train', type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['val'][0], 'val', type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['test'][0], 'test', type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1], feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])

        ### IMAGES' associated IDs
        ds.setInput(base_path + '/' + params['IMG_FILES']['train'][1], 'train', type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids', repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['val'][1], 'val', type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids', repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['test'][1], 'test', type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids', repeat_set=params['REPEAT_IMG'])

        ##### OUTPUT DATA
        ### ANSWERS
        ds.setOutput(base_path + '/' + params['ANS_FILES']['train'][0], 'train', type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                     build_vocabulary=True, fill=params['FILL'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'])
        ds.setOutput(base_path + '/' + params['ANS_FILES']['val'][0], 'val', type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                     fill=params['FILL'], max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                     max_words=params['OUTPUT_VOCABULARY_SIZE'])
        if 'test' in params['ANS_FILES']:
            ds.setOutput(base_path + '/' + params['ANS_FILES']['test'][0], 'test', type='text',
                         id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params['TOKENIZATION_METHOD'],
                         fill=params['FILL'], max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                         max_words=params['OUTPUT_VOCABULARY_SIZE'])

        # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
        ds.extra_variables['train'] = dict()
        ds.extra_variables['val'] = dict()
        ds.extra_variables['test'] = dict()
        ds.extra_variables['train']['quesFile'] = base_path + '/' + params['QST_FILES']['train'][2]
        ds.extra_variables['val']['quesFile'] = base_path + '/' + params['QST_FILES']['val'][2]
        ds.extra_variables['test']['quesFile'] = base_path + '/' + params['QST_FILES']['test'][2]
        ds.extra_variables['train']['annFile'] = base_path + '/' + params['ANS_FILES']['train'][1]
        ds.extra_variables['val']['annFile'] = base_path + '/' + params['ANS_FILES']['val'][1]
        if 'test' in params['ANS_FILES']:
            ds.extra_variables['test']['annFile'] = base_path + '/' + params['ANS_FILES']['test'][1]

        # Remove all samples of the train set not belonging to the top classes chosen
        if params['KEEP_TOP_ANSWERS']:
            ds.keepTopOutputs('train', params['OUTPUTS_IDS_DATASET'][0], params['OUTPUT_VOCABULARY_SIZE'])
        # Filter top K answers per question-image pair
        if params['FILTER_ANSWERS']:
            filter_k_frequent_answers(ds, params)

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATA_ROOT_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATA_ROOT_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')

    return ds
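# Once stored, the VQA dataset can be reloaded and the paths needed by the VQA evaluator recovered
# from ds.extra_variables. A short sketch, assuming the same params dict as above:
ds = loadDataset(params['DATA_ROOT_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')
print(ds.extra_variables['val']['quesFile'])  # original questions file
print(ds.extra_variables['val']['annFile'])   # original annotations file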
try: for arg in args.changes: try: k, v = arg.split('=') except ValueError: print 'Overwritten arguments must have the form key=Value. \n Currently are: %s' % str( args.changes) exit(1) try: params[k] = ast.literal_eval(v) except ValueError: params[k] = v except ValueError: print 'Error processing arguments: (', k, ",", v, ")" exit(2) dataset = loadDataset(args.dataset) dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True) params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['INPUTS_IDS_DATASET'][0]] params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ params['OUTPUTS_IDS_DATASET'][0]] # For converting predictions into sentences index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] if params.get('APPLY_DETOKENIZATION', False):
def bpe_loading(args): logging.info("Using an ensemble of %d models" % len(args["models"])) models = [loadModel(m, -1, full_path=True) for m in args["models"]] dataset = loadDataset(args["dataset"]) return models, dataset
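# Usage sketch for bpe_loading: it expects a dict-like `args` with 'models' (a list of model paths)
# and 'dataset' (the path to a stored Dataset pickle). The paths below are illustrative only.
models, dataset = bpe_loading({'models': ['trained_models/model_epoch_10', 'trained_models/model_epoch_12'],
                               'dataset': 'datasets/Dataset_mydata.pkl'})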
def main(): args = parse_args() server_address = (args.address, args.port) httpd = HTTPServer(server_address, NMTHandler) logger.setLevel(args.logging_level) parameters = load_parameters() if args.config is not None: logger.info("Loading parameters from %s" % str(args.config)) parameters = update_parameters(parameters, pkl2dict(args.config)) if args.online: online_parameters = load_parameters_online() parameters = update_parameters(parameters, online_parameters) try: for arg in args.changes: try: k, v = arg.split('=') except ValueError: print( 'Overwritten arguments must have the form key=Value. \n Currently are: %s' % str(args.changes)) exit(1) try: parameters[k] = ast.literal_eval(v) except ValueError: parameters[k] = v except ValueError: print('Error processing arguments: (', k, ",", v, ")") exit(2) dataset = loadDataset(args.dataset) # For converting predictions into sentences # Dataset backwards compatibility bpe_separator = dataset.BPE_separator if hasattr( dataset, "BPE_separator") and dataset.BPE_separator is not None else '@@' # Build BPE tokenizer if necessary if 'bpe' in parameters['TOKENIZATION_METHOD'].lower(): logger.info('Building BPE') if not dataset.BPE_built: dataset.build_bpe(parameters.get( 'BPE_CODES_PATH', parameters['DATA_ROOT_PATH'] + '/training_codes.joint'), separator=bpe_separator) # Build tokenization function tokenize_f = eval('dataset.' + parameters.get('TOKENIZATION_METHOD', 'tokenize_bpe')) detokenize_function = eval( 'dataset.' + parameters.get('DETOKENIZATION_METHOD', 'detokenize_bpe')) dataset.build_moses_tokenizer(language=parameters['SRC_LAN']) dataset.build_moses_detokenizer(language=parameters['TRG_LAN']) tokenize_general = dataset.tokenize_moses detokenize_general = dataset.detokenize_moses # Prediction parameters params_prediction = dict() params_prediction['max_batch_size'] = parameters.get('BATCH_SIZE', 20) params_prediction['n_parallel_loaders'] = parameters.get( 'PARALLEL_LOADERS', 1) params_prediction['beam_size'] = parameters.get('BEAM_SIZE', 6) params_prediction['maxlen'] = parameters.get('MAX_OUTPUT_TEXT_LEN_TEST', 100) params_prediction['optimized_search'] = parameters['OPTIMIZED_SEARCH'] params_prediction['model_inputs'] = parameters['INPUTS_IDS_MODEL'] params_prediction['model_outputs'] = parameters['OUTPUTS_IDS_MODEL'] params_prediction['dataset_inputs'] = parameters['INPUTS_IDS_DATASET'] params_prediction['dataset_outputs'] = parameters['OUTPUTS_IDS_DATASET'] params_prediction['search_pruning'] = parameters.get( 'SEARCH_PRUNING', False) params_prediction['normalize_probs'] = True params_prediction['alpha_factor'] = parameters.get('ALPHA_FACTOR', 1.0) params_prediction['coverage_penalty'] = True params_prediction['length_penalty'] = True params_prediction['length_norm_factor'] = parameters.get( 'LENGTH_NORM_FACTOR', 0.0) params_prediction['coverage_norm_factor'] = parameters.get( 'COVERAGE_NORM_FACTOR', 0.0) params_prediction['pos_unk'] = parameters.get('POS_UNK', False) params_prediction['heuristic'] = parameters.get('HEURISTIC', 0) params_prediction['state_below_index'] = -1 params_prediction['output_text_index'] = 0 params_prediction['state_below_maxlen'] = -1 if parameters.get( 'PAD_ON_BATCH', True) else parameters.get('MAX_OUTPUT_TEXT_LEN', 50) params_prediction['output_max_length_depending_on_x'] = parameters.get( 'MAXLEN_GIVEN_X', True) params_prediction[ 'output_max_length_depending_on_x_factor'] = parameters.get( 'MAXLEN_GIVEN_X_FACTOR', 3) params_prediction['output_min_length_depending_on_x'] = parameters.get( 'MINLEN_GIVEN_X', 
True) params_prediction[ 'output_min_length_depending_on_x_factor'] = parameters.get( 'MINLEN_GIVEN_X_FACTOR', 2) params_prediction['attend_on_output'] = parameters.get( 'ATTEND_ON_OUTPUT', 'transformer' in parameters['MODEL_TYPE'].lower()) # Manage pos_unk strategies if parameters['POS_UNK']: mapping = None if dataset.mapping == dict() else dataset.mapping else: mapping = None if 'transformer' in parameters['MODEL_TYPE'].lower(): params_prediction['pos_unk'] = False params_prediction['coverage_penalty'] = False # Training parameters parameters_training = dict() if args.online: logger.info('Loading models from %s' % str(args.models)) parameters_training = { # Traning parameters 'n_epochs': parameters['MAX_EPOCH'], 'shuffle': False, 'loss': parameters.get('LOSS', 'categorical_crossentropy'), 'batch_size': parameters.get('BATCH_SIZE', 1), 'homogeneous_batches': False, 'optimizer': parameters.get('OPTIMIZER', 'SGD'), 'lr': parameters.get('LR', 0.1), 'lr_decay': parameters.get('LR_DECAY', None), 'lr_gamma': parameters.get('LR_GAMMA', 1.), 'epochs_for_save': -1, 'verbose': args.verbose, 'eval_on_sets': parameters.get('EVAL_ON_SETS_KERAS', None), 'n_parallel_loaders': parameters['PARALLEL_LOADERS'], 'extra_callbacks': [], # callbacks, 'reload_epoch': parameters['RELOAD'], 'epoch_offset': parameters['RELOAD'], 'data_augmentation': parameters['DATA_AUGMENTATION'], 'patience': parameters.get('PATIENCE', 0), 'metric_check': parameters.get('STOP_METRIC', None), 'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True), 'each_n_epochs': parameters.get('EVAL_EACH', 1), 'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0), 'additional_training_settings': { 'k': parameters.get('K', 1), 'tau': parameters.get('TAU', 1), 'lambda': parameters.get('LAMBDA', 0.5), 'c': parameters.get('C', 0.5), 'd': parameters.get('D', 0.5) } } model_instances = [ TranslationModel( parameters, model_type=parameters['MODEL_TYPE'], verbose=parameters['VERBOSE'], model_name=parameters['MODEL_NAME'] + '_' + str(i), vocabularies=dataset.vocabulary, store_path=parameters['STORE_PATH'], set_optimizer=False) for i in range(len(args.models)) ] models = [ updateModel(model, path, -1, full_path=True) for (model, path) in zip(model_instances, args.models) ] else: models = [loadModel(m, -1, full_path=True) for m in args.models] for nmt_model in models: nmt_model.setParams(parameters) nmt_model.setOptimizer() parameters['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ parameters['INPUTS_IDS_DATASET'][0]] parameters['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[ parameters['OUTPUTS_IDS_DATASET'][0]] # Get word2index and index2word dictionaries index2word_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'] [0]]['idx2words'] word2index_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'] [0]]['words2idx'] index2word_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'] [0]]['idx2words'] word2index_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'] [0]]['words2idx'] excluded_words = None interactive_beam_searcher = NMTSampler(models, dataset, parameters, params_prediction, parameters_training, tokenize_f, detokenize_function, tokenize_general, detokenize_general, mapping=mapping, word2index_x=word2index_x, word2index_y=word2index_y, index2word_y=index2word_y, eos_symbol=args.eos_symbol, excluded_words=excluded_words, online=args.online, verbose=args.verbose) httpd.sampler = interactive_beam_searcher logger.info('Server starting at %s' % str(server_address)) httpd.serve_forever()
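# The transformer special-case above (disabling pos_unk and the coverage penalty) can be factored
# into a small helper. This is only a sketch of the adjustment already performed in main(); the
# helper name is illustrative. The likely rationale is that both options rely on a single
# source-target attention matrix, which a multi-head transformer does not expose in the same way.
def adjust_prediction_params_for_model(params_prediction, model_type):
    if 'transformer' in model_type.lower():
        params_prediction = dict(params_prediction)
        params_prediction['pos_unk'] = False
        params_prediction['coverage_penalty'] = False
    return params_prediction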
def sample_ensemble(args, params):
    """
    Use several translation models for obtaining predictions from a source text file.

    :param argparse.Namespace args: Arguments given to the method:

        * dataset: Dataset instance with data.
        * text: Text file with source sentences.
        * splits: Splits to sample. Should be already included in the dataset object.
        * dest: Output file to save scores.
        * weights: Weight given to each model in the ensemble. You should provide the same number
          of weights as models. By default, it applies the same weight to each model (1/N).
        * n_best: Write n-best list (n = beam size).
        * config: Config .pkl for loading the model configuration. If not specified, hyperparameters
          are read from config.py.
        * models: Path to the models.
        * verbose: Be verbose or not.

    :param params: parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search

    logger.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                       'transformer' in params['MODEL_TYPE'].lower())
    params_prediction['glossary'] = params.get('GLOSSARY', None)

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if args.glossary is not None:
        glossary = pkl2dict(args.glossary)
    elif params_prediction['glossary'] is not None:
        glossary = pkl2dict(params_prediction['glossary'])
    else:
        glossary = None

    if model_weights:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction,
                                           model_weights=model_weights, n_best=args.n_best,
                                           verbose=args.verbose)
        predictions = beam_searcher.predictBeamSearchNet()[s]
        samples = predictions['samples']
        alphas = predictions['alphas'] if params_prediction['pos_unk'] else None

        if params_prediction['pos_unk']:
            sources = [x.strip() for x in open(args.text, 'r').read().split('\n')]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            sources = None

        decoded_predictions = decode_predictions_beam_search(samples, index2word_y, glossary=glossary,
                                                             alphas=alphas, x_text=sources,
                                                             heuristic=heuristic, mapping=mapping,
                                                             verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            decoded_predictions = list(map(detokenize_function, decoded_predictions))

        if args.n_best:
            n_best_predictions = []
            for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(predictions['n_best']):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search([n_best_pred], index2word_y, glossary=glossary,
                                                          alphas=[n_best_alpha] if params_prediction['pos_unk'] else None,
                                                          x_text=[sources[i]] if params_prediction['pos_unk'] else None,
                                                          heuristic=heuristic, mapping=mapping,
                                                          verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = list(map(detokenize_function, pred))
                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':  # only the 'list' save mode is supported
                list2file(filepath, decoded_predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(decoded_predictions)
            if args.n_best:
                logger.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)

    logger.info('Sampling finished')
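# A minimal invocation sketch for sample_ensemble. The Namespace fields mirror the arguments listed
# in the docstring; all paths are illustrative, and load_parameters()/pkl2dict are assumed to be
# importable as in the other entry points of this document.
from argparse import Namespace

args = Namespace(models=['trained_models/model_epoch_10'],
                 dataset='datasets/Dataset_mydata.pkl',
                 text='data/test.src',
                 splits=['test'],
                 dest='hyps.test.txt',
                 weights=[],
                 n_best=False,
                 glossary=None,
                 config=None,
                 verbose=0)
params = load_parameters() if args.config is None else pkl2dict(args.config)
sample_ensemble(args, params)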
def build_dataset(params): """ Builds (or loads) a Dataset instance. :param params: Parameters specifying Dataset options :return: Dataset object """ if params['REBUILD_DATASET']: # We build a new dataset instance if params['VERBOSE'] > 0: silence = False logging.info('Building ' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + ' dataset') else: silence = True base_path = params['DATA_ROOT_PATH'] name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params[ 'TRG_LAN'] ds = Dataset(name, base_path, silence=silence) # OUTPUT DATA # Let's load the train, val and test splits of the target language sentences (outputs) # the files include a sentence per line. ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'], 'train', type='text', id=params['OUTPUTS_IDS_DATASET'][0], tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'), build_vocabulary=True, pad_on_batch=params.get('PAD_ON_BATCH', True), sample_weights=params.get('SAMPLE_WEIGHTS', True), fill=params.get('FILL', 'end'), max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70), max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0), min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0)) if params.get('ALIGN_FROM_RAW', True) and not params.get('HOMOGENEOUS_BATCHES', False): ds.setRawOutput(base_path + '/' + params['TEXT_FILES']['train'] + params['TRG_LAN'], 'train', type='file-name', id='raw_' + params['OUTPUTS_IDS_DATASET'][0]) for split in ['val', 'test']: if params['TEXT_FILES'].get(split) is not None: ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'], split, type='text', id=params['OUTPUTS_IDS_DATASET'][0], pad_on_batch=params.get('PAD_ON_BATCH', True), tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'), sample_weights=params.get('SAMPLE_WEIGHTS', True), max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70), max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0)) if params.get('ALIGN_FROM_RAW', True) and not params.get( 'HOMOGENEOUS_BATCHES', False): ds.setRawOutput( base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'], split, type='file-name', id='raw_' + params['OUTPUTS_IDS_DATASET'][0]) # INPUT DATA # We must ensure that the 'train' split is the first (for building the vocabulary) for split in ['train', 'val', 'test']: if params['TEXT_FILES'].get(split) is not None: if split == 'train': build_vocabulary = True else: build_vocabulary = False ds.setInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'], split, type='text', id=params['INPUTS_IDS_DATASET'][0], pad_on_batch=params.get('PAD_ON_BATCH', True), tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'), build_vocabulary=build_vocabulary, fill=params.get('FILL', 'end'), max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70), max_words=params.get('INPUT_VOCABULARY_SIZE', 0), min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB', 0)) if len(params['INPUTS_IDS_DATASET']) > 1: if 'train' in split: ds.setInput( base_path + '/' + params['TEXT_FILES'][split] + params['TRG_LAN'], split, type='text', id=params['INPUTS_IDS_DATASET'][1], required=False, tokenization=params.get('TOKENIZATION_METHOD', 'tokenize_none'), pad_on_batch=params.get('PAD_ON_BATCH', True), build_vocabulary=params['OUTPUTS_IDS_DATASET'][0], offset=1, fill=params.get('FILL', 'end'), max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70), max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0)) else: ds.setInput(None, split, type='ghost', id=params['INPUTS_IDS_DATASET'][-1], required=False) if 
params.get('ALIGN_FROM_RAW', True) and not params.get( 'HOMOGENEOUS_BATCHES', False): ds.setRawInput(base_path + '/' + params['TEXT_FILES'][split] + params['SRC_LAN'], split, type='file-name', id='raw_' + params['INPUTS_IDS_DATASET'][0]) if params.get('POS_UNK', False): if params.get('HEURISTIC', 0) > 0: ds.loadMapping(params['MAPPING']) # If we had multiple references per sentence keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS']) # We have finished loading the dataset, now we can store it for using it in the future saveDataset(ds, params['DATASET_STORE_PATH']) else: # We can easily recover it with a single line ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl') return ds
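# The second (target-side) input built above with offset=1 provides the decoder's previous-word
# input ("state below"): the target sentence shifted one position to the right. A tiny illustration
# with made-up token ids, 0 standing in for the padding/start symbol:
target = [13, 7, 42, 2]        # y_1 ... y_T (2 = end-of-sentence id in this toy example)
state_below = [0, 13, 7, 42]   # what the decoder is fed at each step when predicting `target`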
def parse_args():
    parser = argparse.ArgumentParser(
        "Minimizes a dataset by removing the data stored in it (training, development and test splits). "
        "The rest of the parameters are kept. "
        "Useful for reloading datasets with new data.")
    parser.add_argument("-d", "--dataset", required=True, help="Stored instance of the dataset")
    parser.add_argument("-o", "--output", help="Output dataset file.", default="")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    # Load dataset
    ds = loadDataset(args.dataset)

    # Reinitialize values to empty
    ds.loaded_train = [False, False]
    ds.loaded_val = [False, False]
    ds.loaded_test = [False, False]
    ds.loaded_raw_train = [False, False]
    ds.loaded_raw_val = [False, False]
    ds.loaded_raw_test = [False, False]
    ds.len_train = 0
    ds.len_val = 0
    ds.len_test = 0

    # Remove data
    for key in ds.X_train.keys():
        ds.X_train[key] = None
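# The snippet above is cut short after clearing X_train. A sketch of how the remaining containers
# could be cleared and the minimized dataset stored; the attribute names follow the pattern already
# used above, and the handling of args.output is an assumption.
for container in (ds.X_val, ds.X_test, ds.Y_train, ds.Y_val, ds.Y_test):
    for key in container.keys():
        container[key] = None
saveDataset(ds, args.output if args.output else '.')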