def test_load_dataset():
    params = load_parameters()
    ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    assert len(ds.vocabulary.keys()) >= 3
    for voc in ds.vocabulary:
        assert len(ds.vocabulary[voc].keys()) == 2
Example #2
    def test_load_dataset(self):
        params = load_parameters()
        ds = loadDataset('./Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
        self.assertIsInstance(ds, Dataset)
        self.assertIsInstance(ds.vocabulary, dict)
        self.assertGreaterEqual(len(ds.vocabulary.keys()), 3)
        for voc in ds.vocabulary:
            self.assertEqual(len(ds.vocabulary[voc].keys()), 2)
def build_dataset(params):
    
    if params['REBUILD_DATASET']: # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        ##### INPUT DATA
        # Let's load the associated images (inputs)
        num_cap = 1 # We only extract one feature vector per image
        list_train = base_path + '/' + params['IMG_FILES']['train'][0]
        list_val = base_path + '/' + params['IMG_FILES']['val'][0]
        list_test = base_path + '/' + params['IMG_FILES']['test'][0]
        ds.setInput(list_train, 'train',
                    type='raw-image', id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'],
                    repeat_set=num_cap)
        ds.setInput(list_val, 'val',
                    type='raw-image', id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'],
                    repeat_set=num_cap)
        ds.setInput(list_test, 'test',
                    type='raw-image', id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'], img_size_crop=params['IMG_CROP_SIZE'],
                    repeat_set=num_cap)
        ### IMAGES' associated IDs
        ds.setInput(base_path + '/' + params['IMG_FILES']['train'][1], 'train',
                    type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=num_cap)
        ds.setInput(base_path + '/' + params['IMG_FILES']['val'][1], 'val',
                    type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=num_cap)
        ds.setInput(base_path + '/' + params['IMG_FILES']['test'][1], 'test',
                    type='id', id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=num_cap)
        # Train mean
        ds.setTrainMean(params['MEAN_IMAGE'], params['INPUTS_IDS_DATASET'][0])

        ###### OUTPUT DATA: None

        # Process dataset for keeping only one caption per image and storing the rest in a dict() with the following format:
        #        ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
        #keep_n_captions(ds, repeat=[1, 1], n=1, set_names=['val','test'])

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl')

    return ds
Example #4
def loadMSVD():
    logging.info('Loading MSVD dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the images are stored

    base_path = '/media/HDD_2TB/DATASETS/MSVD/'
    name = 'MSVD_VideoDescription'
    ds = Dataset(name, base_path)
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line. In this dataset a variable number
    #    of descriptions per video are provided.

    ds.setOutput(base_path + 'train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated videos (inputs)
    #    we must take into account that in this dataset we have a different number of sentences per video, 
    #    for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
    #    containing the number of captions in each video.

    num_captions_train = np.load(base_path + 'train_descriptions_counts.npy')
    num_captions_val = np.load(base_path + 'val_descriptions_counts.npy')
    num_captions_test = np.load(base_path + 'test_descriptions_counts.npy')

    ds.setInput([base_path + 'train_imgs_list.txt', base_path + 'train_imgs_counts.txt'],
                'train', type='video', id='videos',
                repeat_set=num_captions_train)
    ds.setInput([base_path + 'val_imgs_list.txt', base_path + 'val_imgs_counts.txt'],
                'val', type='video', id='videos',
                repeat_set=num_captions_val)
    ds.setInput([base_path + 'test_imgs_list.txt', base_path + 'test_imgs_counts.txt'],
                'test', type='video', id='videos',
                repeat_set=num_captions_test)

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], id='videos')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Example #5
def loadMSVD():
    logging.info('Loading MSVD dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the images are stored

    base_path = '/media/HDD_2TB/DATASETS/MSVD/'
    name = 'MSVD_VideoDescription'
    ds = Dataset(name, base_path)
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line. In this dataset a variable number
    #    of descriptions per video are provided.

    ds.setOutput(base_path + 'train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated videos (inputs)
    #    we must take into account that in this dataset we have a different number of sentences per video,
    #    for this reason we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
    #    containing the number of captions in each video.

    num_captions_train = np.load(base_path + 'train_descriptions_counts.npy')
    num_captions_val = np.load(base_path + 'val_descriptions_counts.npy')
    num_captions_test = np.load(base_path + 'test_descriptions_counts.npy')

    ds.setInput([base_path + 'train_imgs_list.txt', base_path + 'train_imgs_counts.txt'],
                'train', type='video', id='videos',
                repeat_set=num_captions_train)
    ds.setInput([base_path + 'val_imgs_list.txt', base_path + 'val_imgs_counts.txt'],
                'val', type='video', id='videos',
                repeat_set=num_captions_val)
    ds.setInput([base_path + 'test_imgs_list.txt', base_path + 'test_imgs_counts.txt'],
                'test', type='video', id='videos',
                repeat_set=num_captions_test)

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='videos')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Example #6
def test_load_dataset():
    params = load_parameters()
    ds = loadDataset(os.path.join('datasets',
                                  'Dataset_' + params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params[
                                      'TRG_LAN'] + '.pkl'))
    assert isinstance(ds, Dataset)
    assert isinstance(ds.vocabulary, dict)
    assert len(list(ds.vocabulary)) >= 3
    for voc in ds.vocabulary:
        assert len(list(ds.vocabulary[voc])) == 2
Example #7
def score_corpus(args, params):
    print "Using an ensemble of %d models" % len(args.models)
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    if args.source is not None:
        dataset = update_dataset_from_file(dataset, args.source, params, splits=args.splits,
                                           output_text_filename=args.target, compute_state_below=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    # Apply scoring
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])
    for s in args.splits:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}

        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
            params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
            params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
            beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, verbose=args.verbose)
            scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.')
        else:
            print(scores)
Example #8
def loadFood101():
    logging.info('Loading Food101 dataset')
    logging.info(
        'INFO: in order to load this dataset it must be placed in ../data/Food101/images/ after downloading it from https://www.vision.ee.ethz.ch/datasets_extra/food-101/'
    )

    base_path = '../data/Food101/'
    name = 'Food101'
    ds = Dataset(name, base_path + 'images')

    # Insert inputs (images)
    ds.setInput(base_path + 'meta/train_split.txt',
                'train',
                type='image',
                id='images',
                img_size_crop=[227, 227, 3])
    ds.setInput(base_path + 'meta/val_split.txt',
                'val',
                type='image',
                id='images')
    ds.setInput(base_path + 'meta/test.txt', 'test', type='image', id='images')

    # Insert outputs (labels)
    ds.setOutput(base_path + 'meta/train_labels.txt',
                 'train',
                 type='categorical',
                 id='labels')
    ds.setOutput(base_path + 'meta/val_labels.txt',
                 'val',
                 type='categorical',
                 id='labels')
    ds.setOutput(base_path + 'meta/test_labels.txt',
                 'test',
                 type='categorical',
                 id='labels')

    # Set list of classes (strings)
    ds.setClasses(base_path + 'meta/classes.txt', 'labels')

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067],
                    data_id='images')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Example #9
    def __init__(self):
        self.session = tf.Session()
        self.graph = tf.get_default_graph()
        with self.graph.as_default():
            with self.session.as_default():
                dataset = loadDataset("dataset/Dataset_tutorial_dataset.pkl")
                nmt_model = loadModel("", epoch_num)
                params = nmt_model.params
                inputMapping = dict()
                for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                    pos_source = dataset.ids_inputs.index(id_in)
                    id_dest = nmt_model.ids_inputs[i]
                    inputMapping[id_dest] = pos_source
                nmt_model.setInputsMapping(inputMapping)

                outputMapping = dict()
                for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                    pos_target = dataset.ids_outputs.index(id_out)
                    id_dest = nmt_model.ids_outputs[i]
                    outputMapping[id_dest] = pos_target
                nmt_model.setOutputsMapping(outputMapping)
                params_prediction = {
                    'language': 'en',
                    'tokenize_f': eval('dataset.' + 'tokenize_basic'),
                    'beam_size': 2,
                    'optimized_search': True,
                    'n_parallel_loaders': 1,
                    'maxlen': 50,
                    'model_inputs': ['source_text', 'state_below'],
                    'model_outputs': ['target_text'],
                    'dataset_inputs': ['source_text', 'state_below'],
                    'dataset_outputs': ['target_text'],
                    'normalize': True,
                    'pos_unk': True,
                    'heuristic': 0,
                    'state_below_maxlen': -1,
                    'length_norm_factor': 1.0,
                    'length_penalty': True,
                    'predict_on_sets': ['test'],
                    'verbose': 0,
                }
                self.params = params
                self.dataset = dataset
                self.nmt_model = nmt_model
                self.params_prediction = params_prediction
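
    def translate_test_split(self):
        # Hypothetical companion method, not part of the original snippet: a
        # minimal sketch of how the objects prepared in __init__ could be used
        # for decoding, mirroring Examples #14 and #21. It assumes
        # decode_predictions_beam_search has been imported from keras_wrapper.utils.
        with self.graph.as_default():
            with self.session.as_default():
                # Beam search over the 'test' split configured in params_prediction.
                predictions = self.nmt_model.predictBeamSearchNet(
                    self.dataset, self.params_prediction)['test']
                vocab = self.dataset.vocabulary['target_text']['idx2words']
                # Turn the sampled word indices back into plain text.
                return decode_predictions_beam_search(predictions['samples'],
                                                      vocab, verbose=0)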
Example #10
def loadFlickr8k():
    logging.info('Loading Flickr8k dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the images are stored

    base_path = '/media/HDD_2TB/DATASETS/Flickr8k/'
    name = 'Flickr8k_ImageDescription'
    ds = Dataset(name, base_path + 'Flicker8k_Dataset')
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line 
    #    and a set of 5 consecutive descriptions corresponds to a single input image

    ds.setOutput(base_path + 'text/train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated images (inputs)
    #    we must take into account that in this dataset we have 5 sentences per image, 
    #    for this reason we introduce the parameter 'repeat_set'=5

    ds.setInput(base_path + 'text/Flickr_8k.trainImages.txt', 'train',
                type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.devImages.txt', 'val',
                type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.testImages.txt', 'test',
                type='image', id='images', repeat_set=5)

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], id='images')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Example #11
def loadFlickr8k():
    logging.info('Loading Flickr8k dataset')

    # Build basic dataset structure
    #    we assign it a name and the path where the images are stored

    base_path = '/media/HDD_2TB/DATASETS/Flickr8k/'
    name = 'Flickr8k_ImageDescription'
    ds = Dataset(name, base_path + 'Flicker8k_Dataset')
    max_text_len = 35

    # Let's load the train, val and test splits of the descriptions (outputs)
    #    the files include a description per line
    #    and a set of 5 consecutive descriptions corresponds to a single input image

    ds.setOutput(base_path + 'text/train_descriptions.txt', 'train',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', build_vocabulary=True, max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/val_descriptions.txt', 'val',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)
    ds.setOutput(base_path + 'text/test_descriptions.txt', 'test',
                 type='text', id='descriptions',
                 tokenization='tokenize_basic', max_text_len=max_text_len)

    # Let's load the associated images (inputs)
    #    we must take into account that in this dataset we have 5 sentences per image,
    #    for this reason we introduce the parameter 'repeat_set'=5

    ds.setInput(base_path + 'text/Flickr_8k.trainImages.txt', 'train', type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.devImages.txt', 'val', type='image', id='images', repeat_set=5)
    ds.setInput(base_path + 'text/Flickr_8k.testImages.txt', 'test', type='image', id='images', repeat_set=5)

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly. %d input samples. %d output samples' % (len(X), len(Y)))
Example #12
def evaluate_from_file(args):
    """
    Evaluate translation hypotheses from a file or a list of files of references.
    :param args: Evaluation parameters
    :return: None
    """
    language = args.language
    hypotheses_file = codecs.open(args.hypotheses, 'r', encoding='utf-8')
    ds = loadDataset(args.dataset)
    references = ds.extra_variables[args.split][list(
        ds.extra_variables[args.split].keys())[0]]
    step_size = args.step_size
    ref, hypothesis = load_textfiles(references, hypotheses_file)
    if step_size < 1:
        score = CocoScore(ref,
                          hypothesis,
                          metrics_list=args.metrics,
                          language=language)
        print("Scores: ")
        max_score_name_len = max([len(x) for x in list(score)])
        for score_name in sorted(list(score)):
            print("\t {0:{1}}".format(score_name, max_score_name_len) +
                  ": %.5f" % score[score_name])
    else:
        n = 0
        while True:
            n += step_size
            indices = range(min(n, len(ref)))
            partial_refs = {}
            partial_hyps = {}
            for i in indices:
                partial_refs[i] = ref[i]
                partial_hyps[i] = hypothesis[i]
            score = CocoScore(partial_refs,
                              partial_hyps,
                              metrics_list=args.metrics,
                              language=language)
            print(str(min(n, len(ref))) + " \tScore: ", score)
            if n > len(ref):
                break
    return
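
# A hypothetical call sketch for evaluate_from_file (not part of the original
# example). Attribute names are taken from the function body above; the file
# paths are placeholders and the metric identifiers depend on the scoring
# backend, so they are only illustrative.
if __name__ == '__main__':
    from argparse import Namespace

    evaluate_from_file(Namespace(hypotheses='hypotheses.en',
                                 dataset='datasets/Dataset_mydata_srctrg.pkl',
                                 split='val',
                                 metrics=['bleu'],
                                 language='en',
                                 step_size=0))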
Example #13
def loadFood101():
    logging.info('Loading Food101 dataset')
    logging.info(
        'INFO: in order to load this dataset it must be placed in ../data/Food101/images/ after downloading it from https://www.vision.ee.ethz.ch/datasets_extra/food-101/')

    base_path = '../data/Food101/'
    name = 'Food101'
    ds = Dataset(name, base_path + 'images')

    # Insert inputs (images)
    ds.setInput(base_path + 'meta/train_split.txt', 'train',
                type='image', id='images', img_size_crop=[227, 227, 3])
    ds.setInput(base_path + 'meta/val_split.txt', 'val',
                type='image', id='images')
    ds.setInput(base_path + 'meta/test.txt', 'test',
                type='image', id='images')

    # Insert outputs (labels)
    ds.setOutput(base_path + 'meta/train_labels.txt', 'train',
                 type='categorical', id='labels')
    ds.setOutput(base_path + 'meta/val_labels.txt', 'val',
                 type='categorical', id='labels')
    ds.setOutput(base_path + 'meta/test_labels.txt', 'test',
                 type='categorical', id='labels')

    # Set list of classes (strings)
    ds.setClasses(base_path + 'meta/classes.txt', 'labels')

    # Now let's set the dataset mean image for preprocessing the data
    ds.setTrainMean(mean_image=[122.6795, 116.6690, 104.0067], data_id='images')

    # We have finished loading the dataset, now we can store it for using it in the future
    saveDataset(ds, 'Datasets')

    # We can easily recover it with a single line
    ds = loadDataset('Datasets/Dataset_' + name + '.pkl')

    # Let's recover the first batch of data
    [X, Y] = ds.getXY('train', 10)
    logging.info('Sample data loaded correctly.')
Example #14
def get_model_predictions(asts_path):
    print("os.getcwd()", os.getcwd())
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    print("cur_dir", cur_dir)

    # if not os.path.isdir(os.path.join(os.getcwd(), 'keras')):
    #     print(subprocess.run(f'git clone https://github.com/MarcBS/keras.git', shell=True,
    #                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))

    # # nmt_keras_dir = os.path.join(os.getcwd, 'nmt-keras')
    # if not os.path.isdir(os.path.join(os.getcwd(), 'nmt-keras')):
    #     print(subprocess.run(f'git clone https://github.com/lvapeab/nmt-keras && cd "nmt-keras" && pipenv install -e .', shell=True,
    #                          stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    #     # print(subprocess.run(f'cd {nmt_keras_dir} && pipenv install -e .', shell=True,
    #     #                      stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True))
    #     print("ran cmds!!!")

    # sys.path.insert(0, os.path.join(os.getcwd(), 'nmt-keras'))
    # print("sys path!!!", sys.path)

    dataset = loadDataset(
        f'{cur_dir}/assets/epoch_{MODEL_EPOCH}_model_wrapper.pkl')
    with open(f'{cur_dir}/assets/params.json', 'r') as params_file:
        params = json.load(params_file)

    dataset.setInput(asts_path,
                     'test',
                     type='text',
                     id='source_text',
                     pad_on_batch=True,
                     tokenization=params['tokenize_x'],
                     fill='end',
                     max_text_len=params['x_max_text_len'],
                     min_occ=0)

    dataset.setInput(None,
                     'test',
                     type='ghost',
                     id='state_below',
                     required=False)

    dataset.setRawInput(asts_path,
                        'test',
                        type='file-name',
                        id='raw_source_text',
                        overwrite_split=True)

    nmt_model = loadModel(f'{cur_dir}/assets', MODEL_EPOCH)

    prediction_params = get_prediction_params()

    predictions = nmt_model.predictBeamSearchNet(dataset,
                                                 prediction_params)['test']

    vocab = dataset.vocabulary['target_text']['idx2words']
    samples = predictions['samples']  # Get word indices from the samples.

    predictions = decode_predictions_beam_search(samples,
                                                 vocab,
                                                 verbose=params['VERBOSE'])

    return predictions
Example #15
from keras_wrapper.model_ensemble import BeamSearchEnsemble
import os

"""## 3. Decoding with a trained Neural Machine Translation Model

Now, we'll load from disk the model we just trained and apply it to translate new text. In this case, we want to translate the 'test' split of our dataset.

Since we want to translate a new data split ('test'), we must add it to the dataset instance, just as we did before (in the first tutorial). If we also had the references for the test split and wanted to evaluate on it, we could add them to the dataset as well. Note that this is not mandatory and we could just predict without evaluating. (A direct `setInput` sketch is shown below, right after the datasets are loaded.)
"""
MODEL_PATH1 = os.path.join(os.getcwd(), 'models/empathy_100_hidden')
MODEL_PATH2 = os.path.join(os.getcwd(), 'models/persona_chat_lstm')

epoch_choice1 = 6
epoch_choice2 = 8

dataset1 = loadDataset(os.path.join(MODEL_PATH1, "dataset/Dataset_tutorial_dataset.pkl"))
dataset2 = loadDataset(os.path.join(MODEL_PATH1, "dataset/Dataset_tutorial_dataset.pkl"))

dataset2 = update_dataset_from_file(dataset2, args.text, params, splits=args.splits, remove_outputs=True)
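
# As noted in the text above, the new 'test' split can also be registered directly
# on the Dataset instance instead of going through update_dataset_from_file.
# A minimal sketch, assuming a plain-text source file; the path, ids and
# tokenization settings below are illustrative and mirror Example #21:
dataset1.setInput('data/test_source.query', 'test',
                  type='text', id='source_text',
                  pad_on_batch=True, tokenization='tokenize_basic',
                  fill='end', max_text_len=100, min_occ=0)
dataset1.setInput(None, 'test', type='ghost', id='state_below', required=False)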

# Load model
nmt_model1 = loadModel(MODEL_PATH1, epoch_choice1)
nmt_model2 = loadModel(MODEL_PATH2, epoch_choice2)


params = nmt_model1.params

# Define the inputs and outputs mapping from our Dataset instance to our model 
inputMapping = dict() 
for i, id_in in enumerate(params['INPUTS_IDS_DATASET']): 
    pos_source = dataset.ids_inputs.index(id_in) 
Example #16
def build_dataset(params):
    
    if params['REBUILD_DATASET']: # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)
        max_text_len = params['MAX_INPUT_TEXT_LEN']

        ##### INPUT DATA
        ### QUESTIONS
        ds.setInput(base_path+'/'+params['QST_FILES']['train'][0], 'train',
                   type='text', id=params['INPUTS_IDS_DATASET'][0],
                   tokenization=params['TOKENIZATION_METHOD'], build_vocabulary=True, fill=params['FILL'],
                   max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'],
                   repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path+'/'+params['QST_FILES']['val'][0], 'val',
                   type='text', id=params['INPUTS_IDS_DATASET'][0],
                   tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                   max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'],
                   repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path+'/'+params['QST_FILES']['test'][0], 'test',
                   type='text', id=params['INPUTS_IDS_DATASET'][0],
                   tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                   max_text_len=params['MAX_INPUT_TEXT_LEN'], max_words=params['INPUT_VOCABULARY_SIZE'],
                   repeat_set=params['REPEAT_QST'])
        ### QUESTIONS' associated IDs
        ds.setInput(base_path+'/'+params['QST_FILES']['train'][1], 'train',
                   type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids',
                   repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path+'/'+params['QST_FILES']['val'][1], 'val',
                   type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids',
                   repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path+'/'+params['QST_FILES']['test'][1], 'test',
                   type='id', id=params['INPUTS_IDS_DATASET'][0]+'_ids',
                   repeat_set=params['REPEAT_QST'])
        
        ### IMAGES
        ds.setInput(base_path+'/'+params['IMG_FILES']['train'][0], 'train',
                   type='image-features', id=params['INPUTS_IDS_DATASET'][1],
                   feat_len=params['IMG_FEAT_SIZE'],
                   repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path+'/'+params['IMG_FILES']['val'][0], 'val',
                   type='image-features', id=params['INPUTS_IDS_DATASET'][1],
                   feat_len=params['IMG_FEAT_SIZE'],
                   repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path+'/'+params['IMG_FILES']['test'][0], 'test',
                   type='image-features', id=params['INPUTS_IDS_DATASET'][1],
                   feat_len=params['IMG_FEAT_SIZE'],
                   repeat_set=params['REPEAT_IMG'])
        ### IMAGES' associated IDs
        ds.setInput(base_path+'/'+params['IMG_FILES']['train'][1], 'train',
                   type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids',
                   repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path+'/'+params['IMG_FILES']['val'][1], 'val',
                   type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids',
                   repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path+'/'+params['IMG_FILES']['test'][1], 'test',
                   type='id', id=params['INPUTS_IDS_DATASET'][1]+'_ids',
                   repeat_set=params['REPEAT_IMG'])
        

        ##### OUTPUT DATA
        ### ANSWERS
        ds.setOutput(base_path+'/'+params['ANS_FILES']['train'][0], 'train',
                   type='text', id=params['OUTPUTS_IDS_DATASET'][0],
                   tokenization=params['TOKENIZATION_METHOD'], build_vocabulary=True, fill=params['FILL'],
                   max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'])
        ds.setOutput(base_path+'/'+params['ANS_FILES']['val'][0], 'val',
                   type='text', id=params['OUTPUTS_IDS_DATASET'][0],
                   tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                   max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'])
        if 'test' in params['ANS_FILES']:
            ds.setOutput(base_path+'/'+params['ANS_FILES']['test'][0], 'test',
                       type='text', id=params['OUTPUTS_IDS_DATASET'][0],
                       tokenization=params['TOKENIZATION_METHOD'], fill=params['FILL'],
                       max_text_len=params['MAX_OUTPUT_TEXT_LEN'], max_words=params['OUTPUT_VOCABULARY_SIZE'])

        
        # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
        ds.extra_variables['train'] = dict()
        ds.extra_variables['val'] = dict()
        ds.extra_variables['test'] = dict()
        
        ds.extra_variables['train']['quesFile'] = base_path+'/'+params['QST_FILES']['train'][2]
        ds.extra_variables['val']['quesFile'] = base_path+'/'+params['QST_FILES']['val'][2]
        ds.extra_variables['test']['quesFile'] = base_path+'/'+params['QST_FILES']['test'][2]
        
        ds.extra_variables['train']['annFile'] = base_path+'/'+params['ANS_FILES']['train'][1]
        ds.extra_variables['val']['annFile'] = base_path+'/'+params['ANS_FILES']['val'][1]
        if 'test' in params['ANS_FILES']:
            ds.extra_variables['test']['annFile'] = base_path+'/'+params['ANS_FILES']['test'][1]
        
        
        # Remove all samples of the train set not belonging to the top classes chosen
        if params['KEEP_TOP_ANSWERS']:
            ds.keepTopOutputs('train', params['OUTPUTS_IDS_DATASET'][0], params['OUTPUT_VOCABULARY_SIZE'])
        # Filter top K answers per question-image pair
        if params['FILTER_ANSWERS']:
            filter_k_frequent_answers(ds, params)
        
        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATA_ROOT_PATH'])
    
    
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATA_ROOT_PATH']+'/Dataset_'+params['DATASET_NAME']+'.pkl')

    return ds
Example #17
def classifyFood101():
    from keras_wrapper.cnn_model import CNN_Model, loadModel, saveModel
    logging.info('Defining CNN model and training it.')

    # Load food classification dataset
    dataset_name = 'Food101'
    ds = loadDataset('Datasets/Dataset_' + dataset_name + '.pkl')
    # The network we are going to use needs an image input size of [224,224,3]
    # for this reason we have to communicate this to the dataset instance in charge of loading the data
    ds.img_size_crop['images'] = [224, 224, 3]

    # Create VGG model and load weights
    model_name = 'VGG_16_FunctionalAPI'
    net = CNN_Model(
        type='VGG_16_FunctionalAPI',
        model_name=model_name,
        input_shape=[224, 224, 3],
        weights_path='/media/HDD_2TB/CNN_MODELS/VGG/vgg16_weights.h5',
        seq_to_functional=True
    )  # we are setting the weights of a Sequential model into a FunctionalAPI one

    # Reformat net output layer for the number of classes in our dataset
    n_classes = len(ds.classes['labels'])
    vis_input = net.model.get_layer('vis_input').output  # input layer
    drop = net.model.get_layer('last_dropout').output  # layer before final FC
    output = Dense(n_classes, activation='softmax',
                   name='output')(drop)  # redefine FC-softmax layer
    net.model = Model(input=vis_input,
                      output=output)  # define inputs and outputs

    # Compile
    net.setOptimizer(lr=0.001, metrics=['accuracy'])

    # Define the inputs and outputs mapping from our Dataset instance to our CNN_Model instance
    # set input and output mappings from dataset to network
    pos_images = ds.types_inputs.index('image')
    pos_labels = ds.types_outputs.index('categorical')

    # the first input of our dataset (pos_images) will also be the first input of our model (named vis_input)
    inputMapping = {'vis_input': pos_images}
    net.setInputsMapping(inputMapping)

    # the first output of our dataset (pos_labels) will also be the first output of our model (named output)
    outputMapping = {'output': pos_labels}
    net.setOutputsMapping(outputMapping, acc_output='output')

    # Save model
    saveModel(net, 0)

    # Load model
    net = loadModel('Models/' + model_name, 0)
    # the model must be compiled again when loaded
    net.setOptimizer(lr=0.001, metrics=['accuracy'])

    # Apply short training (1 epoch)
    # training_params = {'n_epochs': 1, 'batch_size': 50,
    #                    'lr_decay': 2, 'lr_gamma': 0.8,
    #                    'epochs_for_save': 1, 'verbose': 1, 'eval_on_sets': ['val']}
    # net.trainNet(ds, training_params)

    # Test network on test set
    test_params = {'batch_size': 50}
    # net.testNet(ds, test_params)

    # Predict network on all sets
    test_params['predict_on_sets'] = ['val']
    predictions = net.predictNet(ds, test_params)
    logging.info("Predicted %d samples." % (len(predictions)))
    logging.info("Done")
Example #18
def score_corpus(args, params):
    """
    Use one or several translation models for scoring source--target pairs.

    :param argparse.Namespace args: Arguments given to the method:

                                * dataset: Dataset instance with data.
                                * source: Text file with source sentences.
                                * target: Text file with target sentences.
                                * splits: Splits to sample. Should be already included in the dataset object.
                                * dest: Output file to save scores.
                                * weights: Weight given to each model in the ensemble. You should provide the same number of weights as models. By default, it applies the same weight to each model (1/N).
                                * verbose: Be verbose or not.
                                * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py.
                                * models: Path to the models.
    :param dict params: parameters of the translation model.
    """

    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.model_ensemble import BeamSearchEnsemble

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset,
                                       args.source,
                                       params,
                                       splits=args.splits,
                                       output_text_filename=args.target,
                                       compute_state_below=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    # Apply scoring
    extra_vars = dict()
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])

    model_weights = args.weights
    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(
            models
        ), 'You should give a weight to each model. You gave %d models and %d weights.' % (
            len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' %
                        str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction = {
            'max_batch_size': params['BATCH_SIZE'],
            'n_parallel_loaders': params['PARALLEL_LOADERS'],
            'predict_on_sets': [s]
        }

        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params[
                'OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get(
                'NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get(
                'COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get(
                'LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get(
                'LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get(
                'COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get(
                'MAXLEN_GIVEN_X', True)
            params_prediction[
                'output_max_length_depending_on_x_factor'] = params.get(
                    'MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get(
                'MINLEN_GIVEN_X', True)
            params_prediction[
                'output_min_length_depending_on_x_factor'] = params.get(
                    'MINLEN_GIVEN_X_FACTOR', 2)
            params_prediction['attend_on_output'] = params.get(
                'ATTEND_ON_OUTPUT', 'transformer'
                in params['MODEL_TYPE'].lower())
            beam_searcher = BeamSearchEnsemble(models,
                                               dataset,
                                               params_prediction,
                                               model_weights=model_weights,
                                               verbose=args.verbose)
            scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' +
                                params['SAMPLING_SAVE_MODE'] +
                                ' is not currently supported.')
        else:
            print(scores)
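
# A hypothetical invocation sketch for score_corpus (not part of the original
# example). The attribute names follow the docstring above; model, dataset and
# text-file paths are placeholders.
if __name__ == '__main__':
    from argparse import Namespace
    from config import load_parameters  # assumed available, as in Example #21

    example_args = Namespace(models=['trained_models/epoch_10'],
                             dataset='datasets/Dataset_mydata_srctrg.pkl',
                             source='data/test.src', target='data/test.trg',
                             splits=['test'], dest='test_scores.txt',
                             weights=[], verbose=0, config=None)
    score_corpus(example_args, load_parameters())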
Example #19
def classifyFood101():
    from keras_wrapper.cnn_model import CNN_Model, loadModel, saveModel
    logging.info('Defining CNN model and training it.')

    # Load food classification dataset
    dataset_name = 'Food101'
    ds = loadDataset('Datasets/Dataset_' + dataset_name + '.pkl')
    # The network we are going to use needs an image input size of [224,224,3]
    # for this reason we have to communicate this to the dataset instance in charge of loading the data
    ds.img_size_crop['images'] = [224, 224, 3]

    # Create VGG model and load weights
    model_name = 'VGG_16_FunctionalAPI'
    net = CNN_Model(type='VGG_16_FunctionalAPI', model_name=model_name, input_shape=[224, 224, 3],
                    weights_path='/media/HDD_2TB/CNN_MODELS/VGG/vgg16_weights.h5',
                    seq_to_functional=True)  # we are setting the weights of a Sequential model into a FunctionalAPI one

    # Reformat net output layer for the number of classes in our dataset
    n_classes = len(ds.classes['labels'])
    vis_input = net.model.get_layer('vis_input').output  # input layer
    drop = net.model.get_layer('last_dropout').output  # layer before final FC
    output = Dense(n_classes, activation='softmax', name='output')(drop)  # redefine FC-softmax layer
    net.model = Model(input=vis_input, output=output)  # define inputs and outputs

    # Compile
    net.setOptimizer(lr=0.001, metrics=['accuracy'])

    # Define the inputs and outputs mapping from our Dataset instance to our CNN_Model instance
    # set input and output mappings from dataset to network
    pos_images = ds.types_inputs.index('image')
    pos_labels = ds.types_outputs.index('categorical')

    # the first input of our dataset (pos_images) will also be the first input of our model (named vis_input)
    inputMapping = {'vis_input': pos_images}
    net.setInputsMapping(inputMapping)

    # the first output of our dataset (pos_labels) will also be the first output of our model (named output)
    outputMapping = {'output': pos_labels}
    net.setOutputsMapping(outputMapping, acc_output='output')

    # Save model
    saveModel(net, 0)

    # Load model
    net = loadModel('Models/' + model_name, 0)
    # the model must be compiled again when loaded
    net.setOptimizer(lr=0.001, metrics=['accuracy'])

    # Apply short training (1 epoch)
    # training_params = {'n_epochs': 1, 'batch_size': 50,
    #                    'lr_decay': 2, 'lr_gamma': 0.8,
    #                    'epochs_for_save': 1, 'verbose': 1, 'eval_on_sets': ['val']}
    # net.trainNet(ds, training_params)

    # Test network on test set
    test_params = {'batch_size': 50}
    # net.testNet(ds, test_params)

    # Predict network on all sets
    test_params['predict_on_sets'] = ['val']
    predictions = net.predictNet(ds, test_params)
    logging.info("Predicted %d samples." % (len(predictions)))
    logging.info("Done")
Example #20
def build_dataset(params):
    if params['REBUILD_DATASET']:

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=False)

        # INPUT DATA
        ds.setInput(base_path + '/' + params['DISHES_FILES']['train'],
                    'train',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][1],
                    build_vocabulary=True,
                    tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'],
                    pad_on_batch=True,
                    max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                    min_occ=params['MIN_OCCURRENCES_VOCAB'])

        ds.setInput(base_path + '/' + params['DISHES_FILES']['val'],
                    'val',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][1],
                    build_vocabulary=True,
                    pad_on_batch=True,
                    tokenization=params['TOKENIZATION_METHOD'],
                    max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                    min_occ=params['MIN_OCCURRENCES_VOCAB'])

        ds.setInput(base_path + '/' + params['DISHES_FILES']['test'],
                    'test',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][1],
                    build_vocabulary=True,
                    pad_on_batch=True,
                    tokenization=params['TOKENIZATION_METHOD'],
                    max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                    min_occ=params['MIN_OCCURRENCES_VOCAB'])

        # INPUT DATA
        ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['train'],
                    'train',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][0],
                    feat_len=params['IMG_FEAT_SIZE'])

        ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['val'],
                    'val',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][0],
                    feat_len=params['IMG_FEAT_SIZE'])

        ds.setInput(base_path + '/' + params['IMAGES_LIST_FILES']['test'],
                    'test',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][0],
                    feat_len=params['IMG_FEAT_SIZE'])

        # INPUT DATA
        ds.setInput(base_path + '/' + params['CNN_FILES']['train'],
                    'train',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][2],
                    feat_len=params['CNN_SIZE'])

        ds.setInput(base_path + '/' + params['CNN_FILES']['val'],
                    'val',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][2],
                    feat_len=params['CNN_SIZE'])

        ds.setInput(base_path + '/' + params['CNN_FILES']['test'],
                    'test',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][2],
                    feat_len=params['CNN_SIZE'])

        # OUTPUT DATA
        if "sample_weight" not in params or params['sample_weight']:
            ds.setOutput(base_path + '/' + params['OUT_FILES']['train'],
                         'train',
                         type='real',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         sample_weights=np.load(Path.DATA_FOLDER +
                                                "/data/weights.npy"))
        else:
            ds.setOutput(base_path + '/' + params['OUT_FILES']['train'],
                         'train',
                         type='real',
                         id=params['OUTPUTS_IDS_DATASET'][0])

        ds.setOutput(base_path + '/' + params['OUT_FILES']['val'],
                     'val',
                     type='real',
                     id=params['OUTPUTS_IDS_DATASET'][0])

        ds.setOutput(base_path + '/' + params['OUT_FILES']['test'],
                     'test',
                     type='real',
                     id=params['OUTPUTS_IDS_DATASET'][0])

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '.pkl')
    return ds
Example #21
import sys

sys.path.append('../nmt-keras')
sys.path.append('../nmt-keras/nmt_keras')

import utils
from config import load_parameters
from data_engine.prepare_data import keep_n_captions
from keras_wrapper.cnn_model import loadModel
from keras_wrapper.dataset import loadDataset
from keras_wrapper.utils import decode_predictions_beam_search
from model_zoo import TranslationModel

params = load_parameters()
dataset = loadDataset('query_to_reply/Dataset_Cornell_base.pkl')

dataset.setInput('data/Ross_test.query',
                 'test',
                 type='text',
                 id='source_text',
                 pad_on_batch=True,
                 tokenization='tokenize_basic',
                 fill='end',
                 max_text_len=100,
                 min_occ=0)

dataset.setInput(None, 'test', type='ghost', id='state_below', required=False)

## get model predictions
params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
    params['INPUTS_IDS_DATASET'][0]]
Example #22
def train_model(params, load_dataset=None):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    check_params(params)

    if params['RELOAD'] > 0:
        logging.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logging.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logging.info('Updating dataset.')
                dataset = loadDataset(params['DATASET_STORE_PATH'] +
                                      '/Dataset_' + params['DATASET_NAME'] +
                                      '_' + params['SRC_LAN'] +
                                      params['TRG_LAN'] + '.pkl')
                params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                    int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)
                for split, filename in params['TEXT_FILES'].items():
                    dataset = update_dataset_from_file(
                        dataset,
                        params['DATA_ROOT_PATH'] + '/' + filename +
                        params['SRC_LAN'],
                        params,
                        splits=list([split]),
                        output_text_filename=params['DATA_ROOT_PATH'] + '/' +
                        filename + params['TRG_LAN'],
                        remove_outputs=False,
                        compute_state_below=True,
                        recompute_references=True)
                    dataset.name = params['DATASET_NAME'] + '_' + params[
                        'SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])

        else:
            logging.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    set_optimizer = True if params['RELOAD'] == 0 else False
    clear_dirs = True if params['RELOAD'] == 0 else False

    # build new model
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=set_optimizer,
                                 clear_dirs=clear_dirs)

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    if params['RELOAD'] > 0:
        nmt_model = updateModel(nmt_model,
                                params['STORE_PATH'],
                                params['RELOAD'],
                                reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        if params.get('EPOCH_OFFSET') is None:
            params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        'lr_decay': params.get('LR_DECAY', None),  # LR decay parameters
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),  # early stopping parameters
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        'tensorboard': params.get('TENSORBOARD', False),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        }
    }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('Total training time: {0:.2f}s ({1:.2f}m).'.format(
        time_difference, time_difference / 60.0))
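
A minimal way to drive this training entry point, assuming the function shown above keeps the train_model(params, load_dataset=None) signature and that load_parameters() comes from config.py as in the other examples; the stored-dataset path below is only an illustration of the naming convention used above:

if __name__ == '__main__':
    from config import load_parameters
    params = load_parameters()
    # Build (or rebuild) the dataset from the parameters and train from scratch.
    train_model(params)
    # Alternatively, reuse a previously stored Dataset pickle (assumed path):
    # train_model(params,
    #             load_dataset=params['DATASET_STORE_PATH'] + '/Dataset_' +
    #                          params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
    #                          params['TRG_LAN'] + '.pkl')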
Example #23
from keras_wrapper.utils import decode_predictions_beam_search
from config import load_parameters
from keras_wrapper.extra.read_write import list2file
import os
import re

"""## 3. Decoding with a trained Neural Machine Translation Model

Now, we'll load from disk the model we just trained and we'll apply it for translating new text. In this case, we want to translate the 'test' split from our dataset.

Since we want to translate a new data split ('test') we must add it to the dataset instance, just as we did before (at the first tutorial). In case we also had the refences of the test split and we wanted to evaluate it, we can add it to the dataset. Note that this is not mandatory and we could just predict without evaluating.
"""
DATA_PATH = os.path.join(os.getcwd(), 'data/PersonaChat/')
MODEL_PATH = os.path.join(os.getcwd(), 'models/persona_chat_context_lstm_13_de_layers')

dataset = loadDataset(os.path.join(MODEL_PATH, "dataset/Dataset_tutorial_dataset.pkl"))
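
# As noted above, the new 'test' split has to be attached to this dataset
# instance before decoding. A minimal sketch, assuming hypothetical source and
# target file names and reusing the setInput/setOutput signatures shown in the
# build_dataset examples of this document:
dataset.setInput(os.path.join(DATA_PATH, 'test_source.txt'), 'test',
                 type='text', id=dataset.ids_inputs[0],
                 tokenization='tokenize_basic', pad_on_batch=True)
# References are optional; add them only if the test split should be evaluated.
dataset.setOutput(os.path.join(DATA_PATH, 'test_target.txt'), 'test',
                  type='text', id=dataset.ids_outputs[0],
                  tokenization='tokenize_basic', sample_weights=True)
# If the model feeds the previous target word back as an input ('state_below'),
# a ghost input can be registered for it, as in the video-captioning example below.
dataset.setInput(None, 'test', type='ghost', id=dataset.ids_inputs[-1], required=False)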

epoch_choice = 17
# Load model
nmt_model = loadModel(MODEL_PATH, epoch_choice)

params = load_parameters()

params_prediction = {
    'language': 'en',
    'tokenize_f': dataset.tokenize_basic,
    'beam_size': 6,
    'optimized_search': True,
    'model_inputs': params['INPUTS_IDS_MODEL'],
    'model_outputs': params['OUTPUTS_IDS_MODEL'],
    'dataset_inputs':  params['INPUTS_IDS_DATASET'],
Example #24
def sample_ensemble(args, params):

    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search
    # Used below for storing the predictions and the n-best lists
    from keras_wrapper.extra.read_write import list2file, nbest2file, list2stdout

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset,
                                       args.text,
                                       params,
                                       splits=args.splits,
                                       remove_outputs=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET']
                                      [0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' +
                                   params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING',
                                                      False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY',
                                                       False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get(
        'LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get(
        'COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get(
        'MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get(
        'MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get(
        'MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get(
        'MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get(
        'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower())

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(
            models
        ), 'You should give a weight to each model. You gave %d models and %d weights.' % (
            len(models), len(model_weights))
        model_weights = [float(w) for w in model_weights]
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' %
                        str(model_weights))
    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models,
                                           dataset,
                                           params_prediction,
                                           model_weights=model_weights,
                                           n_best=args.n_best,
                                           verbose=args.verbose)
        if args.n_best:
            predictions, n_best = beam_searcher.predictBeamSearchNet()[s]
        else:
            predictions = beam_searcher.predictBeamSearchNet()[s]
            n_best = None
        if params_prediction['pos_unk']:
            samples = predictions[0]
            alphas = predictions[1]
            sources = [
                x.strip() for x in open(args.text, 'r').read().split('\n')
            ]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            samples = predictions
            alphas = None
            heuristic = None
            sources = None

        predictions = decode_predictions_beam_search(samples,
                                                     index2word_y,
                                                     alphas=alphas,
                                                     x_text=sources,
                                                     heuristic=heuristic,
                                                     mapping=mapping,
                                                     verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            predictions = map(detokenize_function, predictions)

        if args.n_best:
            n_best_predictions = []
            for i, (n_best_preds, n_best_scores,
                    n_best_alphas) in enumerate(n_best):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(
                        n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search(
                        [n_best_pred],
                        index2word_y,
                        alphas=[n_best_alpha]
                        if params_prediction['pos_unk'] else None,
                        x_text=[sources[i]]
                        if params_prediction['pos_unk'] else None,
                        heuristic=heuristic,
                        mapping=mapping,
                        verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = map(detokenize_function, pred)

                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)
        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':
                list2file(filepath, predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception(
                    'Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(predictions)
            if args.n_best:
                logging.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)
        logging.info('Sampling finished')
Example #25
def main():
    args = parse_args()
    server_address = ('', args.port)
    httpd = BaseHTTPServer.HTTPServer(server_address, NMTHandler)

    if args.config is None:
        logging.info("Reading parameters from config.py")
        from config import load_parameters
        params = load_parameters()
    else:
        logging.info("Loading parameters from %s" % str(args.config))
        params = pkl2dict(args.config)
    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print 'Overwritten arguments must have the form key=Value.\nGot: %s' % str(args.changes)
                exit(1)
            try:
                params[k] = ast.literal_eval(v)
            except ValueError:
                params[k] = v
    except ValueError:
        print 'Error processing arguments: (', k, ",", v, ")"
        exit(2)
    dataset = loadDataset(args.dataset)

    # For converting predictions into sentences
    # Dataset backwards compatibility
    bpe_separator = dataset.BPE_separator if hasattr(
        dataset,
        "BPE_separator") and dataset.BPE_separator is not None else '@@'
    # Build BPE tokenizer if necessary
    if 'bpe' in params['TOKENIZATION_METHOD'].lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(
                params.get('BPE_CODES_PATH',
                           params['DATA_ROOT_PATH'] + '/training_codes.joint'),
                bpe_separator)
    # Build tokenization function
    tokenize_f = eval('dataset.' +
                      params.get('TOKENIZATION_METHOD', 'tokenize_none'))

    detokenize_function = eval(
        'dataset.' + params.get('DETOKENIZATION_METHOD', 'detokenize_none'))
    dataset.build_moses_tokenizer(language=params['SRC_LAN'])
    dataset.build_moses_detokenizer(language=params['TRG_LAN'])
    tokenize_general = dataset.tokenize_moses
    detokenize_general = dataset.detokenize_moses

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING',
                                                      False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY',
                                                       False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get(
        'LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get(
        'COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['heuristic'] = params.get('HEURISTIC', 0)

    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get(
        'MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get(
        'MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get(
        'MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get(
        'MINLEN_GIVEN_X_FACTOR', 2)
    # Manage pos_unk strategies
    if params['POS_UNK']:
        mapping = None if dataset.mapping == dict() else dataset.mapping
    else:
        mapping = None

    if args.online:
        logging.info('Loading models from %s' % str(args.models))

        model_instances = [
            TranslationModel(params,
                             model_type=params['MODEL_TYPE'],
                             verbose=params['VERBOSE'],
                             model_name=params['MODEL_NAME'] + '_' + str(i),
                             vocabularies=dataset.vocabulary,
                             store_path=params['STORE_PATH'],
                             set_optimizer=False)
            for i in range(len(args.models))
        ]
        models = [
            updateModel(model, path, -1, full_path=True)
            for (model, path) in zip(model_instances, args.models)
        ]

        # Set additional inputs to models if using a custom loss function
        params['USE_CUSTOM_LOSS'] = 'PAS' in params['OPTIMIZER']
        if params['N_BEST_OPTIMIZER']:
            logging.info('Using N-best optimizer')

        models = build_online_models(models, params)
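        # NOTE: 'params_training' is not defined in this excerpt. A minimal,
        # assumed configuration (mirroring the 'parameters_training' dict built
        # in the later server example) could look like this:
        params_training = {
            'n_epochs': params['MAX_EPOCH'],
            'shuffle': False,
            'loss': params.get('LOSS', 'categorical_crossentropy'),
            'batch_size': params.get('BATCH_SIZE', 1),
            'optimizer': params.get('OPTIMIZER', 'SGD'),
            'lr': params.get('LR', 0.1),
            'verbose': args.verbose,
            'n_parallel_loaders': params['PARALLEL_LOADERS'],
            'extra_callbacks': [],
            'reload_epoch': params['RELOAD'],
            'epoch_offset': params['RELOAD'],
            'data_augmentation': params['DATA_AUGMENTATION'],
        }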
        online_trainer = OnlineTrainer(models,
                                       dataset,
                                       None,
                                       None,
                                       params_training,
                                       verbose=args.verbose)
    else:
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]

    # Get word2index and index2word dictionaries
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET']
                                      [0]]['idx2words']
    word2index_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET']
                                      [0]]['words2idx']
    index2word_x = dataset.vocabulary[params['INPUTS_IDS_DATASET']
                                      [0]]['idx2words']
    word2index_x = dataset.vocabulary[params['INPUTS_IDS_DATASET']
                                      [0]]['words2idx']

    excluded_words = None
    interactive_beam_searcher = NMTSampler(models,
                                           dataset,
                                           params_prediction,
                                           tokenize_f,
                                           detokenize_function,
                                           tokenize_general,
                                           detokenize_general,
                                           mapping=mapping,
                                           word2index_x=word2index_x,
                                           word2index_y=word2index_y,
                                           index2word_y=index2word_y,
                                           excluded_words=excluded_words,
                                           verbose=args.verbose)

    # Compile Theano sampling function by generating a fake sample # TODO: Find a better way of doing this
    print "Compiling sampler..."
    interactive_beam_searcher.generate_sample('i')

    httpd.sampler = interactive_beam_searcher

    print 'Server starting at localhost:' + str(args.port)
    httpd.serve_forever()
Example #26
def build_dataset(params):

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if (params['VERBOSE'] > 0):
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        ##### OUTPUT DATA
        # Let's load the train, val and test splits of the descriptions (outputs)
        #    the files include a description per line. In this dataset, a variable number
        #    of descriptions per video is provided.
        ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                     'train',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     build_vocabulary=True,
                     tokenization=params['TOKENIZATION_METHOD'],
                     fill=params['FILL'],
                     pad_on_batch=True,
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                     sample_weights=params['SAMPLE_WEIGHTS'],
                     min_occ=params['MIN_OCCURRENCES_VOCAB'])

        ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['val'],
                     'val',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     build_vocabulary=True,
                     pad_on_batch=True,
                     tokenization=params['TOKENIZATION_METHOD'],
                     sample_weights=params['SAMPLE_WEIGHTS'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                     min_occ=params['MIN_OCCURRENCES_VOCAB'])

        ds.setOutput(base_path + '/' + params['DESCRIPTION_FILES']['test'],
                     'test',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     build_vocabulary=True,
                     pad_on_batch=True,
                     tokenization=params['TOKENIZATION_METHOD'],
                     sample_weights=params['SAMPLE_WEIGHTS'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN_TEST'],
                     min_occ=params['MIN_OCCURRENCES_VOCAB'])

        ##### INPUT DATA
        # Let's load the associated videos (inputs)
        #    we must take into account that in this dataset we have a different number of sentences per video;
        #    for this reason, we introduce the parameter 'repeat_set'=num_captions, where num_captions is a list
        #    containing the number of captions in each video.

        num_captions_train = np.load(
            base_path + '/' + params['DESCRIPTION_COUNTS_FILES']['train'])
        num_captions_val = np.load(base_path + '/' +
                                   params['DESCRIPTION_COUNTS_FILES']['val'])
        num_captions_test = np.load(base_path + '/' +
                                    params['DESCRIPTION_COUNTS_FILES']['test'])

        for feat_type in params['FEATURE_NAMES']:
            for split, num_cap in zip(
                ['train', 'val', 'test'],
                [num_captions_train, num_captions_val, num_captions_test]):
                list_files = base_path + '/' + params['FRAMES_LIST_FILES'][
                    split] % feat_type
                counts_files = base_path + '/' + params['FRAMES_COUNTS_FILES'][
                    split] % feat_type

                ds.setInput([list_files, counts_files],
                            split,
                            type=params['INPUT_DATA_TYPE'],
                            id=params['INPUTS_IDS_DATASET'][0],
                            repeat_set=num_cap,
                            max_video_len=params['NUM_FRAMES'],
                            feat_len=params['IMG_FEAT_SIZE'])

        if len(params['INPUTS_IDS_DATASET']) > 1:
            ds.setInput(base_path + '/' + params['DESCRIPTION_FILES']['train'],
                        'train',
                        type='text',
                        id=params['INPUTS_IDS_DATASET'][-1],
                        required=False,
                        tokenization=params['TOKENIZATION_METHOD'],
                        pad_on_batch=True,
                        build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                        offset=1,
                        fill=params['FILL'],
                        max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                        max_words=params['OUTPUT_VOCABULARY_SIZE'],
                        min_occ=params['MIN_OCCURRENCES_VOCAB'])

            ds.setInput(None,
                        'val',
                        type='ghost',
                        id=params['INPUTS_IDS_DATASET'][-1],
                        required=False)
            ds.setInput(None,
                        'test',
                        type='ghost',
                        id=params['INPUTS_IDS_DATASET'][-1],
                        required=False)

        # Process dataset for keeping only one caption per video and storing the rest in a dict() with the following format:
        #        ds.extra_variables[set_name][id_output][img_position] = [cap1, cap2, cap3, ..., capN]
        keep_n_captions(ds,
                        repeat=[num_captions_val, num_captions_test],
                        n=1,
                        set_names=['val', 'test'])

        # We have finished loading the dataset; now we can store it for future use.
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '.pkl')

    return ds
Example #27
def build_dataset(params):
    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if (params['VERBOSE'] > 0):
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True
        base_path = params['DATA_ROOT_PATH'] + '/'
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        ##### OUTPUT DATA
        # Let's load the train, val and test splits of the class labels (outputs)
        #    the files include one class label per line.
        print params['CLASS_FILES']
        for split in params['CLASS_FILES'].keys():
            ds.setOutput(params['CLASS_FILES'][split],
                         split,
                         type='categorical',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         sample_weights=params['SAMPLE_WEIGHTS'])

        # INPUT DATA
        for split in params['TEXT_FILES'].keys():
            if split == 'train':
                build_vocabulary = True
            else:
                build_vocabulary = False
            for i in range(len(params['INPUTS_IDS_DATASET'])):
                ds.setInput(params['TEXT_FILES'][split][i],
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][i],
                            pad_on_batch=params['PAD_ON_BATCH'],
                            tokenization=params['TOKENIZATION_METHOD'],
                            build_vocabulary=build_vocabulary,
                            fill=params['FILL'],
                            max_text_len=params['MAX_INPUT_TEXT_LEN'],
                            max_words=params['INPUT_VOCABULARY_SIZE'],
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])

        for i in range(len(params['INPUTS_IDS_DATASET'])):
            if 'semisupervised' in params['MODE']:
                ds.setInput(params['POOL_FILENAME'][i],
                            'test',
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][i],
                            pad_on_batch=params['PAD_ON_BATCH'],
                            tokenization=params['TOKENIZATION_METHOD'],
                            fill=params['FILL'],
                            max_text_len=params['MAX_INPUT_TEXT_LEN'],
                            max_words=params['INPUT_VOCABULARY_SIZE'],
                            min_occ=params['MIN_OCCURRENCES_VOCAB'])

        keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # We have finished loading the dataset; now we can store it for future use.
        saveDataset(ds, params['DATASET_STORE_PATH'])


    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] + '.pkl')

    return ds
Example #28
def build_dataset(params):

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if (params['VERBOSE'] > 0):
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)

        ##### INPUT DATA
        # Let's load the images (inputs)

        ### IMAGES
        list_train = base_path + '/' + params['IMG_FILES']['train'][0]
        ds.setInput(list_train,
                    'train',
                    type='raw-image',
                    id=params['INPUTS_IDS_DATASET'][0],
                    img_size=params['IMG_SIZE'],
                    img_size_crop=params['IMG_CROP_SIZE'],
                    use_RGB=params['RGB'])
        if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']:
            list_val = base_path + '/' + params['IMG_FILES']['val'][0]
            ds.setInput(list_val,
                        'val',
                        type='raw-image',
                        id=params['INPUTS_IDS_DATASET'][0],
                        img_size=params['IMG_SIZE'],
                        img_size_crop=params['IMG_CROP_SIZE'],
                        use_RGB=params['RGB'])
        if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']:
            list_test = base_path + '/' + params['IMG_FILES']['test'][0]
            ds.setInput(list_test,
                        'test',
                        type='raw-image',
                        id=params['INPUTS_IDS_DATASET'][0],
                        img_size=params['IMG_SIZE'],
                        img_size_crop=params['IMG_CROP_SIZE'],
                        use_RGB=params['RGB'])

        # Train mean
        if params['MEAN_IMAGE']:
            # if params['NORMALIZE']:
            #    params['MEAN_IMAGE'] = [m / 255. for m in params['MEAN_IMAGE']]
            ds.setTrainMean(params['MEAN_IMAGE'],
                            params['INPUTS_IDS_DATASET'][0])
        else:
            ds.calculateTrainMean(params['INPUTS_IDS_DATASET'][0])

        ##### OUTPUT DATA
        if params['TYPE_OUT'] == '3DLabel':
            # Set list of classes (strings)
            ds.setClasses(base_path + '/' + params['CLASSES_PATH'],
                          params['OUTPUTS_IDS_DATASET'][0])
        elif params['TYPE_OUT'] == '3DSemanticLabel':
            # Set list of classes (strings)
            classes_names = []
            with open(base_path + '/' + params['CLASSES_PATH'], 'r') as f:
                for line in f:
                    line = line.rstrip('\n').split(',')[0]
                    classes_names.append(line)
            ds.setClasses(classes_names, params['OUTPUTS_IDS_DATASET'][0])
            ds.setSemanticClasses(base_path + '/' + params['CLASSES_PATH'],
                                  params['OUTPUTS_IDS_DATASET'][0])

        ### 3DLabels or 3DSemanticLabels
        ds.setOutput(base_path + '/' + params['IMG_FILES']['train'][1],
                     'train',
                     type=params['TYPE_OUT'],
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     associated_id_in=params['INPUTS_IDS_DATASET'][0],
                     num_poolings=params['NUM_MODEL_POOLINGS'])
        if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']:
            ds.setOutput(base_path + '/' + params['IMG_FILES']['val'][1],
                         'val',
                         type=params['TYPE_OUT'],
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         associated_id_in=params['INPUTS_IDS_DATASET'][0],
                         num_poolings=params['NUM_MODEL_POOLINGS'])
        if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']:
            ds.setOutput(base_path + '/' + params['IMG_FILES']['test'][1],
                         'test',
                         type=params['TYPE_OUT'],
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         associated_id_in=params['INPUTS_IDS_DATASET'][0],
                         num_poolings=params['NUM_MODEL_POOLINGS'])

        if params['DISCARD_CLASSES']:
            weights = np.ones((params['NUM_CLASSES'], ))
            for c in params['DISCARD_CLASSES']:
                weights[c] = 0.0
            ds.extra_variables['class_weights_' +
                               params['OUTPUTS_IDS_DATASET'][0]] = weights

        if params['WEIGHT_CLASSES']:
            weights = params['WEIGHT_CLASSES']
            ds.extra_variables['class_weights_' +
                               params['OUTPUTS_IDS_DATASET'][0]] = weights

        ### Single multi-label
        if params['APPLY_MULTILABEL_CLASSIFICATION']:
            n_classes = len(ds.classes[params['OUTPUTS_IDS_DATASET'][0]])
            multilabel = convert3DLabels2multilabel(
                base_path + '/' + params['IMG_FILES']['train'][1], n_classes)
            ds.setOutput(multilabel,
                         'train',
                         type='binary',
                         id=params['OUTPUTS_IDS_DATASET'][1])
            if 'val' in params['IMG_FILES'] and params['IMG_FILES']['val']:
                multilabel = convert3DLabels2multilabel(
                    base_path + '/' + params['IMG_FILES']['val'][1], n_classes)
                ds.setOutput(multilabel,
                             'val',
                             type='binary',
                             id=params['OUTPUTS_IDS_DATASET'][1])
            if 'test' in params['IMG_FILES'] and params['IMG_FILES']['test']:
                multilabel = convert3DLabels2multilabel(
                    base_path + '/' + params['IMG_FILES']['test'][1],
                    n_classes)
                ds.setOutput(multilabel,
                             'test',
                             type='binary',
                             id=params['OUTPUTS_IDS_DATASET'][1])

        # We have finished loading the dataset; now we can store it for future use.
        saveDataset(ds, params['DATASET_STORE_PATH'])
    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '.pkl')

    return ds
Example #29
def apply_NMT_model(params, load_dataset=None):
    """
    Sample from a previously trained model.

    :param params: Dictionary of network hyperparameters.
    :param load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """

    # Load data
    if load_dataset is None:
        dataset = build_dataset(params)
    else:
        dataset = loadDataset(load_dataset)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]

    # Load model
    nmt_model = loadModel(params['STORE_PATH'],
                          params['RELOAD'],
                          reload_epoch=params['RELOAD_EPOCH'])

    # Evaluate training
    extra_vars = {
        'language': params.get('TRG_LAN', 'en'),
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'tokenize_f': eval('dataset.' + params['TOKENIZATION_METHOD']),
        'detokenize_f': eval('dataset.' + params['DETOKENIZATION_METHOD']),
        'apply_detokenization': params['APPLY_DETOKENIZATION'],
        'tokenize_hypotheses': params['TOKENIZE_HYPOTHESES'],
        'tokenize_references': params['TOKENIZE_REFERENCES'],
    }

    input_text_id = params['INPUTS_IDS_DATASET'][0]
    vocab_x = dataset.vocabulary[input_text_id]['idx2words']
    vocab_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    if params['BEAM_SEARCH']:
        extra_vars['beam_size'] = params.get('BEAM_SIZE', 6)
        extra_vars['state_below_index'] = params.get('BEAM_SEARCH_COND_INPUT',
                                                     -1)
        extra_vars['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 30)
        extra_vars['optimized_search'] = params.get('OPTIMIZED_SEARCH', True)
        extra_vars['model_inputs'] = params['INPUTS_IDS_MODEL']
        extra_vars['model_outputs'] = params['OUTPUTS_IDS_MODEL']
        extra_vars['dataset_inputs'] = params['INPUTS_IDS_DATASET']
        extra_vars['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
        extra_vars['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
        extra_vars['search_pruning'] = params.get('SEARCH_PRUNING', False)
        extra_vars['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
        extra_vars['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
        extra_vars['length_penalty'] = params.get('LENGTH_PENALTY', False)
        extra_vars['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR',
                                                      0.0)
        extra_vars['coverage_norm_factor'] = params.get(
            'COVERAGE_NORM_FACTOR', 0.0)
        extra_vars['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
            else params.get('MAX_OUTPUT_TEXT_LEN', 50)
        extra_vars['pos_unk'] = params['POS_UNK']
        extra_vars['output_max_length_depending_on_x'] = params.get(
            'MAXLEN_GIVEN_X', True)
        extra_vars['output_max_length_depending_on_x_factor'] = params.get(
            'MAXLEN_GIVEN_X_FACTOR', 3)
        extra_vars['output_min_length_depending_on_x'] = params.get(
            'MINLEN_GIVEN_X', True)
        extra_vars['output_min_length_depending_on_x_factor'] = params.get(
            'MINLEN_GIVEN_X_FACTOR', 2)
        extra_vars['attend_on_output'] = params.get(
            'ATTEND_ON_OUTPUT', 'transformer' in params['MODEL_TYPE'].lower())

        if params['POS_UNK']:
            extra_vars['heuristic'] = params['HEURISTIC']
            if params['HEURISTIC'] > 0:
                extra_vars['mapping'] = dataset.mapping

    for s in params["EVAL_ON_SETS"]:
        extra_vars[s] = dict()
        extra_vars[s]['references'] = dataset.extra_variables[s][
            params['OUTPUTS_IDS_DATASET'][0]]
        callback_metric = PrintPerformanceMetricOnEpochEndOrEachNUpdates(
            nmt_model,
            dataset,
            gt_id=params['OUTPUTS_IDS_DATASET'][0],
            metric_name=params['METRICS'],
            set_name=params['EVAL_ON_SETS'],
            batch_size=params['BATCH_SIZE'],
            each_n_epochs=params['EVAL_EACH'],
            extra_vars=extra_vars,
            reload_epoch=params['RELOAD'],
            is_text=True,
            input_text_id=input_text_id,
            save_path=nmt_model.model_path,
            index2word_y=vocab_y,
            index2word_x=vocab_x,
            sampling_type=params['SAMPLING'],
            beam_search=params['BEAM_SEARCH'],
            start_eval_on_epoch=params['START_EVAL_ON_EPOCH'],
            write_samples=True,
            write_type=params['SAMPLING_SAVE_MODE'],
            eval_on_epochs=params['EVAL_EACH_EPOCHS'],
            save_each_evaluation=False,
            verbose=params['VERBOSE'])

        callback_metric.evaluate(
            params['RELOAD'],
            counter_name='epoch' if params['EVAL_EACH_EPOCHS'] else 'update')
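
A short usage sketch for apply_NMT_model, assuming load_parameters() from config.py as in the other examples; the dataset path simply follows the naming convention used elsewhere in this document:

if __name__ == '__main__':
    from config import load_parameters
    params = load_parameters()
    # Evaluate the reloaded model on the splits listed in params['EVAL_ON_SETS'],
    # reading the Dataset instance from a previously stored pickle.
    apply_NMT_model(params,
                    load_dataset=params['DATASET_STORE_PATH'] + '/Dataset_' +
                                 params['DATASET_NAME'] + '.pkl')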
Example #30
def build_dataset(params):

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if (params['VERBOSE'] > 0):
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME']
        ds = Dataset(name, base_path, silence=silence)
        max_text_len = params['MAX_INPUT_TEXT_LEN']

        ##### INPUT DATA
        ### QUESTIONS
        ds.setInput(base_path + '/' + params['QST_FILES']['train'][0],
                    'train',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][0],
                    tokenization=params['TOKENIZATION_METHOD'],
                    build_vocabulary=True,
                    fill=params['FILL'],
                    max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'],
                    repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['val'][0],
                    'val',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][0],
                    tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'],
                    max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'],
                    repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['test'][0],
                    'test',
                    type='text',
                    id=params['INPUTS_IDS_DATASET'][0],
                    tokenization=params['TOKENIZATION_METHOD'],
                    fill=params['FILL'],
                    max_text_len=params['MAX_INPUT_TEXT_LEN'],
                    max_words=params['INPUT_VOCABULARY_SIZE'],
                    repeat_set=params['REPEAT_QST'])
        ### QUESTIONS' associated IDs
        ds.setInput(base_path + '/' + params['QST_FILES']['train'][1],
                    'train',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['val'][1],
                    'val',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=params['REPEAT_QST'])
        ds.setInput(base_path + '/' + params['QST_FILES']['test'][1],
                    'test',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][0] + '_ids',
                    repeat_set=params['REPEAT_QST'])

        ### IMAGES
        ds.setInput(base_path + '/' + params['IMG_FILES']['train'][0],
                    'train',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1],
                    feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['val'][0],
                    'val',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1],
                    feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['test'][0],
                    'test',
                    type='image-features',
                    id=params['INPUTS_IDS_DATASET'][1],
                    feat_len=params['IMG_FEAT_SIZE'],
                    repeat_set=params['REPEAT_IMG'])
        ### IMAGES' associated IDs
        ds.setInput(base_path + '/' + params['IMG_FILES']['train'][1],
                    'train',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids',
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['val'][1],
                    'val',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids',
                    repeat_set=params['REPEAT_IMG'])
        ds.setInput(base_path + '/' + params['IMG_FILES']['test'][1],
                    'test',
                    type='id',
                    id=params['INPUTS_IDS_DATASET'][1] + '_ids',
                    repeat_set=params['REPEAT_IMG'])

        ##### OUTPUT DATA
        ### ANSWERS
        ds.setOutput(base_path + '/' + params['ANS_FILES']['train'][0],
                     'train',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     tokenization=params['TOKENIZATION_METHOD'],
                     build_vocabulary=True,
                     fill=params['FILL'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                     max_words=params['OUTPUT_VOCABULARY_SIZE'])
        ds.setOutput(base_path + '/' + params['ANS_FILES']['val'][0],
                     'val',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     tokenization=params['TOKENIZATION_METHOD'],
                     fill=params['FILL'],
                     max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                     max_words=params['OUTPUT_VOCABULARY_SIZE'])
        if 'test' in params['ANS_FILES']:
            ds.setOutput(base_path + '/' + params['ANS_FILES']['test'][0],
                         'test',
                         type='text',
                         id=params['OUTPUTS_IDS_DATASET'][0],
                         tokenization=params['TOKENIZATION_METHOD'],
                         fill=params['FILL'],
                         max_text_len=params['MAX_OUTPUT_TEXT_LEN'],
                         max_words=params['OUTPUT_VOCABULARY_SIZE'])

        # Load extra variables (we need the original path to questions and annotations for VQA evaluation)
        ds.extra_variables['train'] = dict()
        ds.extra_variables['val'] = dict()
        ds.extra_variables['test'] = dict()

        ds.extra_variables['train'][
            'quesFile'] = base_path + '/' + params['QST_FILES']['train'][2]
        ds.extra_variables['val'][
            'quesFile'] = base_path + '/' + params['QST_FILES']['val'][2]
        ds.extra_variables['test'][
            'quesFile'] = base_path + '/' + params['QST_FILES']['test'][2]

        ds.extra_variables['train'][
            'annFile'] = base_path + '/' + params['ANS_FILES']['train'][1]
        ds.extra_variables['val'][
            'annFile'] = base_path + '/' + params['ANS_FILES']['val'][1]
        if 'test' in params['ANS_FILES']:
            ds.extra_variables['test'][
                'annFile'] = base_path + '/' + params['ANS_FILES']['test'][1]

        # Remove all samples of the train set not belonging to the top classes chosen
        if params['KEEP_TOP_ANSWERS']:
            ds.keepTopOutputs('train', params['OUTPUTS_IDS_DATASET'][0],
                              params['OUTPUT_VOCABULARY_SIZE'])
        # Filter top K answers per question-image pair
        if params['FILTER_ANSWERS']:
            filter_k_frequent_answers(ds, params)

        # We have finished loading the dataset; now we can store it for future use.
        saveDataset(ds, params['DATA_ROOT_PATH'])

    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATA_ROOT_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '.pkl')

    return ds
Example #31
    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print 'Overwritten arguments must have the form key=Value.\nGot: %s' % str(args.changes)
                exit(1)
            try:
                params[k] = ast.literal_eval(v)
            except ValueError:
                params[k] = v
    except ValueError:
        print 'Error processing arguments: (', k, ",", v, ")"
        exit(2)
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset,
                                       args.text,
                                       params,
                                       splits=args.splits,
                                       remove_outputs=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET']
                                      [0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
Example #32
def bpe_loading(args):
    logging.info("Using an ensemble of %d models" % len(args["models"]))
    models = [loadModel(m, -1, full_path=True) for m in args["models"]]
    dataset = loadDataset(args["dataset"])

    return models, dataset
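
A hedged usage sketch for the helper above; the model and dataset paths are placeholders:

if __name__ == '__main__':
    ensemble_models, ensemble_dataset = bpe_loading({
        'models': ['trained_models/model_1', 'trained_models/model_2'],  # placeholder model paths
        'dataset': 'datasets/Dataset_tutorial_dataset.pkl'  # placeholder dataset path
    })
    logging.info("Loaded %d models and the dataset." % len(ensemble_models))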
Example #33
def main():
    args = parse_args()
    server_address = (args.address, args.port)
    httpd = HTTPServer(server_address, NMTHandler)
    logger.setLevel(args.logging_level)
    parameters = load_parameters()
    if args.config is not None:
        logger.info("Loading parameters from %s" % str(args.config))
        parameters = update_parameters(parameters, pkl2dict(args.config))

    if args.online:
        online_parameters = load_parameters_online()
        parameters = update_parameters(parameters, online_parameters)

    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print('Overwritten arguments must have the form key=Value.\n'
                      'Got: %s' % str(args.changes))
                exit(1)
            try:
                parameters[k] = ast.literal_eval(v)
            except ValueError:
                parameters[k] = v
    except ValueError:
        print('Error processing arguments: (', k, ",", v, ")")
        exit(2)
    dataset = loadDataset(args.dataset)

    # For converting predictions into sentences
    # Dataset backwards compatibility
    bpe_separator = dataset.BPE_separator if hasattr(
        dataset,
        "BPE_separator") and dataset.BPE_separator is not None else '@@'
    # Build BPE tokenizer if necessary
    if 'bpe' in parameters['TOKENIZATION_METHOD'].lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(parameters.get(
                'BPE_CODES_PATH',
                parameters['DATA_ROOT_PATH'] + '/training_codes.joint'),
                              separator=bpe_separator)
    # Build tokenization function
    tokenize_f = eval('dataset.' +
                      parameters.get('TOKENIZATION_METHOD', 'tokenize_bpe'))
    detokenize_function = eval(
        'dataset.' + parameters.get('DETOKENIZATION_METHOD', 'detokenize_bpe'))
    dataset.build_moses_tokenizer(language=parameters['SRC_LAN'])
    dataset.build_moses_detokenizer(language=parameters['TRG_LAN'])
    tokenize_general = dataset.tokenize_moses
    detokenize_general = dataset.detokenize_moses

    # Prediction parameters
    params_prediction = dict()
    params_prediction['max_batch_size'] = parameters.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = parameters.get(
        'PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = parameters.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = parameters.get('MAX_OUTPUT_TEXT_LEN_TEST',
                                                 100)
    params_prediction['optimized_search'] = parameters['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = parameters['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = parameters['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = parameters['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = parameters['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = parameters.get(
        'SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = True
    params_prediction['alpha_factor'] = parameters.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = True
    params_prediction['length_penalty'] = True
    params_prediction['length_norm_factor'] = parameters.get(
        'LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = parameters.get(
        'COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = parameters.get('POS_UNK', False)
    params_prediction['heuristic'] = parameters.get('HEURISTIC', 0)
    params_prediction['state_below_index'] = -1
    params_prediction['output_text_index'] = 0
    params_prediction['state_below_maxlen'] = -1 if parameters.get(
        'PAD_ON_BATCH', True) else parameters.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = parameters.get(
        'MAXLEN_GIVEN_X', True)
    params_prediction[
        'output_max_length_depending_on_x_factor'] = parameters.get(
            'MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = parameters.get(
        'MINLEN_GIVEN_X', True)
    params_prediction[
        'output_min_length_depending_on_x_factor'] = parameters.get(
            'MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = parameters.get(
        'ATTEND_ON_OUTPUT', 'transformer' in parameters['MODEL_TYPE'].lower())

    # Manage pos_unk strategies
    if parameters['POS_UNK']:
        mapping = None if dataset.mapping == dict() else dataset.mapping
    else:
        mapping = None

    if 'transformer' in parameters['MODEL_TYPE'].lower():
        params_prediction['pos_unk'] = False
        params_prediction['coverage_penalty'] = False

    # Training parameters
    parameters_training = dict()
    if args.online:
        logger.info('Loading models from %s' % str(args.models))
        parameters_training = {  # Training parameters
            'n_epochs': parameters['MAX_EPOCH'],
            'shuffle': False,
            'loss': parameters.get('LOSS', 'categorical_crossentropy'),
            'batch_size': parameters.get('BATCH_SIZE', 1),
            'homogeneous_batches': False,
            'optimizer': parameters.get('OPTIMIZER', 'SGD'),
            'lr': parameters.get('LR', 0.1),
            'lr_decay': parameters.get('LR_DECAY', None),
            'lr_gamma': parameters.get('LR_GAMMA', 1.),
            'epochs_for_save': -1,
            'verbose': args.verbose,
            'eval_on_sets': parameters.get('EVAL_ON_SETS_KERAS', None),
            'n_parallel_loaders': parameters['PARALLEL_LOADERS'],
            'extra_callbacks': [],  # callbacks,
            'reload_epoch': parameters['RELOAD'],
            'epoch_offset': parameters['RELOAD'],
            'data_augmentation': parameters['DATA_AUGMENTATION'],
            'patience': parameters.get('PATIENCE', 0),
            'metric_check': parameters.get('STOP_METRIC', None),
            'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True),
            'each_n_epochs': parameters.get('EVAL_EACH', 1),
            'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0),
            'additional_training_settings': {
                'k': parameters.get('K', 1),
                'tau': parameters.get('TAU', 1),
                'lambda': parameters.get('LAMBDA', 0.5),
                'c': parameters.get('C', 0.5),
                'd': parameters.get('D', 0.5)
            }
        }
        model_instances = [
            TranslationModel(
                parameters,
                model_type=parameters['MODEL_TYPE'],
                verbose=parameters['VERBOSE'],
                model_name=parameters['MODEL_NAME'] + '_' + str(i),
                vocabularies=dataset.vocabulary,
                store_path=parameters['STORE_PATH'],
                set_optimizer=False) for i in range(len(args.models))
        ]
        models = [
            updateModel(model, path, -1, full_path=True)
            for (model, path) in zip(model_instances, args.models)
        ]
    else:
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    for nmt_model in models:
        nmt_model.setParams(parameters)
        nmt_model.setOptimizer()

    parameters['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        parameters['INPUTS_IDS_DATASET'][0]]
    parameters['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        parameters['OUTPUTS_IDS_DATASET'][0]]

    # Get word2index and index2word dictionaries
    index2word_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['words2idx']
    index2word_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['words2idx']

    excluded_words = None
    interactive_beam_searcher = NMTSampler(models,
                                           dataset,
                                           parameters,
                                           params_prediction,
                                           parameters_training,
                                           tokenize_f,
                                           detokenize_function,
                                           tokenize_general,
                                           detokenize_general,
                                           mapping=mapping,
                                           word2index_x=word2index_x,
                                           word2index_y=word2index_y,
                                           index2word_y=index2word_y,
                                           eos_symbol=args.eos_symbol,
                                           excluded_words=excluded_words,
                                           online=args.online,
                                           verbose=args.verbose)

    httpd.sampler = interactive_beam_searcher

    logger.info('Server starting at %s' % str(server_address))
    httpd.serve_forever()
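
# Hedged aside, not part of the original script: the 'length_norm_factor' and
# 'coverage_norm_factor' options set above presumably feed a GNMT-style rescoring of
# finished beam hypotheses. A minimal sketch of that scoring, assuming the standard
# formulation of Wu et al. (2016):
import numpy as np

def gnmt_rescore(log_prob, hyp_len, attention=None,
                 length_norm_factor=0.2, coverage_norm_factor=0.2):
    """Return the normalized score of a finished hypothesis (higher is better)."""
    # Length penalty: ((5 + |Y|) / 6) ** alpha
    length_penalty = ((5.0 + hyp_len) / 6.0) ** length_norm_factor
    score = log_prob / length_penalty
    if attention is not None and coverage_norm_factor > 0.0:
        # attention: (target_len, source_len) matrix of attention weights.
        coverage = np.clip(attention.sum(axis=0), None, 1.0)
        score += coverage_norm_factor * np.log(coverage + 1e-12).sum()
    return score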
Example #34
def sample_ensemble(args, params):
    """
    Use an ensemble of translation models to obtain predictions for a source text file.

    :param argparse.Namespace args: Arguments given to the method:

                      * dataset: Dataset instance with data.
                      * text: Text file with source sentences.
                      * splits: Splits to sample. Should be already included in the dataset object.
                      * dest: Output file to save scores.
                      * weights: Weight given to each model in the ensemble. You should provide the same number of weights as models. By default, the same weight (1/N) is applied to each model.
                      * n_best: Write n-best list (n = beam size).
                      * config: Config .pkl for loading the model configuration. If not specified, hyperparameters are read from config.py.
                      * models: Path to the models.
                      * verbose: Be verbose or not.

    :param params: parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.model_ensemble import BeamSearchEnsemble
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.utils import decode_predictions_beam_search

    logger.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.text, params, splits=args.splits, remove_outputs=True)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]
    # For converting predictions into sentences
    index2word_y = dataset.vocabulary[params['OUTPUTS_IDS_DATASET'][0]]['idx2words']

    if params.get('APPLY_DETOKENIZATION', False):
        detokenize_function = eval('dataset.' + params['DETOKENIZATION_METHOD'])

    params_prediction = dict()
    params_prediction['max_batch_size'] = params.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = params.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = params.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = params.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = params.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
    params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
    params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
    params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = params.get('POS_UNK', False)
    params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
        else params.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                       'transformer' in params['MODEL_TYPE'].lower())
    params_prediction['glossary'] = params.get('GLOSSARY', None)

    heuristic = params.get('HEURISTIC', 0)
    mapping = None if dataset.mapping == dict() else dataset.mapping
    model_weights = args.weights

    if args.glossary is not None:
        glossary = pkl2dict(args.glossary)
    elif params_prediction['glossary'] is not None:
        glossary = pkl2dict(params_prediction['glossary'])
    else:
        glossary = None

    if model_weights:
        assert len(model_weights) == len(
            models), 'You should give a weight to each model. You gave %d models and %d weights.' % (
            len(models), len(model_weights))
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction['predict_on_sets'] = [s]
        beam_searcher = BeamSearchEnsemble(models,
                                           dataset,
                                           params_prediction,
                                           model_weights=model_weights,
                                           n_best=args.n_best,
                                           verbose=args.verbose)
        predictions = beam_searcher.predictBeamSearchNet()[s]
        samples = predictions['samples']
        alphas = predictions['alphas'] if params_prediction['pos_unk'] else None

        if params_prediction['pos_unk']:
            sources = [x.strip() for x in open(args.text, 'r').read().split('\n')]
            sources = sources[:-1] if len(sources[-1]) == 0 else sources
        else:
            sources = None

        decoded_predictions = decode_predictions_beam_search(samples,
                                                             index2word_y,
                                                             glossary=glossary,
                                                             alphas=alphas,
                                                             x_text=sources,
                                                             heuristic=heuristic,
                                                             mapping=mapping,
                                                             verbose=args.verbose)
        # Apply detokenization function if needed
        if params.get('APPLY_DETOKENIZATION', False):
            decoded_predictions = list(map(detokenize_function, decoded_predictions))

        if args.n_best:
            n_best_predictions = []
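            # Each entry of predictions['n_best'] corresponds to one source sentence and
            # holds the tuple (hypotheses, scores, attention matrices) of its n best beams.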
            for i, (n_best_preds, n_best_scores, n_best_alphas) in enumerate(predictions['n_best']):
                n_best_sample_score = []
                for n_best_pred, n_best_score, n_best_alpha in zip(n_best_preds, n_best_scores, n_best_alphas):
                    pred = decode_predictions_beam_search([n_best_pred],
                                                          index2word_y,
                                                          glossary=glossary,
                                                          alphas=[n_best_alpha] if params_prediction[
                                                              'pos_unk'] else None,
                                                          x_text=[sources[i]] if params_prediction['pos_unk'] else None,
                                                          heuristic=heuristic,
                                                          mapping=mapping,
                                                          verbose=args.verbose)
                    # Apply detokenization function if needed
                    if params.get('APPLY_DETOKENIZATION', False):
                        pred = list(map(detokenize_function, pred))

                    n_best_sample_score.append([i, pred, n_best_score])
                n_best_predictions.append(n_best_sample_score)
        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params.get('SAMPLING_SAVE_MODE', 'list') == 'list':
                list2file(filepath, decoded_predictions)
                if args.n_best:
                    nbest2file(filepath + '.nbest', n_best_predictions)
            else:
                raise Exception('Only "list" is allowed in "SAMPLING_SAVE_MODE"')
        else:
            list2stdout(decoded_predictions)
            if args.n_best:
                logger.info('Storing n-best sentences in ./' + s + '.nbest')
                nbest2file('./' + s + '.nbest', n_best_predictions)
        logger.info('Sampling finished')
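
# Hedged usage sketch (paths and model names are hypothetical): sample_ensemble is usually
# driven by a small argparse front-end, but it can also be called directly with a Namespace
# shaped like the one described in its docstring plus the model hyperparameter dict:
import argparse

example_args = argparse.Namespace(
    dataset='datasets/Dataset_EuTrans_esen.pkl',  # stored Dataset instance
    text='data/test.es',                          # source sentences, one per line
    splits=['val'],                               # splits already registered in the dataset
    dest='hyps.en',                               # output file (None prints to stdout)
    weights=[],                                   # empty -> every model weighted 1/N
    n_best=False,                                 # True also writes an n-best list
    glossary=None,
    config=None,
    models=['trained_models/model1/epoch_5',      # checkpoints to ensemble
            'trained_models/model2/epoch_5'],
    verbose=1)
# sample_ensemble(example_args, params)           # params: the translation model's config dict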
Example #35
def build_dataset(params):
    """
    Builds (or loads) a Dataset instance.
    :param params: Parameters specifying Dataset options
    :return: Dataset object
    """

    if params['REBUILD_DATASET']:  # We build a new dataset instance
        if params['VERBOSE'] > 0:
            silence = False
            logging.info('Building ' + params['DATASET_NAME'] + '_' +
                         params['SRC_LAN'] + params['TRG_LAN'] + ' dataset')
        else:
            silence = True

        base_path = params['DATA_ROOT_PATH']
        name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params[
            'TRG_LAN']
        ds = Dataset(name, base_path, silence=silence)

        # OUTPUT DATA
        # Let's load the train, val and test splits of the target language sentences (outputs)
        #    the files include a sentence per line.
        ds.setOutput(base_path + '/' + params['TEXT_FILES']['train'] +
                     params['TRG_LAN'],
                     'train',
                     type='text',
                     id=params['OUTPUTS_IDS_DATASET'][0],
                     tokenization=params.get('TOKENIZATION_METHOD',
                                             'tokenize_none'),
                     build_vocabulary=True,
                     pad_on_batch=params.get('PAD_ON_BATCH', True),
                     sample_weights=params.get('SAMPLE_WEIGHTS', True),
                     fill=params.get('FILL', 'end'),
                     max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                     max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0),
                     min_occ=params.get('MIN_OCCURRENCES_OUTPUT_VOCAB', 0))
        if params.get('ALIGN_FROM_RAW',
                      True) and not params.get('HOMOGENEOUS_BATCHES', False):
            ds.setRawOutput(base_path + '/' + params['TEXT_FILES']['train'] +
                            params['TRG_LAN'],
                            'train',
                            type='file-name',
                            id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        for split in ['val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                ds.setOutput(base_path + '/' + params['TEXT_FILES'][split] +
                             params['TRG_LAN'],
                             split,
                             type='text',
                             id=params['OUTPUTS_IDS_DATASET'][0],
                             pad_on_batch=params.get('PAD_ON_BATCH', True),
                             tokenization=params.get('TOKENIZATION_METHOD',
                                                     'tokenize_none'),
                             sample_weights=params.get('SAMPLE_WEIGHTS', True),
                             max_text_len=params.get('MAX_OUTPUT_TEXT_LEN',
                                                     70),
                             max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
                if params.get('ALIGN_FROM_RAW', True) and not params.get(
                        'HOMOGENEOUS_BATCHES', False):
                    ds.setRawOutput(
                        base_path + '/' + params['TEXT_FILES'][split] +
                        params['TRG_LAN'],
                        split,
                        type='file-name',
                        id='raw_' + params['OUTPUTS_IDS_DATASET'][0])

        # INPUT DATA
        # We must ensure that the 'train' split is the first (for building the vocabulary)
        for split in ['train', 'val', 'test']:
            if params['TEXT_FILES'].get(split) is not None:
                build_vocabulary = (split == 'train')
                ds.setInput(base_path + '/' + params['TEXT_FILES'][split] +
                            params['SRC_LAN'],
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][0],
                            pad_on_batch=params.get('PAD_ON_BATCH', True),
                            tokenization=params.get('TOKENIZATION_METHOD',
                                                    'tokenize_none'),
                            build_vocabulary=build_vocabulary,
                            fill=params.get('FILL', 'end'),
                            max_text_len=params.get('MAX_INPUT_TEXT_LEN', 70),
                            max_words=params.get('INPUT_VOCABULARY_SIZE', 0),
                            min_occ=params.get('MIN_OCCURRENCES_INPUT_VOCAB',
                                               0))

                if len(params['INPUTS_IDS_DATASET']) > 1:
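                    # Second model input ('state below'): the target sentence shifted one
                    # position to the right (offset=1), fed to the decoder as the previous
                    # word during training. For 'val'/'test' a 'ghost' input is registered
                    # instead, since it is generated on the fly while decoding.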
                    if 'train' in split:
                        ds.setInput(
                            base_path + '/' + params['TEXT_FILES'][split] +
                            params['TRG_LAN'],
                            split,
                            type='text',
                            id=params['INPUTS_IDS_DATASET'][1],
                            required=False,
                            tokenization=params.get('TOKENIZATION_METHOD',
                                                    'tokenize_none'),
                            pad_on_batch=params.get('PAD_ON_BATCH', True),
                            build_vocabulary=params['OUTPUTS_IDS_DATASET'][0],
                            offset=1,
                            fill=params.get('FILL', 'end'),
                            max_text_len=params.get('MAX_OUTPUT_TEXT_LEN', 70),
                            max_words=params.get('OUTPUT_VOCABULARY_SIZE', 0))
                    else:
                        ds.setInput(None,
                                    split,
                                    type='ghost',
                                    id=params['INPUTS_IDS_DATASET'][-1],
                                    required=False)
                if params.get('ALIGN_FROM_RAW', True) and not params.get(
                        'HOMOGENEOUS_BATCHES', False):
                    ds.setRawInput(base_path + '/' +
                                   params['TEXT_FILES'][split] +
                                   params['SRC_LAN'],
                                   split,
                                   type='file-name',
                                   id='raw_' + params['INPUTS_IDS_DATASET'][0])

        if params.get('POS_UNK', False):
            if params.get('HEURISTIC', 0) > 0:
                ds.loadMapping(params['MAPPING'])

        # If we had multiple references per sentence
        keep_n_captions(ds, repeat=1, n=1, set_names=params['EVAL_ON_SETS'])

        # We have finished loading the dataset, now we can store it for using it in the future
        saveDataset(ds, params['DATASET_STORE_PATH'])

    else:
        # We can easily recover it with a single line
        ds = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' +
                         params['DATASET_NAME'] + '_' + params['SRC_LAN'] +
                         params['TRG_LAN'] + '.pkl')

    return ds
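
# Hedged usage sketch (parameter values are hypothetical): with REBUILD_DATASET enabled,
# build_dataset tokenizes the text files, builds the vocabularies on the 'train' split and
# saves the resulting Dataset pickle under DATASET_STORE_PATH; on later runs it simply
# reloads that pickle. For example:
#
#     params = load_parameters()            # or any dict providing the keys used above
#     params['REBUILD_DATASET'] = True
#     ds = build_dataset(params)
#     print(ds.vocabulary_len[params['INPUTS_IDS_DATASET'][0]],
#           ds.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]])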

def parse_args():
    parser = argparse.ArgumentParser("Minimizes a dataset by removing the data stored in it (training, development and test splits). "
                                     "The rest of the parameters are kept. "
                                     "Useful for reloading datasets with new data.")
    parser.add_argument("-d", "--dataset", required=True, help="Stored instance of the dataset")
    parser.add_argument("-o", "--output", help="Output dataset file.",
                        default="")
    return parser.parse_args()

if __name__ == "__main__":

    args = parse_args()
    # Load dataset
    ds = loadDataset(args.dataset)
    # Reinitialize values to empty
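    # (each [False, False] pair presumably flags whether the inputs and outputs
    #  of that split are currently loaded)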
    ds.loaded_train = [False, False]
    ds.loaded_val = [False, False]
    ds.loaded_test = [False, False]

    ds.loaded_raw_train = [False, False]
    ds.loaded_raw_val = [False, False]
    ds.loaded_raw_test = [False, False]

    ds.len_train = 0
    ds.len_val = 0
    ds.len_test = 0
    # Remove data
    for key in ds.X_train.keys():
        ds.X_train[key] = None