Example #1
0
def build_glossary(glossary_text_file, dest_filename, separator='\t'):
    """
    Preprocess a glossary file with the format
        word <separator> desired_replacement
    and store it in a suitable format (.pkl).

    :param glossary_text_file: Path to the glossary file.
    :param dest_filename: Output filename.
    :param separator: Separator between words and replacements.
    """
    glossary = dict()
    print("Reading glossary from %s" % glossary_text_file)
    # Context manager ensures the file handle is closed (the original leaked it).
    with open(glossary_text_file) as glossary_file:
        for glossary_line in glossary_file.read().splitlines():
            if not glossary_line:
                # Skip blank lines, which would otherwise add a '' -> '' entry.
                continue
            split_line = glossary_line.split(separator)
            # Everything after the first field is the replacement, re-joined
            # with spaces in case it contained the separator itself.
            glossary[split_line[0]] = ' '.join(split_line[1:])
    print("Done. Saving glossary into %s" % dest_filename)
    dict2pkl(glossary, dest_filename)
Example #2
0
def train_model(params):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :return: None
    """

    if params['RELOAD'] > 0:
        logging.info('Resuming training.')

    check_params(params)

    # Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    if params['RELOAD'] == 0:
        # Fresh run: build a new model and persist its configuration next to it.
        video_model = VideoDesc_Model(params,
                                      type=params['MODEL_TYPE'],
                                      verbose=params['VERBOSE'],
                                      model_name=params['MODEL_NAME'],
                                      vocabularies=dataset.vocabulary,
                                      store_path=params['STORE_PATH'])
        dict2pkl(params, params['STORE_PATH'] + '/config')

        # Map each of the model's input/output ids to the position of the
        # corresponding id in the Dataset instance.
        input_mapping = dict()
        for position, data_id in enumerate(params['INPUTS_IDS_DATASET']):
            if position < len(video_model.ids_inputs):
                input_mapping[video_model.ids_inputs[position]] = dataset.ids_inputs.index(data_id)
        video_model.setInputsMapping(input_mapping)

        output_mapping = dict()
        for position, data_id in enumerate(params['OUTPUTS_IDS_DATASET']):
            if position < len(video_model.ids_outputs):
                output_mapping[video_model.ids_outputs[position]] = dataset.ids_outputs.index(data_id)
        video_model.setOutputsMapping(output_mapping)

    else:
        # Resume from a previously stored model and re-attach its optimizer.
        video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
        video_model.setOptimizer()

    # Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'lr_decay': params['LR_DECAY'],
        'lr_gamma': params['LR_GAMMA'],
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params['RELOAD'],
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None),
    }
    video_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(
        time_difference, time_difference / 60.0))
Example #3
0
            i += 1
            if (i % 1000) == 0 and args.verbose > 0:
                print i
            if cur_source != -1:
                d[cur_source] = tmp_dict  # Set dict for previous word
            cur_source = line[0]
            tmp_dict = dict()
            tmp_dict[line[1]] = pow(np.e, float(line[2]))
        else:
            tmp_dict[line[1]] = pow(np.e, float(line[2]))
d[cur_source] = tmp_dict  # flush the candidate dict of the last source word
del tmp_dict

# e: source word -> candidate targets sorted by descending score.
# NOTE(review): Python 2 script (`print j`); `d`/`args` are defined above
# this fragment.
e = {}
j = 0
for elt in d:
    if (j % 1000) == 0 and args.verbose > 0:
        print j
    j += 1
    # sort ascending by score, then reverse -> best candidate first
    e[elt] = sorted(d[elt], key=d[elt].get)[::-1]

# f1: source word -> single best-scoring target.
f1 = {}
j = 0
for elt in e:
    if (j % 1000) == 0 and args.verbose > 0:
        print j
    j += 1
    f1[elt] = e[elt][0]

# Persist the 1-best lexicon to the destination pickle.
dict2pkl(f1, args.dest)
Example #4
0
def train_model(params, load_dataset=None):
    """
    Training function.

    Sets the training parameters from params, builds or loads the model and
    launches the training.

    :param dict params: Dictionary of network hyperparameters.
    :param str load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """

    if params['RELOAD'] > 0:
        logger.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logger.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logger.info('Updating dataset.')
                stored_dataset_path = (params['DATASET_STORE_PATH'] +
                                       '/Dataset_' + params['DATASET_NAME'] +
                                       '_' + params['SRC_LAN'] +
                                       params['TRG_LAN'] + '.pkl')
                dataset = loadDataset(stored_dataset_path)

                # Translate the RELOAD update counter into an epoch offset,
                # unless RELOAD already denotes an epoch number.
                if dataset.len_train == 0:
                    epoch_offset = 0
                else:
                    epoch_offset = int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)
                params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else epoch_offset

                # Refresh every text split from its files and re-save the dataset.
                for split, filename in iteritems(params['TEXT_FILES']):
                    source_file = params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN']
                    target_file = params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN']
                    dataset = update_dataset_from_file(dataset,
                                                       source_file,
                                                       params,
                                                       splits=list([split]),
                                                       output_text_filename=target_file,
                                                       remove_outputs=False,
                                                       compute_state_below=True,
                                                       recompute_references=True)
                    dataset.name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])

        else:
            logger.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model. Only a fresh run (RELOAD == 0) gets its optimizer set and
    # its storage directories cleared at construction time.
    fresh_run = params['RELOAD'] == 0

    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=fresh_run,
                                 clear_dirs=fresh_run)

    # Map each model input/output id to the position of the corresponding id
    # in the Dataset instance.
    input_mapping = dict()
    for position, data_id in enumerate(params['INPUTS_IDS_DATASET']):
        input_mapping[nmt_model.ids_inputs[position]] = dataset.ids_inputs.index(data_id)
    nmt_model.setInputsMapping(input_mapping)

    output_mapping = dict()
    for position, data_id in enumerate(params['OUTPUTS_IDS_DATASET']):
        output_mapping[nmt_model.ids_outputs[position]] = dataset.ids_outputs.index(data_id)
    nmt_model.setOutputsMapping(output_mapping)

    if params['RELOAD'] > 0:
        # Restore the stored weights on top of the freshly built architecture.
        nmt_model = updateModel(nmt_model,
                                params['STORE_PATH'],
                                params['RELOAD'],
                                reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        if params.get('EPOCH_OFFSET') is None:
            if params['RELOAD_EPOCH']:
                params['EPOCH_OFFSET'] = params['RELOAD']
            else:
                params['EPOCH_OFFSET'] = int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        # Learning-rate schedule
        'lr_decay': params.get('LR_DECAY', None),
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        # Saving / logging
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        # Early stopping
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        # TensorBoard
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        },
    }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('In total is {0:.2f}s = {1:.2f}m'.format(
        time_difference, time_difference / 60.0))
Example #5
0
def train_model(params):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :return: None
    """

    if params['RELOAD'] > 0:
        logging.info('Resuming training.')

    # Load data
    dataset = build_dataset(params)
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    if params['RELOAD'] == 0:
        # Fresh run: build the captioning model and persist its configuration.
        video_model = Captioning_Model(params,
                                       model_type=params['MODEL_TYPE'],
                                       verbose=params['VERBOSE'],
                                       model_name=params['MODEL_NAME'],
                                       vocabularies=dataset.vocabulary,
                                       store_path=params['STORE_PATH'])
        dict2pkl(params, params['STORE_PATH'] + '/config')

        # Map each model input/output id to the position of the corresponding
        # id in the Dataset instance.
        input_mapping = dict()
        for position, data_id in enumerate(params['INPUTS_IDS_DATASET']):
            if position < len(video_model.ids_inputs):
                input_mapping[video_model.ids_inputs[position]] = dataset.ids_inputs.index(data_id)
        video_model.setInputsMapping(input_mapping)

        output_mapping = dict()
        for position, data_id in enumerate(params['OUTPUTS_IDS_DATASET']):
            if position < len(video_model.ids_outputs):
                output_mapping[video_model.ids_outputs[position]] = dataset.ids_outputs.index(data_id)
        video_model.setOutputsMapping(output_mapping)

    else:
        # Resume from a previously stored model and re-attach its optimizer.
        video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
        video_model.setOptimizer()

    # Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        # Learning-rate schedule
        'lr_decay': params.get('LR_DECAY', None),
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        # Saving / logging
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        # Early stopping
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        # TensorBoard
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        },
    }
    video_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(time_difference, time_difference / 60.0))
Example #6
0
File: main.py  Project: lvapeab/TMA
def train_model(params):
    """
    Training function. Sets the training parameters from params. Build or loads the model and launches the training.
    :param params: Dictionary of network hyperparameters.
    :return: None
    """

    if params['RELOAD'] > 0:
        logging.info('Resuming training.')

    check_params(params)

    ########### Load data
    dataset = build_dataset(params)
    # '-vidtext-embed' datasets take the output vocabulary size from the
    # second *input* (text) instead of the first output.
    if not '-vidtext-embed' in params['DATASET_NAME']:
        params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
            params['OUTPUTS_IDS_DATASET'][0]]
    else:
        params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
            params['INPUTS_IDS_DATASET'][1]]
    ###########

    ########### Build model

    if params['MODE'] == 'finetuning':
        # Fine-tuning: rebuild the architecture under a '_reloaded' name,
        # then overwrite its weights with those stored at RELOAD_PATH.
        # video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'], params['RELOAD'])
        video_model = VideoDesc_Model(params,
                                      type=params['MODEL_TYPE'],
                                      verbose=params['VERBOSE'],
                                      model_name=params['MODEL_NAME'] +
                                      '_reloaded',
                                      vocabularies=dataset.vocabulary,
                                      store_path=params['STORE_PATH'],
                                      set_optimizer=False,
                                      clear_dirs=False)
        video_model = updateModel(video_model,
                                  params['RELOAD_PATH'],
                                  params['RELOAD'],
                                  reload_epoch=False)
        video_model.setParams(params)

        # Define the inputs and outputs mapping from our Dataset instance to our model
        inputMapping = dict()
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
            if len(video_model.ids_inputs) > i:
                pos_source = dataset.ids_inputs.index(id_in)
                id_dest = video_model.ids_inputs[i]
                inputMapping[id_dest] = pos_source
        video_model.setInputsMapping(inputMapping)

        outputMapping = dict()
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
            if len(video_model.ids_outputs) > i:
                pos_target = dataset.ids_outputs.index(id_out)
                id_dest = video_model.ids_outputs[i]
                outputMapping[id_dest] = pos_target
        video_model.setOutputsMapping(outputMapping)

        video_model.setOptimizer()
        # Extend the horizon so the resumed run trains MAX_EPOCH more epochs.
        params['MAX_EPOCH'] += params['RELOAD']

    else:
        if params['RELOAD'] == 0 or params[
                'LOAD_WEIGHTS_ONLY']:  # build new model
            video_model = VideoDesc_Model(params,
                                          type=params['MODEL_TYPE'],
                                          verbose=params['VERBOSE'],
                                          model_name=params['MODEL_NAME'],
                                          vocabularies=dataset.vocabulary,
                                          store_path=params['STORE_PATH'],
                                          set_optimizer=True)
            # Persist the configuration next to the stored model.
            dict2pkl(params, params['STORE_PATH'] + '/config')

            # Define the inputs and outputs mapping from our Dataset instance to our model
            inputMapping = dict()
            for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
                if len(video_model.ids_inputs) > i:
                    pos_source = dataset.ids_inputs.index(id_in)
                    id_dest = video_model.ids_inputs[i]
                    inputMapping[id_dest] = pos_source
            video_model.setInputsMapping(inputMapping)

            outputMapping = dict()
            for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
                if len(video_model.ids_outputs) > i:
                    pos_target = dataset.ids_outputs.index(id_out)
                    id_dest = video_model.ids_outputs[i]
                    outputMapping[id_dest] = pos_target
            video_model.setOutputsMapping(outputMapping)

            # Only load weights from pre-trained model
            # NOTE(review): in this branch params['RELOAD'] is treated as a
            # list (len() / indexing), although it was compared with `> 0`
            # above and reset to the scalar 0 below — confirm the expected
            # type of 'RELOAD' for the LOAD_WEIGHTS_ONLY mode.
            if params['LOAD_WEIGHTS_ONLY'] and params['RELOAD'] > 0:
                for i in range(0, len(params['RELOAD'])):
                    old_model = loadModel(
                        params['PRE_TRAINED_MODEL_STORE_PATHS'][i],
                        params['RELOAD'][i])
                    video_model = transferWeights(old_model, video_model,
                                                  params['LAYERS_MAPPING'][i])
                video_model.setOptimizer()
                # Start counting epochs from scratch after the weight transfer.
                params['RELOAD'] = 0
        else:  # resume from previously trained model
            video_model = loadModel(params['PRE_TRAINED_MODEL_STORE_PATHS'],
                                    params['RELOAD'])
            video_model.params['LR'] = params['LR']
            video_model.setOptimizer()

            # Re-target the storage path if it differs from the loaded model's.
            if video_model.model_path != params['STORE_PATH']:
                video_model.setName(params['MODEL_NAME'],
                                    models_path=params['STORE_PATH'],
                                    clear_dirs=False)
    # Update optimizer either if we are loading or building a model
    video_model.params = params
    video_model.setOptimizer()
    ###########

    ########### Test model saving/loading functions
    # saveModel(video_model, params['RELOAD'])
    # video_model = loadModel(params['STORE_PATH'], params['RELOAD'])
    ###########

    ########### Callbacks
    callbacks = buildCallbacks(params, video_model, dataset)
    ###########

    ########### Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'lr_decay': params['LR_DECAY'],
        'lr_gamma': params['LR_GAMMA'],
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params['RELOAD'],
        'data_augmentation': params['DATA_AUGMENTATION'],
        'patience': params.get('PATIENCE', 0),  # early stopping parameters
        'metric_check': params.get('STOP_METRIC', None),
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0)
    }

    video_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logging.info('In total is {0:.2f}s = {1:.2f}m'.format(
        time_difference, time_difference / 60.0))