예제 #1
0
def _apply_overrides(parameters, changes):
    """
    Apply ``key=value`` command-line overrides to ``parameters`` in place.

    Each value is parsed as a Python literal when possible; otherwise the raw
    string is stored.

    :param dict parameters: Configuration dictionary to update.
    :param changes: Iterable of ``key=value`` strings.
    :raises SystemExit: With code 1 if an entry contains no ``=``.
    """
    for arg in changes:
        # Split on the first '=' only, so the value itself may contain '='.
        key, separator, raw_value = arg.partition('=')
        if not separator:
            print(
                'Overwritten arguments must have the form key=Value. \n Currently are: %s'
                % str(changes))
            raise SystemExit(1)
        try:
            parameters[key] = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError):
            # literal_eval raises SyntaxError (not only ValueError) for input
            # that is not valid Python source, e.g. a file path. Keep it as text.
            parameters[key] = raw_value


def _build_prediction_params(parameters):
    """
    Build the dictionary of decoding options handed to the sampler.

    :param dict parameters: Configuration dictionary.
    :return: Dictionary of prediction parameters.
    :raises KeyError: If a key read without a default is missing
                      (e.g. ``MODEL_TYPE``, ``OPTIMIZED_SEARCH``).
    """
    is_transformer = 'transformer' in parameters['MODEL_TYPE'].lower()
    params_prediction = {
        'max_batch_size': parameters.get('BATCH_SIZE', 20),
        'n_parallel_loaders': parameters.get('PARALLEL_LOADERS', 1),
        'beam_size': parameters.get('BEAM_SIZE', 6),
        'maxlen': parameters.get('MAX_OUTPUT_TEXT_LEN_TEST', 100),
        'optimized_search': parameters['OPTIMIZED_SEARCH'],
        'model_inputs': parameters['INPUTS_IDS_MODEL'],
        'model_outputs': parameters['OUTPUTS_IDS_MODEL'],
        'dataset_inputs': parameters['INPUTS_IDS_DATASET'],
        'dataset_outputs': parameters['OUTPUTS_IDS_DATASET'],
        'search_pruning': parameters.get('SEARCH_PRUNING', False),
        'normalize_probs': True,
        'alpha_factor': parameters.get('ALPHA_FACTOR', 1.0),
        'coverage_penalty': True,
        'length_penalty': True,
        'length_norm_factor': parameters.get('LENGTH_NORM_FACTOR', 0.0),
        'coverage_norm_factor': parameters.get('COVERAGE_NORM_FACTOR', 0.0),
        'pos_unk': parameters.get('POS_UNK', False),
        'heuristic': parameters.get('HEURISTIC', 0),
        'state_below_index': -1,
        'output_text_index': 0,
        'state_below_maxlen': -1 if parameters.get('PAD_ON_BATCH', True)
        else parameters.get('MAX_OUTPUT_TEXT_LEN', 50),
        'output_max_length_depending_on_x': parameters.get(
            'MAXLEN_GIVEN_X', True),
        'output_max_length_depending_on_x_factor': parameters.get(
            'MAXLEN_GIVEN_X_FACTOR', 3),
        'output_min_length_depending_on_x': parameters.get(
            'MINLEN_GIVEN_X', True),
        'output_min_length_depending_on_x_factor': parameters.get(
            'MINLEN_GIVEN_X_FACTOR', 2),
        'attend_on_output': parameters.get('ATTEND_ON_OUTPUT', is_transformer),
    }
    if is_transformer:
        # These two options are always switched off for transformer models,
        # whatever the configuration says.
        params_prediction['pos_unk'] = False
        params_prediction['coverage_penalty'] = False
    return params_prediction


def _build_online_training_params(parameters, verbose):
    """
    Build the training options used when the server runs in online mode.

    :param dict parameters: Configuration dictionary.
    :param verbose: Verbosity value stored under ``'verbose'``.
    :return: Dictionary of training parameters.
    :raises KeyError: If a key read without a default is missing
                      (e.g. ``MAX_EPOCH``, ``PARALLEL_LOADERS``, ``RELOAD``).
    """
    return {
        'n_epochs': parameters['MAX_EPOCH'],
        'shuffle': False,
        'loss': parameters.get('LOSS', 'categorical_crossentropy'),
        'batch_size': parameters.get('BATCH_SIZE', 1),
        'homogeneous_batches': False,
        'optimizer': parameters.get('OPTIMIZER', 'SGD'),
        'lr': parameters.get('LR', 0.1),
        'lr_decay': parameters.get('LR_DECAY', None),
        'lr_gamma': parameters.get('LR_GAMMA', 1.),
        'epochs_for_save': -1,
        'verbose': verbose,
        'eval_on_sets': parameters.get('EVAL_ON_SETS_KERAS', None),
        'n_parallel_loaders': parameters['PARALLEL_LOADERS'],
        'extra_callbacks': [],
        'reload_epoch': parameters['RELOAD'],
        'epoch_offset': parameters['RELOAD'],
        'data_augmentation': parameters['DATA_AUGMENTATION'],
        'patience': parameters.get('PATIENCE', 0),
        'metric_check': parameters.get('STOP_METRIC', None),
        'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': parameters.get('EVAL_EACH', 1),
        'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0),
        'additional_training_settings': {
            'k': parameters.get('K', 1),
            'tau': parameters.get('TAU', 1),
            'lambda': parameters.get('LAMBDA', 0.5),
            'c': parameters.get('C', 0.5),
            'd': parameters.get('D', 0.5)
        }
    }


def main():
    """
    Start the translation HTTP server.

    Parses the command line, assembles the configuration (defaults, optional
    pickled config, optional online settings, then ``key=value`` overrides),
    loads the dataset and the model(s), builds an ``NMTSampler``, attaches it
    to the HTTP server as ``httpd.sampler`` and serves requests forever.

    :return: None (does not return under normal operation).
    :raises SystemExit: With code 1 if an override is not of the form
                        ``key=value``.
    """
    args = parse_args()
    server_address = (args.address, args.port)
    httpd = HTTPServer(server_address, NMTHandler)
    logger.setLevel(args.logging_level)

    # Configuration layers, later ones taking precedence.
    parameters = load_parameters()
    if args.config is not None:
        logger.info("Loading parameters from %s", str(args.config))
        parameters = update_parameters(parameters, pkl2dict(args.config))
    if args.online:
        parameters = update_parameters(parameters, load_parameters_online())
    _apply_overrides(parameters, args.changes)

    dataset = loadDataset(args.dataset)

    # For converting predictions into sentences.
    # Dataset backwards compatibility: fall back to '@@' when the dataset has
    # no BPE_separator attribute or it is None.
    bpe_separator = getattr(dataset, 'BPE_separator', None)
    if bpe_separator is None:
        bpe_separator = '@@'

    # Resolve the method name once, so the default also applies to the BPE check.
    tokenization_method = parameters.get('TOKENIZATION_METHOD', 'tokenize_bpe')
    detokenization_method = parameters.get('DETOKENIZATION_METHOD',
                                           'detokenize_bpe')

    # Build BPE tokenizer if necessary
    if 'bpe' in tokenization_method.lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(parameters.get(
                'BPE_CODES_PATH',
                parameters['DATA_ROOT_PATH'] + '/training_codes.joint'),
                              separator=bpe_separator)

    # Look the methods up by name with getattr rather than eval(): the names
    # come from configuration / command-line text, and eval would execute
    # whatever expression that text contains.
    tokenize_f = getattr(dataset, tokenization_method)
    detokenize_function = getattr(dataset, detokenization_method)
    dataset.build_moses_tokenizer(language=parameters['SRC_LAN'])
    dataset.build_moses_detokenizer(language=parameters['TRG_LAN'])
    tokenize_general = dataset.tokenize_moses
    detokenize_general = dataset.detokenize_moses

    # Prediction parameters
    params_prediction = _build_prediction_params(parameters)

    # Manage pos_unk strategies: only pass a mapping when POS_UNK is enabled
    # and the dataset mapping is not an empty dict.
    if parameters.get('POS_UNK', False) and dataset.mapping != dict():
        mapping = dataset.mapping
    else:
        mapping = None

    # Training parameters (only populated in online mode)
    parameters_training = dict()
    if args.online:
        logger.info('Loading models from %s', str(args.models))
        parameters_training = _build_online_training_params(
            parameters, args.verbose)
        # Instantiate every model first, then load the stored weights into each.
        model_instances = [
            TranslationModel(
                parameters,
                model_type=parameters['MODEL_TYPE'],
                verbose=parameters['VERBOSE'],
                model_name=parameters['MODEL_NAME'] + '_' + str(i),
                vocabularies=dataset.vocabulary,
                store_path=parameters['STORE_PATH'],
                set_optimizer=False) for i, _ in enumerate(args.models)
        ]
        models = [
            updateModel(model, path, -1, full_path=True)
            for model, path in zip(model_instances, args.models)
        ]
    else:
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    for nmt_model in models:
        nmt_model.setParams(parameters)
        nmt_model.setOptimizer()

    input_id = parameters['INPUTS_IDS_DATASET'][0]
    output_id = parameters['OUTPUTS_IDS_DATASET'][0]
    parameters['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[input_id]
    parameters['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[output_id]

    # Get word2index and index2word dictionaries
    input_vocabulary = dataset.vocabulary[input_id]
    output_vocabulary = dataset.vocabulary[output_id]

    interactive_beam_searcher = NMTSampler(
        models,
        dataset,
        parameters,
        params_prediction,
        parameters_training,
        tokenize_f,
        detokenize_function,
        tokenize_general,
        detokenize_general,
        mapping=mapping,
        word2index_x=input_vocabulary['words2idx'],
        word2index_y=output_vocabulary['words2idx'],
        index2word_y=output_vocabulary['idx2words'],
        eos_symbol=args.eos_symbol,
        excluded_words=None,
        online=args.online,
        verbose=args.verbose)

    httpd.sampler = interactive_beam_searcher

    logger.info('Server starting at %s', str(server_address))
    httpd.serve_forever()
예제 #2
0
def _resume_epoch_offset(params, dataset):
    """
    Compute the epoch offset to use when resuming training.

    When ``RELOAD_EPOCH`` is set, ``RELOAD`` is used as the offset directly.
    Otherwise the offset is ``RELOAD * BATCH_SIZE / len_train`` truncated to
    an int.

    :param dict params: Dictionary of network hyperparameters.
    :param dataset: Dataset instance; only ``len_train`` is read.
    :return: Epoch offset; 0 when the training split is empty.
    """
    if params['RELOAD_EPOCH']:
        return params['RELOAD']
    if dataset.len_train == 0:
        # Avoid a ZeroDivisionError on an empty training split.
        return 0
    return int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)


def train_model(params, load_dataset=None):
    """
    Training function.

    Sets the training parameters from params.

    Build or loads the model and launches the training.

    Side effects visible in this function: ``params`` is updated in place
    (``INPUT_VOCABULARY_SIZE``, ``OUTPUT_VOCABULARY_SIZE`` and, when resuming,
    ``EPOCH_OFFSET``), the configuration is stored under
    ``params['STORE_PATH'] + '/config'`` and, when an existing dataset is
    updated, it is saved back to ``params['DATASET_STORE_PATH']``.

    :param dict params: Dictionary of network hyperparameters.
    :param str load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """

    if params['RELOAD'] > 0:
        logger.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logger.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logger.info('Updating dataset.')
                dataset_name = params['DATASET_NAME'] + '_' + params[
                    'SRC_LAN'] + params['TRG_LAN']
                dataset = loadDataset(params['DATASET_STORE_PATH'] +
                                      '/Dataset_' + dataset_name + '.pkl')

                params['EPOCH_OFFSET'] = _resume_epoch_offset(params, dataset)

                # Refresh every split from its text files.
                for split, filename in iteritems(params['TEXT_FILES']):
                    base_path = params['DATA_ROOT_PATH'] + '/' + filename
                    dataset = update_dataset_from_file(
                        dataset,
                        base_path + params['SRC_LAN'],
                        params,
                        splits=[split],
                        output_text_filename=base_path + params['TRG_LAN'],
                        remove_outputs=False,
                        compute_state_below=True,
                        recompute_references=True)
                    dataset.name = dataset_name
                saveDataset(dataset, params['DATASET_STORE_PATH'])

        else:
            logger.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]

    # Build model: the optimizer is set and directories are cleared only for
    # a fresh run (RELOAD == 0).
    fresh_run = params['RELOAD'] == 0

    # build new model
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=fresh_run,
                                 clear_dirs=fresh_run)

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = {
        nmt_model.ids_inputs[i]: dataset.ids_inputs.index(id_in)
        for i, id_in in enumerate(params['INPUTS_IDS_DATASET'])
    }
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = {
        nmt_model.ids_outputs[i]: dataset.ids_outputs.index(id_out)
        for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET'])
    }
    nmt_model.setOutputsMapping(outputMapping)

    if params['RELOAD'] > 0:
        nmt_model = updateModel(nmt_model,
                                params['STORE_PATH'],
                                params['RELOAD'],
                                reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        # EPOCH_OFFSET is only set above when an existing dataset was updated;
        # fill it in for the other resume paths.
        if params.get('EPOCH_OFFSET') is None:
            params['EPOCH_OFFSET'] = _resume_epoch_offset(params, dataset)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        # LR decay parameters
        'lr_decay': params.get('LR_DECAY', None),
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH',
                                               0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        # Early stopping parameters
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None)
        if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE',
                                     params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES',
                                                 None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get(
                'LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS',
                                                 None),
        }
    }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('In total is {0:.2f}s = {1:.2f}m'.format(
        time_difference, time_difference / 60.0))
            'd': params.get('D', 0.5)
        }
    }

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[
        params['OUTPUTS_IDS_DATASET'][0]]
    logger.info("<<< Using an ensemble of %d models >>>" % len(args.models))
    # Load trainable model(s)
    logging.info('Loading models from %s' % str(args.models))
    model_instances = [
        TranslationModel(params,
                         model_type=params['MODEL_TYPE'],
                         verbose=params['VERBOSE'],
                         model_name=params['MODEL_NAME'] + '_' + str(i),
                         vocabularies=dataset.vocabulary,
                         store_path=params['STORE_PATH'],
                         clear_dirs=False,
                         set_optimizer=False) for i in range(len(args.models))
    ]
    models = [
        updateModel(model, path, -1, full_path=True)
        for (model, path) in zip(model_instances, args.models)
    ]

    # Set additional inputs to models if using a custom loss function
    params['USE_CUSTOM_LOSS'] = True if 'PAS' in params['OPTIMIZER'] else False
    if params.get('N_BEST_OPTIMIZER', False):
        logging.info('Using N-best optimizer')

    models = build_online_models(models, params)
예제 #4
0
# Load the stored model and reuse its configuration for the new model.
src_model = loadModel(SRC_MODEL_PATH, epoch_choice)
params = src_model.params

# Override the settings that differ for the destination model.
# (Optionally also set params['BIDIRECTIONAL_ENCODER'] = False to get the
# RNN type displayed.)
params.update({
    'USE_CUDNN': False,
    'MODEL_NAME': 'CPU',
    'STORE_PATH': DST_MODEL_PATH,
    'MODE': 'sampling',
    'RELOAD': epoch_choice,
})
# The vocabulary sizes are left as stored in the source model's params:
# params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
# params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

# Build a fresh model from the modified configuration.
cpu_model_options = {
    'model_type': params['MODEL_TYPE'],
    'verbose': True,
    'model_name': params['MODEL_NAME'],
    'vocabularies': dataset.vocabulary,
    'store_path': params['STORE_PATH'],
    'set_optimizer': True,
    'clear_dirs': True,
}
cpu_model = TranslationModel(params, **cpu_model_options)

# NOTE(review): the script stops here, so the weight transfer and save below
# never run. Looks like a debugging leftover - confirm before removing.
exit()

# Copy the weights of the stored model into the new model.
cpu_model = updateModel(
    cpu_model, SRC_MODEL_PATH, params['RELOAD'], reload_epoch=True)

# Save the converted model to the destination path.
saveModel(
    cpu_model, update_num=epoch_choice, path=DST_MODEL_PATH, full_path=True)