def main():
    args = parse_args()
    server_address = (args.address, args.port)
    httpd = HTTPServer(server_address, NMTHandler)
    logger.setLevel(args.logging_level)
    parameters = load_parameters()
    if args.config is not None:
        logger.info("Loading parameters from %s" % str(args.config))
        parameters = update_parameters(parameters, pkl2dict(args.config))

    if args.online:
        online_parameters = load_parameters_online()
        parameters = update_parameters(parameters, online_parameters)

    try:
        for arg in args.changes:
            try:
                k, v = arg.split('=')
            except ValueError:
                print('Overwritten arguments must have the form key=Value.\n'
                      'Current arguments: %s' % str(args.changes))
                exit(1)
            try:
                parameters[k] = ast.literal_eval(v)
            except ValueError:
                parameters[k] = v
    except ValueError:
        print('Error processing arguments: (%s, %s)' % (k, v))
        exit(2)

    dataset = loadDataset(args.dataset)

    # For converting predictions into sentences
    # Dataset backwards compatibility
    bpe_separator = dataset.BPE_separator if hasattr(dataset, "BPE_separator") and dataset.BPE_separator is not None else '@@'

    # Build BPE tokenizer if necessary
    if 'bpe' in parameters['TOKENIZATION_METHOD'].lower():
        logger.info('Building BPE')
        if not dataset.BPE_built:
            dataset.build_bpe(parameters.get('BPE_CODES_PATH', parameters['DATA_ROOT_PATH'] + '/training_codes.joint'),
                              separator=bpe_separator)

    # Build tokenization function
    tokenize_f = eval('dataset.' + parameters.get('TOKENIZATION_METHOD', 'tokenize_bpe'))
    detokenize_function = eval('dataset.' + parameters.get('DETOKENIZATION_METHOD', 'detokenize_bpe'))
    dataset.build_moses_tokenizer(language=parameters['SRC_LAN'])
    dataset.build_moses_detokenizer(language=parameters['TRG_LAN'])
    tokenize_general = dataset.tokenize_moses
    detokenize_general = dataset.detokenize_moses

    # Prediction parameters
    params_prediction = dict()
    params_prediction['max_batch_size'] = parameters.get('BATCH_SIZE', 20)
    params_prediction['n_parallel_loaders'] = parameters.get('PARALLEL_LOADERS', 1)
    params_prediction['beam_size'] = parameters.get('BEAM_SIZE', 6)
    params_prediction['maxlen'] = parameters.get('MAX_OUTPUT_TEXT_LEN_TEST', 100)
    params_prediction['optimized_search'] = parameters['OPTIMIZED_SEARCH']
    params_prediction['model_inputs'] = parameters['INPUTS_IDS_MODEL']
    params_prediction['model_outputs'] = parameters['OUTPUTS_IDS_MODEL']
    params_prediction['dataset_inputs'] = parameters['INPUTS_IDS_DATASET']
    params_prediction['dataset_outputs'] = parameters['OUTPUTS_IDS_DATASET']
    params_prediction['search_pruning'] = parameters.get('SEARCH_PRUNING', False)
    params_prediction['normalize_probs'] = True
    params_prediction['alpha_factor'] = parameters.get('ALPHA_FACTOR', 1.0)
    params_prediction['coverage_penalty'] = True
    params_prediction['length_penalty'] = True
    params_prediction['length_norm_factor'] = parameters.get('LENGTH_NORM_FACTOR', 0.0)
    params_prediction['coverage_norm_factor'] = parameters.get('COVERAGE_NORM_FACTOR', 0.0)
    params_prediction['pos_unk'] = parameters.get('POS_UNK', False)
    params_prediction['heuristic'] = parameters.get('HEURISTIC', 0)
    params_prediction['state_below_index'] = -1
    params_prediction['output_text_index'] = 0
    params_prediction['state_below_maxlen'] = -1 if parameters.get('PAD_ON_BATCH', True) else parameters.get('MAX_OUTPUT_TEXT_LEN', 50)
    params_prediction['output_max_length_depending_on_x'] = parameters.get('MAXLEN_GIVEN_X', True)
    params_prediction['output_max_length_depending_on_x_factor'] = parameters.get('MAXLEN_GIVEN_X_FACTOR', 3)
    params_prediction['output_min_length_depending_on_x'] = parameters.get('MINLEN_GIVEN_X', True)
    params_prediction['output_min_length_depending_on_x_factor'] = parameters.get('MINLEN_GIVEN_X_FACTOR', 2)
    params_prediction['attend_on_output'] = parameters.get('ATTEND_ON_OUTPUT', 'transformer' in parameters['MODEL_TYPE'].lower())

    # Manage pos_unk strategies
    if parameters['POS_UNK']:
        mapping = None if dataset.mapping == dict() else dataset.mapping
    else:
        mapping = None

    if 'transformer' in parameters['MODEL_TYPE'].lower():
        params_prediction['pos_unk'] = False
        params_prediction['coverage_penalty'] = False

    # Training parameters
    parameters_training = dict()
    if args.online:
        logger.info('Loading models from %s' % str(args.models))
        parameters_training = {  # Training parameters
            'n_epochs': parameters['MAX_EPOCH'],
            'shuffle': False,
            'loss': parameters.get('LOSS', 'categorical_crossentropy'),
            'batch_size': parameters.get('BATCH_SIZE', 1),
            'homogeneous_batches': False,
            'optimizer': parameters.get('OPTIMIZER', 'SGD'),
            'lr': parameters.get('LR', 0.1),
            'lr_decay': parameters.get('LR_DECAY', None),
            'lr_gamma': parameters.get('LR_GAMMA', 1.),
            'epochs_for_save': -1,
            'verbose': args.verbose,
            'eval_on_sets': parameters.get('EVAL_ON_SETS_KERAS', None),
            'n_parallel_loaders': parameters['PARALLEL_LOADERS'],
            'extra_callbacks': [],  # callbacks,
            'reload_epoch': parameters['RELOAD'],
            'epoch_offset': parameters['RELOAD'],
            'data_augmentation': parameters['DATA_AUGMENTATION'],
            'patience': parameters.get('PATIENCE', 0),
            'metric_check': parameters.get('STOP_METRIC', None),
            'eval_on_epochs': parameters.get('EVAL_EACH_EPOCHS', True),
            'each_n_epochs': parameters.get('EVAL_EACH', 1),
            'start_eval_on_epoch': parameters.get('START_EVAL_ON_EPOCH', 0),
            'additional_training_settings': {
                'k': parameters.get('K', 1),
                'tau': parameters.get('TAU', 1),
                'lambda': parameters.get('LAMBDA', 0.5),
                'c': parameters.get('C', 0.5),
                'd': parameters.get('D', 0.5)
            }
        }
        model_instances = [TranslationModel(parameters,
                                            model_type=parameters['MODEL_TYPE'],
                                            verbose=parameters['VERBOSE'],
                                            model_name=parameters['MODEL_NAME'] + '_' + str(i),
                                            vocabularies=dataset.vocabulary,
                                            store_path=parameters['STORE_PATH'],
                                            set_optimizer=False)
                           for i in range(len(args.models))]
        models = [updateModel(model, path, -1, full_path=True)
                  for (model, path) in zip(model_instances, args.models)]
    else:
        models = [loadModel(m, -1, full_path=True) for m in args.models]

    for nmt_model in models:
        nmt_model.setParams(parameters)
        nmt_model.setOptimizer()

    parameters['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[parameters['INPUTS_IDS_DATASET'][0]]
    parameters['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[parameters['OUTPUTS_IDS_DATASET'][0]]

    # Get word2index and index2word dictionaries
    index2word_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_y = dataset.vocabulary[parameters['OUTPUTS_IDS_DATASET'][0]]['words2idx']
    index2word_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['idx2words']
    word2index_x = dataset.vocabulary[parameters['INPUTS_IDS_DATASET'][0]]['words2idx']

    excluded_words = None
    interactive_beam_searcher = NMTSampler(models,
                                           dataset,
                                           parameters,
                                           params_prediction,
                                           parameters_training,
                                           tokenize_f,
                                           detokenize_function,
                                           tokenize_general,
                                           detokenize_general,
                                           mapping=mapping,
                                           word2index_x=word2index_x,
                                           word2index_y=word2index_y,
                                           index2word_y=index2word_y,
                                           eos_symbol=args.eos_symbol,
                                           excluded_words=excluded_words,
                                           online=args.online,
                                           verbose=args.verbose)

    httpd.sampler = interactive_beam_searcher

    logger.info('Server starting at %s' % str(server_address))
    httpd.serve_forever()
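# The sketch below is only an illustration of the command-line interface that main()
# above assumes; it is NOT the project's actual parse_args(). It defines exactly the
# attributes main() reads (address, port, config, dataset, models, online, changes,
# logging_level, verbose, eos_symbol); option names and defaults are placeholders.
import argparse
import logging


def parse_args():
    parser = argparse.ArgumentParser(description='Serve interactive NMT models over HTTP.')
    parser.add_argument('--address', default='localhost', help='Address to bind the server to.')
    parser.add_argument('--port', type=int, default=8080, help='Port to listen on.')
    parser.add_argument('-c', '--config', default=None, help='Stored config (.pkl) overriding the defaults.')
    parser.add_argument('-ds', '--dataset', required=True, help='Path to the stored Dataset instance (.pkl).')
    parser.add_argument('-m', '--models', nargs='+', required=True, help='Path(s) to the model(s) to serve.')
    parser.add_argument('-o', '--online', action='store_true', help='Adapt the models online after each sample.')
    parser.add_argument('-ch', '--changes', nargs='*', default=[], help='Parameter overrides as key=Value pairs.')
    parser.add_argument('--logging-level', dest='logging_level', type=int, default=logging.INFO)
    parser.add_argument('-v', '--verbose', type=int, default=0)
    parser.add_argument('--eos-symbol', dest='eos_symbol', default='/', help='End-of-sentence symbol.')
    return parser.parse_args()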
def train_model(params, load_dataset=None):
    """
    Training function.

    Sets the training parameters from params, builds or loads the model and launches the training.

    :param dict params: Dictionary of network hyperparameters.
    :param str load_dataset: Load dataset from file or build it from the parameters.
    :return: None
    """
    if params['RELOAD'] > 0:
        logger.info('Resuming training.')
        # Load data
        if load_dataset is None:
            if params['REBUILD_DATASET']:
                logger.info('Rebuilding dataset.')
                dataset = build_dataset(params)
            else:
                logger.info('Updating dataset.')
                dataset = loadDataset(params['DATASET_STORE_PATH'] + '/Dataset_' + params['DATASET_NAME'] +
                                      '_' + params['SRC_LAN'] + params['TRG_LAN'] + '.pkl')
                epoch_offset = 0 if dataset.len_train == 0 else int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)
                params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else epoch_offset

                for split, filename in iteritems(params['TEXT_FILES']):
                    dataset = update_dataset_from_file(dataset,
                                                       params['DATA_ROOT_PATH'] + '/' + filename + params['SRC_LAN'],
                                                       params,
                                                       splits=list([split]),
                                                       output_text_filename=params['DATA_ROOT_PATH'] + '/' + filename + params['TRG_LAN'],
                                                       remove_outputs=False,
                                                       compute_state_below=True,
                                                       recompute_references=True)
                dataset.name = params['DATASET_NAME'] + '_' + params['SRC_LAN'] + params['TRG_LAN']
                saveDataset(dataset, params['DATASET_STORE_PATH'])
        else:
            logger.info('Reloading and using dataset.')
            dataset = loadDataset(load_dataset)
    else:
        # Load data
        if load_dataset is None:
            dataset = build_dataset(params)
        else:
            dataset = loadDataset(load_dataset)

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Build model
    set_optimizer = params['RELOAD'] == 0
    clear_dirs = params['RELOAD'] == 0

    # Build new model
    nmt_model = TranslationModel(params,
                                 model_type=params['MODEL_TYPE'],
                                 verbose=params['VERBOSE'],
                                 model_name=params['MODEL_NAME'],
                                 vocabularies=dataset.vocabulary,
                                 store_path=params['STORE_PATH'],
                                 set_optimizer=set_optimizer,
                                 clear_dirs=clear_dirs)

    # Define the inputs and outputs mapping from our Dataset instance to our model
    inputMapping = dict()
    for i, id_in in enumerate(params['INPUTS_IDS_DATASET']):
        pos_source = dataset.ids_inputs.index(id_in)
        id_dest = nmt_model.ids_inputs[i]
        inputMapping[id_dest] = pos_source
    nmt_model.setInputsMapping(inputMapping)

    outputMapping = dict()
    for i, id_out in enumerate(params['OUTPUTS_IDS_DATASET']):
        pos_target = dataset.ids_outputs.index(id_out)
        id_dest = nmt_model.ids_outputs[i]
        outputMapping[id_dest] = pos_target
    nmt_model.setOutputsMapping(outputMapping)

    if params['RELOAD'] > 0:
        nmt_model = updateModel(nmt_model, params['STORE_PATH'], params['RELOAD'], reload_epoch=params['RELOAD_EPOCH'])
        nmt_model.setParams(params)
        nmt_model.setOptimizer()
        if params.get('EPOCH_OFFSET') is None:
            params['EPOCH_OFFSET'] = params['RELOAD'] if params['RELOAD_EPOCH'] else \
                int(params['RELOAD'] * params['BATCH_SIZE'] / dataset.len_train)

    # Store configuration as pkl
    dict2pkl(params, params['STORE_PATH'] + '/config')

    # Callbacks
    callbacks = buildCallbacks(params, nmt_model, dataset)

    # Training
    total_start_time = timer()

    logger.debug('Starting training!')
    training_params = {
        'n_epochs': params['MAX_EPOCH'],
        'batch_size': params['BATCH_SIZE'],
        'homogeneous_batches': params['HOMOGENEOUS_BATCHES'],
        'maxlen': params['MAX_OUTPUT_TEXT_LEN'],
        'joint_batches': params['JOINT_BATCHES'],
        # LR decay parameters
        'lr_decay': params.get('LR_DECAY', None),
        'initial_lr': params.get('LR', 1.0),
        'reduce_each_epochs': params.get('LR_REDUCE_EACH_EPOCHS', True),
        'start_reduction_on_epoch': params.get('LR_START_REDUCTION_ON_EPOCH', 0),
        'lr_gamma': params.get('LR_GAMMA', 0.9),
        'lr_reducer_type': params.get('LR_REDUCER_TYPE', 'linear'),
        'lr_reducer_exp_base': params.get('LR_REDUCER_EXP_BASE', 0),
        'lr_half_life': params.get('LR_HALF_LIFE', 50000),
        'lr_warmup_exp': params.get('WARMUP_EXP', -1.5),
        'min_lr': params.get('MIN_LR', 1e-9),
        'epochs_for_save': params['EPOCHS_FOR_SAVE'],
        'verbose': params['VERBOSE'],
        'eval_on_sets': params['EVAL_ON_SETS_KERAS'],
        'n_parallel_loaders': params['PARALLEL_LOADERS'],
        'extra_callbacks': callbacks,
        'reload_epoch': params['RELOAD'],
        'epoch_offset': params.get('EPOCH_OFFSET', 0),
        'data_augmentation': params['DATA_AUGMENTATION'],
        # Early stopping parameters
        'patience': params.get('PATIENCE', 0),
        'metric_check': params.get('STOP_METRIC', None) if params.get('EARLY_STOP', False) else None,
        'eval_on_epochs': params.get('EVAL_EACH_EPOCHS', True),
        'each_n_epochs': params.get('EVAL_EACH', 1),
        'start_eval_on_epoch': params.get('START_EVAL_ON_EPOCH', 0),
        # TensorBoard parameters
        'tensorboard': params.get('TENSORBOARD', False),
        'n_gpus': params.get('N_GPUS', 1),
        'tensorboard_params': {
            'log_dir': params.get('LOG_DIR', 'tensorboard_logs'),
            'histogram_freq': params.get('HISTOGRAM_FREQ', 0),
            'batch_size': params.get('TENSORBOARD_BATCH_SIZE', params['BATCH_SIZE']),
            'write_graph': params.get('WRITE_GRAPH', True),
            'write_grads': params.get('WRITE_GRADS', False),
            'write_images': params.get('WRITE_IMAGES', False),
            'embeddings_freq': params.get('EMBEDDINGS_FREQ', 0),
            'embeddings_layer_names': params.get('EMBEDDINGS_LAYER_NAMES', None),
            'embeddings_metadata': params.get('EMBEDDINGS_METADATA', None),
            'label_word_embeddings_with_vocab': params.get('LABEL_WORD_EMBEDDINGS_WITH_VOCAB', False),
            'word_embeddings_labels': params.get('WORD_EMBEDDINGS_LABELS', None),
        }
    }
    nmt_model.trainNet(dataset, training_params)

    total_end_time = timer()
    time_difference = total_end_time - total_start_time
    logger.info('Total training time: {0:.2f}s ({1:.2f}m).'.format(time_difference, time_difference / 60.0))
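# Minimal usage sketch (not part of the original file): build the parameter dictionary
# with the same helpers main() uses (load_parameters / update_parameters / pkl2dict) and
# launch training. The stored-config path in the commented line is hypothetical.
def run_training_example():
    params = load_parameters()
    # params = update_parameters(params, pkl2dict('saved_config.pkl'))  # optional stored config (hypothetical path)
    train_model(params, load_dataset=None)  # or pass the path of a stored Dataset .pkl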
            'd': params.get('D', 0.5)
        }
    }

    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    logger.info("<<< Using an ensemble of %d models >>>" % len(args.models))

    # Load trainable model(s)
    logger.info('Loading models from %s' % str(args.models))
    model_instances = [TranslationModel(params,
                                        model_type=params['MODEL_TYPE'],
                                        verbose=params['VERBOSE'],
                                        model_name=params['MODEL_NAME'] + '_' + str(i),
                                        vocabularies=dataset.vocabulary,
                                        store_path=params['STORE_PATH'],
                                        clear_dirs=False,
                                        set_optimizer=False)
                       for i in range(len(args.models))]
    models = [updateModel(model, path, -1, full_path=True)
              for (model, path) in zip(model_instances, args.models)]

    # Set additional inputs to models if using a custom loss function
    params['USE_CUSTOM_LOSS'] = True if 'PAS' in params['OPTIMIZER'] else False
    if params.get('N_BEST_OPTIMIZER', False):
        logger.info('Using N-best optimizer')

    models = build_online_models(models, params)
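# Illustrative sketch only (an assumption, not project code): the kind of override
# dictionary that load_parameters_online() / update_parameters() are expected to yield
# for the online-learning branch above, restricted to keys this code actually reads.
# The values shown are the defaults used above.
EXAMPLE_ONLINE_OVERRIDES = {
    'OPTIMIZER': 'SGD',   # a 'PAS'-family optimizer would switch USE_CUSTOM_LOSS on
    'LR': 0.1,
    'K': 1,
    'TAU': 1,
    'LAMBDA': 0.5,
    'C': 0.5,
    'D': 0.5,
    'N_BEST_OPTIMIZER': False,
}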
src_model = loadModel(SRC_MODEL_PATH, epoch_choice)

params = src_model.params
params['USE_CUDNN'] = False
# params['BIDIRECTIONAL_ENCODER'] = False  # Set to False to get the RNN type displayed.
params['MODEL_NAME'] = 'CPU'
params['STORE_PATH'] = DST_MODEL_PATH
params['MODE'] = 'sampling'
params['RELOAD'] = epoch_choice
# params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
# params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

# Build a CPU-compatible model with the same topology, then copy the trained weights into it.
cpu_model = TranslationModel(params,
                             model_type=params['MODEL_TYPE'],
                             verbose=True,
                             model_name=params['MODEL_NAME'],
                             vocabularies=dataset.vocabulary,
                             store_path=params['STORE_PATH'],
                             set_optimizer=True,
                             clear_dirs=True)
cpu_model = updateModel(cpu_model, SRC_MODEL_PATH, params['RELOAD'], reload_epoch=True)
saveModel(cpu_model, update_num=epoch_choice, path=DST_MODEL_PATH, full_path=True)
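# Optional sanity check sketch (an assumption, not part of the original script): reload
# the re-saved model with loadModel, mirroring the full_path=True convention of the
# saveModel call above, to confirm that the CPU copy deserializes as expected.
reloaded_cpu_model = loadModel(DST_MODEL_PATH, epoch_choice, full_path=True)
print(reloaded_cpu_model.params['MODEL_NAME'])  # expected: 'CPU'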