def score_corpus(args, params):
    """
    Score source--target pairs with an ensemble of translation models.

    :param argparse.Namespace args: Arguments given to the method:
        * models: Paths to the models of the ensemble.
        * dataset: Path to the Dataset instance with the data.
        * source: Optional text file with source sentences; when given, the
          dataset is updated from it before scoring.
        * target: Output text filename used when updating the dataset.
        * splits: Dataset splits to score (must exist in the dataset).
        * dest: Output file for the scores (printed to stdout when None).
        * verbose: Verbosity level.
    :param dict params: parameters of the translation model.
    """
    # NOTE(review): loadModel / loadDataset / update_dataset_from_file /
    # BeamSearchEnsemble / list2file / numpy2file are expected to be imported
    # at module level — confirm against the file header.
    # BUGFIX: the two `print` statements below used Python 2 statement syntax,
    # which is a SyntaxError under Python 3 (used by the rest of this file).
    print("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    if args.source is not None:
        dataset = update_dataset_from_file(dataset, args.source, params,
                                           splits=args.splits,
                                           output_text_filename=args.target,
                                           compute_state_below=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Apply scoring
    extra_vars = dict()
    # NOTE(review): eval() on a config-supplied string is unsafe if the config
    # is untrusted; getattr(dataset, params['TOKENIZATION_METHOD']) would be
    # the safer equivalent for a plain attribute name.
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])
    for s in args.splits:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}
        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            # With per-batch padding the decoder input length adapts (-1);
            # otherwise pad to the configured maximum output length.
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
            params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
            params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
        beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction,
                                           verbose=args.verbose)
        scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] +
                                ' is not currently supported.')
        else:
            print(scores)
def score_corpus(args, params):
    """
    Use one or several translation models for scoring source--target pairs.

    :param argparse.Namespace args: Arguments given to the method:
        * dataset: Dataset instance with data.
        * source: Text file with source sentences.
        * target: Text file with target sentences.
        * splits: Splits to sample. Should be already included in the dataset object.
        * dest: Output file to save scores.
        * weights: Weight given to each model in the ensemble. You should provide
          the same number of weights than models. By default, it applies the same
          weight to each model (1/N).
        * verbose: Be verbose or not.
        * config: Config .pkl for loading the model configuration. If not
          specified, hyperparameters are read from config.py.
        * models: Path to the models.
    :param dict params: parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.model_ensemble import BeamSearchEnsemble

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.source, params,
                                       splits=args.splits,
                                       output_text_filename=args.target,
                                       compute_state_below=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Apply scoring
    extra_vars = dict()
    # NOTE(review): eval() on a config-supplied string is unsafe if the config
    # is untrusted; getattr(dataset, params['TOKENIZATION_METHOD']) would be
    # the safer equivalent for a plain attribute name.
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])

    model_weights = args.weights
    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        # BUGFIX: map() returns a lazy iterator on Python 3, so the original
        # `map(float, model_weights)` made the len() check below raise
        # TypeError; materialize it into a list.
        model_weights = list(map(float, model_weights))
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}
        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            # With per-batch padding the decoder input length adapts (-1);
            # otherwise pad to the configured maximum output length.
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
            params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
            params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
            params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                               'transformer' in params['MODEL_TYPE'].lower())
        beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction,
                                           model_weights=model_weights,
                                           verbose=args.verbose)
        scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] +
                                ' is not currently supported.')
        else:
            print(scores)
# NOTE(review): orphaned fragment — this begins mid-statement ("'MINLEN_GIVEN_X_FACTOR', 2)")
# and its enclosing `def` header is not visible for this copy; it appears to be the leftover
# tail of yet another duplicate of score_corpus (it still uses the Python 2 `print scores`
# statement at the end). Left byte-identical; candidate for deletion once confirmed dead.
'MINLEN_GIVEN_X_FACTOR', 2) mapping = None if dataset.mapping == dict() else dataset.mapping if params['POS_UNK']: params_prediction['heuristic'] = params['HEURISTIC'] input_text_id = params['INPUTS_IDS_DATASET'][0] vocab_src = dataset.vocabulary[input_text_id]['idx2words'] else: input_text_id = None vocab_src = None mapping = None beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction, verbose=args.verbose) scores = beam_searcher.scoreNet()[s] # Store result if args.dest is not None: filepath = args.dest # results file if params['SAMPLING_SAVE_MODE'] == 'list': list2file(filepath, scores) elif params['SAMPLING_SAVE_MODE'] == 'numpy': numpy2file(filepath, scores) else: raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] + ' is not currently supported.') else: print scores
def score_corpus(args, params):
    """
    Score source--target pairs with an ensemble of translation models.

    :param argparse.Namespace args: Arguments given to the method:
        * models: Paths to the models of the ensemble.
        * dataset: Path to the Dataset instance with the data.
        * source: Text file with source sentences (dataset is updated from it).
        * target: Text file with target sentences.
        * splits: Splits to score. Should already exist in the dataset object.
        * dest: Output file to save scores (printed to stdout when None).
        * weights: Weight given to each model in the ensemble; one per model.
        * verbose: Verbosity level.
    :param dict params: parameters of the translation model.
    """
    from data_engine.prepare_data import update_dataset_from_file
    from keras_wrapper.dataset import loadDataset
    from keras_wrapper.cnn_model import loadModel
    from keras_wrapper.model_ensemble import BeamSearchEnsemble

    logging.info("Using an ensemble of %d models" % len(args.models))
    models = [loadModel(m, -1, full_path=True) for m in args.models]
    dataset = loadDataset(args.dataset)
    dataset = update_dataset_from_file(dataset, args.source, params,
                                       splits=args.splits,
                                       output_text_filename=args.target,
                                       compute_state_below=True)
    params['INPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['INPUTS_IDS_DATASET'][0]]
    params['OUTPUT_VOCABULARY_SIZE'] = dataset.vocabulary_len[params['OUTPUTS_IDS_DATASET'][0]]

    # Apply scoring
    extra_vars = dict()
    # NOTE(review): eval() on a config-supplied string is unsafe if the config
    # is untrusted; getattr(dataset, params['TOKENIZATION_METHOD']) would be
    # the safer equivalent for a plain attribute name.
    extra_vars['tokenize_f'] = eval('dataset.' + params['TOKENIZATION_METHOD'])

    model_weights = args.weights
    if model_weights is not None and model_weights != []:
        assert len(model_weights) == len(models), \
            'You should give a weight to each model. You gave %d models and %d weights.' % \
            (len(models), len(model_weights))
        # BUGFIX: the original `map(lambda x: float(x), ...)` returns a lazy
        # iterator on Python 3, so the len() check below raised TypeError.
        # A list comprehension materializes the weights (and drops the
        # redundant lambda wrapper around float).
        model_weights = [float(w) for w in model_weights]
        if len(model_weights) > 1:
            logger.info('Giving the following weights to each model: %s' % str(model_weights))

    for s in args.splits:
        # Apply model predictions
        params_prediction = {'max_batch_size': params['BATCH_SIZE'],
                             'n_parallel_loaders': params['PARALLEL_LOADERS'],
                             'predict_on_sets': [s]}
        if params['BEAM_SEARCH']:
            params_prediction['beam_size'] = params['BEAM_SIZE']
            params_prediction['maxlen'] = params['MAX_OUTPUT_TEXT_LEN_TEST']
            params_prediction['optimized_search'] = params['OPTIMIZED_SEARCH']
            params_prediction['model_inputs'] = params['INPUTS_IDS_MODEL']
            params_prediction['model_outputs'] = params['OUTPUTS_IDS_MODEL']
            params_prediction['dataset_inputs'] = params['INPUTS_IDS_DATASET']
            params_prediction['dataset_outputs'] = params['OUTPUTS_IDS_DATASET']
            params_prediction['normalize_probs'] = params.get('NORMALIZE_SAMPLING', False)
            params_prediction['alpha_factor'] = params.get('ALPHA_FACTOR', 1.0)
            params_prediction['coverage_penalty'] = params.get('COVERAGE_PENALTY', False)
            params_prediction['length_penalty'] = params.get('LENGTH_PENALTY', False)
            params_prediction['length_norm_factor'] = params.get('LENGTH_NORM_FACTOR', 0.0)
            params_prediction['coverage_norm_factor'] = params.get('COVERAGE_NORM_FACTOR', 0.0)
            params_prediction['pos_unk'] = params.get('POS_UNK', False)
            # With per-batch padding the decoder input length adapts (-1);
            # otherwise pad to the configured maximum output length.
            params_prediction['state_below_maxlen'] = -1 if params.get('PAD_ON_BATCH', True) \
                else params.get('MAX_OUTPUT_TEXT_LEN', 50)
            params_prediction['output_max_length_depending_on_x'] = params.get('MAXLEN_GIVEN_X', True)
            params_prediction['output_max_length_depending_on_x_factor'] = params.get('MAXLEN_GIVEN_X_FACTOR', 3)
            params_prediction['output_min_length_depending_on_x'] = params.get('MINLEN_GIVEN_X', True)
            params_prediction['output_min_length_depending_on_x_factor'] = params.get('MINLEN_GIVEN_X_FACTOR', 2)
            params_prediction['attend_on_output'] = params.get('ATTEND_ON_OUTPUT',
                                                               'transformer' in params['MODEL_TYPE'].lower())
        beam_searcher = BeamSearchEnsemble(models, dataset, params_prediction,
                                           model_weights=model_weights,
                                           verbose=args.verbose)
        scores = beam_searcher.scoreNet()[s]

        # Store result
        if args.dest is not None:
            filepath = args.dest  # results file
            if params['SAMPLING_SAVE_MODE'] == 'list':
                list2file(filepath, scores)
            elif params['SAMPLING_SAVE_MODE'] == 'numpy':
                numpy2file(filepath, scores)
            else:
                raise Exception('The sampling mode ' + params['SAMPLING_SAVE_MODE'] +
                                ' is not currently supported.')
        else:
            print(scores)