def _score(pairs):
    """Score a batch of sentence pairs with every model in the ensemble.

    Returns a list with one entry per scoring function in
    ``fs_log_probs`` (from the enclosing scope); each entry is the
    result of ``pred_probs`` for the corresponding model.
    """
    # `normalize` and `options` also come from the enclosing scope.
    return [
        pred_probs(log_prob_fn, prepare_data, options[model_idx], pairs,
                   normalize=normalize)
        for model_idx, log_prob_fn in enumerate(fs_log_probs)
    ]
def _score(pairs, alignweights=False):
    """Score a batch of sentence pairs with every model in the ensemble.

    Parameters
    ----------
    pairs : batch of (source, target) pairs to score
    alignweights : bool
        Forwarded to ``pred_probs``; when true the scorer also computes
        attention weights.

    Returns a list with one score set per model in ``fs_log_probs``.
    """
    # `normalize` and `options` come from the enclosing scope.
    return [
        pred_probs(log_prob_fn, prepare_data, options[model_idx], pairs,
                   normalize=normalize, alignweights=alignweights)
        for model_idx, log_prob_fn in enumerate(fs_log_probs)
    ]
def _score(pairs, alignweights=False):
    """Score a batch of sentence pairs with every model in the ensemble.

    Returns
    -------
    (scores, alignments) : two parallel lists, one entry per model in
        ``fs_log_probs`` (from the enclosing scope).
    """
    scores = []
    alignments = []
    # `normalize` and `options` come from the enclosing scope.
    for model_idx, log_prob_fn in enumerate(fs_log_probs):
        result = pred_probs(log_prob_fn, prepare_data, options[model_idx],
                            pairs, normalize=normalize,
                            alignweights=alignweights)
        scores.append(result[0])
        alignments.append(result[1])
    return scores, alignments
def _score(pairs, alignweights=False):
    """Score a batch of sentence pairs with every configured model.

    Loads a scorer for each model listed in ``rescorer_settings.models``
    (enclosing scope) and runs ``pred_probs`` on the batch.

    Returns
    -------
    (scores, alignments) : two parallel lists, one entry per model.
    """
    scores = []
    alignments = []
    for model_idx, model_path in enumerate(rescorer_settings.models):
        log_prob_fn = load_scorer(model_path, options[model_idx],
                                  alignweights=alignweights)
        result = pred_probs(
            log_prob_fn, prepare_data, options[model_idx], pairs,
            normalization_alpha=rescorer_settings.normalization_alpha,
            alignweights=alignweights)
        scores.append(result[0])
        alignments.append(result[1])
    return scores, alignments
def get_error(model, test_src, test_target):
    """Compute and log the mean validation cost of a saved model.

    Parameters
    ----------
    model : str
        Path to the saved model parameters; '<model>.pkl' holds the
        training options.
    test_src, test_target : str
        Paths to the source/target sides of the evaluation corpus.
    """
    profile = False

    # Reload options.  Fix: the original opened the pickle file and
    # never closed it; a context manager guarantees closure.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)
    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    # A single dictionary means the target side shares it (None here).
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src, dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
def get_error(model, test_src, test_target):
    """Compute and log the mean validation cost of a saved model.

    Parameters
    ----------
    model : str
        Path to the saved model parameters; '<model>.pkl' holds the
        training options.
    test_src, test_target : str
        Paths to the source/target sides of the evaluation corpus.
    """
    profile = False

    # Reload options.  Fix: the original opened the pickle file and
    # never closed it; a context manager guarantees closure.
    with open('%s.pkl' % model, 'rb') as f:
        model_options = pkl.load(f)
    logging.info(model_options)

    logging.info('Building model')
    params = init_params(model_options)
    # reload parameters
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    dict_src = os.path.join(model_options['baseDir'],
                            model_options['dictionaries'][0])
    # A single dictionary means the target side shares it (None here).
    if len(model_options['dictionaries']) == 1:
        dict_target = None
    else:
        dict_target = os.path.join(model_options['baseDir'],
                                   model_options['dictionaries'][1])

    valid = TextIterator(test_src, test_target,
                         dict_src, dict_target,
                         n_words_source=model_options['n_words_src'],
                         n_words_target=model_options['n_words'],
                         batch_size=model_options['valid_batch_size'],
                         maxlen=model_options['maxlen'])

    logging.info('Building f_log_probs...')
    f_log_probs = theano.function(inps, cost, profile=profile)

    valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
    valid_err = valid_errs.mean()
    logging.info('Valid Error:%s' % (str(valid_err)))
def _score(pairs, alignweights=True):
    """Score a batch of sentence pairs with every configured model.

    Returns
    -------
    (scores, sent_alignments, costs_per_word) : three parallel lists,
        one entry per model in ``models`` (enclosing scope).
    """
    scores = []
    sent_alignments = []
    costs_per_word = []
    for model_idx, model_path in enumerate(models):
        # TODO: make multi ?
        log_prob_fn = load_scorer(model_path, options[model_idx],
                                  alignweights=alignweights)
        batch_score, batch_alignments, batch_word_costs = pred_probs(
            log_prob_fn, prepare_data, options[model_idx], pairs,
            normalization_alpha=normalization_alpha,
            alignweights=alignweights)
        scores.append(batch_score)
        sent_alignments.append(batch_alignments)
        costs_per_word.append(batch_word_costs)
    return scores, sent_alignments, costs_per_word
def main(model, src_dict, target_dict, source_file, target_file, saveto, source_word_level=1, target_word_level=0, valid_batch_size=128, n_words_src=302, n_words=302): from char_base import (init_params, build_model, build_sampler) from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams from nmt import (pred_probs, prepare_data) # load model model_options pkl_file = model.split('.')[0] + '.pkl' with open(pkl_file, 'rb') as f: options = pkl.load(f) trng = RandomStreams(1234) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, options) inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, options, trng, use_noise) print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost) print 'Done' print('Preparing dataset...') dataset = TextIterator(source=source_file, target=target_file, source_dict=src_dict, target_dict=target_dict, n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) print('Predicting probs...') log_probs = pred_probs(f_log_probs, prepare_data, options, dataset, verboseFreq=10000) print('Done...') output_file = open(saveto, 'w') pwd_cnt = 0 for line in open(target_file): output_file.writelines(line.rstrip() + '\t' + str(1.0 / (math.e**log_probs[pwd_cnt])) + '\n') pwd_cnt += 1 """ for prob in log_probs: output_file.writelines(str(prob) + '\n') """ output_file.flush() output_file.close() print('Evaluation finished...')
def main(model_dir, model_pkl, model_grads, dict_src, dict_trg, hyp_filename, saveto, n_words_src, n_words, workdir): print 'Loading model.' model_file = os.path.join(model_dir, model_pkl) with open(model_file, 'rb') as f: model_options = pkl.load(f) param_file = os.path.join(model_dir, model_grads) params = init_params(model_options) params = load_params(param_file, params) tparams = init_tparams(params) # load dictionary and invert with open(dict_src, 'rb') as f: word_dict = pkl.load(f) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk with open(dict_trg, 'rb') as f: word_dict_trg = pkl.load(f) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk temp_dir = workdir print 'Using temp directory', temp_dir hyp_src_fname = os.path.join( temp_dir, '%s.src.%d' % (os.path.basename(hyp_filename), int(time.time()))) hyp_trg_fname = os.path.join( temp_dir, '%s.trg.%d' % (os.path.basename(hyp_filename), int(time.time()))) print 'hyp temp:', hyp_src_fname print 'hyp temp:', hyp_trg_fname hyp_src = open(hyp_src_fname, 'w') hyp_trg = open(hyp_trg_fname, 'w') with open(hyp_filename, 'r') as f: for line in f: toks = line.strip().split('\t') hyp_src.write('%s\n' % toks[0].strip()) hyp_trg.write('%s\n' % toks[1].strip()) hyp_src.close() hyp_trg.close() test = TextIterator(source=hyp_src_fname, target=hyp_trg_fname, source_dict=dict_src, target_dict=dict_trg, n_words_source=n_words_src, n_words_target=n_words, source_word_level=0, target_word_level=0, batch_size=1, sort_size=1) #?? dunno what this param does print 'Building model...\n', trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) inps = [x, x_mask, y, y_mask] ''' # TODO maybe don't need this f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) ''' print 'Building f_log_probs...' f_log_probs = theano.function(inps, cost, profile=profile) use_noise.set_value(0.) 
test_scores = pred_probs(f_log_probs, prepare_data, model_options, test, 5) print test_scores.mean() os.remove(hyp_src_fname) os.remove(hyp_trg_fname) test_scores = [str(f) for f in test_scores] with open(saveto, 'w') as f: f.write(u'\n'.join(test_scores).encode('utf-8')) f.write(u'\n') print "Done", saveto
def main(model, pklmodel, valid_datasets=['../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok'], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'], dictionary_chunk='/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', result_file='./cost.result'): # load the dictionaries of both source and target # load dictionaries and invert them worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = pkl.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk # dict for chunk label worddict_chunk = [None] worddict_r_chunk = [None] with open(dictionary_chunk, 'rb') as f: worddict_chunk = pkl.load(f) worddict_r_chunk = dict() for kk, vv in worddict_chunk.iteritems(): worddict_r_chunk[vv] = kk print worddict_chunk print 'load model model_options' with open('%s' % pklmodel, 'rb') as f: options = pkl.load(f) # build valid set valid = TrainingTextIterator(valid_datasets[0], valid_datasets[1], dictionaries[0], dictionaries[1], dictionary_chunk, n_words_source=options['n_words_src'], n_words_target=options['n_words'], batch_size=options['batch_size'], max_chunk_len=options['maxlen_chunk'], max_word_len=options['maxlen_chunk_words']) # allocate model parameters params = init_params(options) # load model parameters and set theano shared variables params = load_params(model, params) tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator, \ opt_ret, \ cost, cost_cw= \ build_model(tparams, options) inps = [x, x_mask, y_chunk, y_mask, y_cw, y_chunk_indicator] # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=False) f_log_probs_cw = theano.function(inps, cost_cw, profile=False) print 'Done' valid_errs, valid_errs_cw = 
pred_probs(f_log_probs, f_log_probs_cw, prepare_training_data, options, valid) valid_err = valid_errs.mean() valid_err_cw = valid_errs_cw.mean() with open(result_file, 'w') as result_file: print >> result_file, valid_err, valid_err_cw
def main(model, dictionary, dictionary_target, source, target, outfile,
         wordbyword):
    """Score a parallel corpus with a trained model and save the costs.

    When `wordbyword` is truthy, per-word costs are computed and pickled
    to `outfile`; otherwise per-sentence costs are saved via numpy.
    """
    # load model model_options
    with open('%s.pkl' % model, 'rb') as f:
        options = pkl.load(f)

    # (dictionary loading/inversion was disabled in the original and is
    # not needed for scoring)

    eval_iter = TextIterator(source, target,
                             dictionary, dictionary_target,
                             n_words_source=options['n_words_src'],
                             n_words_target=options['n_words'],
                             batch_size=options['valid_batch_size'],
                             maxlen=2000, shuffle=False)

    # allocate model parameters
    params = init_params(options)
    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, cost_ = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    if wordbyword:
        # per-word costs: use the word-level cost graph and pickle a list
        word_cost_fn = theano.function(inps, cost_, profile=profile)
        errs = pred_probs(word_cost_fn, prepare_data, options,
                          eval_iter, verbose=True, as_list=True)
        with open(outfile, 'wb') as f:
            pkl.dump(errs, f, pkl.HIGHEST_PROTOCOL)
    else:
        # per-sentence costs: save as a numpy array
        sent_cost_fn = theano.function(inps, cost, profile=profile)
        errs = pred_probs(sent_cost_fn, prepare_data, options,
                          eval_iter, verbose=True)
        numpy.save(outfile, errs)