def build_model(tparams, options):
    """
    @first: build the f_tt graph (earlier version; shadowed by the
    redefinition of build_model below)
    """
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams, old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v', dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    # look up the output-embedding column of each target word and weight it
    # by the decoder output state tt
    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, old_options['dim_word']])
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer(
            (2, n_samples, options['dim']), use_noise, trng,
            retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer(
            (2, n_samples, options['dim_word']), use_noise, trng,
            retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout
    #emb = get_qv_w2c(emb, y, w2v, dim=500)

    # target-side GRU over the weighted embeddings
    proj = gru_layer(tparams, emb, options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)
    hh = proj[0][-1, :, :]  # final hidden state

    y_pred = tensor.dot(hh, tparams['final_W'])  # this already gives decent results
    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh, tparams['W']))

    # overwrite the NMT cost with the mean absolute error against gold HTER
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
def build_model(tparams, options):
    """
    @first: get the f_tt function; this redefinition is the one in effect
    and reuses tparams for the NMT sub-graph
    """
    nmt_options = load_config(options['nmt_model'])
    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, nmt_cost, ctx, tt = nmt.build_model(tparams, nmt_options)

    # *** tparams ***
    # drop the network parameter ff_logit_b: it is not needed to compute the cost
    tparams.pop('ff_logit_b')

    hter = tensor.matrix('hter', dtype='float32')
    Wt = tparams['ff_logit_W']  # old_tparams['ff_logit_W']

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    # look up the output-embedding column of each target word and weight it
    # by the decoder output state tt
    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, options['dim_word']])
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer(
            (2, n_samples, options['dim']), use_noise, trng,
            retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer(
            (2, n_samples, options['dim_word']), use_noise, trng,
            retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1), use_noise, trng,
            retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout, (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    # target-side GRU over the weighted embeddings
    proj = gru_layer(tparams, emb, options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)
    hh = proj[0][-1, :, :]  # final hidden state

    y_pred = tensor.dot(hh, tparams['final_W'])  # this already gives decent results
    # mean absolute error against the gold HTER scores
    final_cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]
    cost = final_cost

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost
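# --- usage sketch (illustrative, not part of the original pipeline) ---
# How the graph returned above is typically consumed: compile a predictor and
# a cost function, then take gradients of the MAE cost w.r.t. the QE
# parameters. The helper name _compile_qe_functions and the optimizer wiring
# are assumptions, not taken from this file.
def _compile_qe_functions(tparams, options):
    trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost = \
        build_model(tparams, options)
    f_pred = theano.function([x, x_mask, y, y_mask], y_pred, name='f_pred')
    f_cost = theano.function([x, x_mask, y, y_mask, hter], cost, name='f_cost')
    # gradients to feed an optimizer such as adam/adadelta
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    return f_pred, f_cost, grads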
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract encoder hidden-state features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams, options)

    # load data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        # ctx stacks [forward; backward] states (1024 == options['dim']);
        # take the backward state at the first position and the forward state
        # at the last position as the sentence representation
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x), list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
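# --- toy illustration (assumed shapes; dim stands in for options['dim']) ---
# ctx stacks the bidirectional encoder states as [forward; backward] along the
# last axis, so ctx[0, :, dim:] is the backward RNN after reading the whole
# sentence and ctx[-1, :, :dim] is the forward RNN after reading all of it.
def _demo_sentence_repr(len_x=7, batch=2, dim=4):
    ctx = numpy.zeros((len_x, batch, 2 * dim), dtype='float32')
    sent_repr = numpy.concatenate(
        [ctx[0, :, dim:],     # backward final state
         ctx[-1, :, :dim]],   # forward final state
        axis=1)               # -> (batch, 2*dim)
    assert sent_repr.shape == (batch, 2 * dim)
    return sent_repr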
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function: extract quality vectors
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, options)

    # load data
    train = TextIterator(
        options['datasets'][0], options['datasets'][1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order
    dev = TextIterator(
        options['valid_datasets'][0], options['valid_datasets'][1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['valid_batch_size'],
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')
    #print tparams['ff_logit_W'].get_value().shape  #### (500, 40000)

    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    # elementwise product of the decoder state for sample j
                    # with the target word's output-embedding column
                    qv = tt_[i, j, :] * Wt[:, index]
                    qv_.append(list(qv))
            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'w') as fp:
                pkl.dump(qv_, fp)
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
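# --- self-contained check of the quality-vector definition (toy sizes) ---
# The quality vector of target word y_ij is the elementwise product of the
# decoder output state tt with that word's column of ff_logit_W, i.e. the
# per-dimension contributions to the word's logit; they sum to its score.
# Sizes below stand in for dim_word=500 and the real vocabulary.
def _demo_quality_vector(dim_word=5, n_words=11, index=3):
    tt_ij = numpy.random.rand(dim_word).astype('float32')
    Wt = numpy.random.rand(dim_word, n_words).astype('float32')
    qv = tt_ij * Wt[:, index]  # quality vector, shape (dim_word,)
    assert numpy.allclose(qv.sum(), numpy.dot(tt_ij, Wt[:, index]))
    return qv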
def alignment(model='model/model.npz.best_bleu',
              train=['test/train.bpe.en', 'test/train.bpe.es'],
              test=['test/test.bpe.en', 'test/test.bpe.es'],
              batch_size=10):
    """
    @function: extract word-alignment (attention) features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load data
    train = TextIterator(train[0], train[1],
                         options['dictionaries'][0], options['dictionaries'][1],
                         n_words_source=options['n_words_src'],
                         n_words_target=options['n_words_tgt'],
                         batch_size=batch_size,
                         maxlen=1000,            # large enough to keep every sentence
                         sort_by_length=False)   # keep corpus order
    test = TextIterator(test[0], test[1],
                        options['dictionaries'][0], options['dictionaries'][1],
                        n_words_source=options['n_words_src'],
                        n_words_target=options['n_words_tgt'],
                        batch_size=batch_size,
                        maxlen=1000,            # large enough to keep every sentence
                        sort_by_length=False)   # keep corpus order

    f_align = theano.function([x, x_mask, y, y_mask], opt_ret, name='f_align')

    #################### train #######################
    # disabled; identical to the test loop below except for the output path
    """
    n_samples = 0
    for x, y in train:
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # zero out padded target rows
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]  # word-alignment matrix
            with open('features/alignment/train.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.writelines('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.writelines('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
    """

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # zero out padded target rows
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))
            col_ = int(numpy.sum(x_mask[:, j]))
            align_data = align[:row_, j, :col_]  # word-alignment matrix
            with open('features/alignment/test.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.writelines('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.writelines('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
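# --- reader sketch for the files written above (helper name is ours) ---
# Each sentence is a block of tab-separated rows (one row per target word)
# terminated by a blank line.
def read_alignments(path='features/alignment/test.en-es.word.align'):
    matrices, block = [], []
    with open(path) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line:  # blank line ends a sentence block
                if block:
                    matrices.append(numpy.array(block, dtype='float32'))
                    block = []
            else:
                block.append([float(v) for v in line.split('\t')])
    return matrices  # matrices[k] has shape (len_y_k, len_x_k)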
def main(model, dictionary, dictionary_target, source_file, target_file,
         gold_align, saveto, k=5, pkl_file=None, normalize=False,
         output_attention=False):
    # load model options
    # if pkl_file is None:
    #     pkl_file = model + '.pkl'
    # with open(pkl_file, 'rb') as f:
    #     options = pkl.load(f)
    options = load_config(model)
    options['factor'] = 1

    # load source dictionary and invert
    word_dict = load_dict(dictionary)
    word_idict = dict()  # id2word
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # utility function
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict_trg[w])
            capsw.append(' '.join(ww))
        return capsw

    def _send_jobs(fx_name, fy_name):
        retval = []
        retval_ori = []
        with open(fx_name, 'r') as fx, open(fy_name, 'r') as fy:
            for idx, (line_x, line_y) in enumerate(zip(fx, fy)):
                words = line_x.strip().split()
                x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
                x = map(lambda ii: ii if ii < options['n_words_src'] else 1, x)
                # x += [0]
                words = line_y.strip().split()
                y = map(lambda w: word_dict_trg[w] if w in word_dict_trg else 1,
                        words)
                y = map(lambda ii: ii if ii < options['n_words'] else 1, y)
                # y += [0]
                retval_ori.append((line_x.strip(), line_y.strip()))
                retval.append((x, y))
        return retval, retval_ori

    # load params, skipping the Adam optimizer state
    param_list = numpy.load(model).files
    param_list = dict.fromkeys(
        [key for key in param_list if not key.startswith('adam_')], 0)
    params = load_params(model, param_list)
    tparams = init_theano_params(params)

    # build model
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, options)
    inps = [x, x_mask, y, y_mask]

    # compile f_align
    logging.info('Building f_align...')
    # f_align = theano.function(inps, opt_ret['dec_alphas'], profile=profile)
    f_align = theano.function(inps, cost, profile=profile)
    logging.info('Done')

    print 'Processing ', source_file, '...'
    sys.stdout.flush()

    n_samples, n_samples_src = _send_jobs(source_file, target_file)
    atts = []
    idx = 0

    def _prepare_data(x, y):
        # single-sentence batch: no padding, so the masks are all ones
        x = numpy.array([x]).T
        y = numpy.array([y]).T
        return x[None, :, :], numpy.ones_like(x, dtype='float32'), \
            y, numpy.ones_like(y, dtype='float32')

    start_time = datetime.datetime.now()
    words = 0.
    for (x, y) in n_samples:
        x, x_mask, y, y_mask = _prepare_data(x, y)
        att = f_align(x, x_mask, y, y_mask)  # (len_y, nsample=1, len_x)
        # att = numpy.squeeze(att, 1)
        # atts.append(att.T)  # re-enable to collect attention for the output below
    last = datetime.datetime.now() - start_time
    print last.total_seconds(), len(n_samples) / last.total_seconds()

    def _force_decode(x, y):
        # sample given an input sequence and obtain scores;
        # assumes f_force_decode has been compiled elsewhere (unused here)
        att = f_force_decode(numpy.array(x)[:, None], numpy.array(y)[:, None])
        _output_attention(0, att[0].squeeze(1).T)

    def _output_attention(sent_idx, att):
        dirname = saveto + '.attention'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        with open(dirname + '/' + str(sent_idx), 'w') as fp:
            fp.write("%d %d\n" % (att.shape[0], att.shape[1]))
            for row in att:
                # fp.write(str(row.argmax()) + " " + ' '.join([str(x) for x in row]) + '\n')
                fp.write('[' + ','.join([str(x) for x in row]) + '],')

    if output_attention:
        # NOTE: atts stays empty unless atts.append(...) above is re-enabled
        with open(saveto + '.att', 'w') as f:
            for idx, ((x, y), att) in enumerate(zip(n_samples_src, atts)):
                print >> f, ' '.join([
                    "{}:{}".format(idx + 1, hehe.argmax() + 1)
                    for idx, hehe in enumerate(att)
                ])

        with open(saveto + '.att') as f_att, open(gold_align) as f_gold:
            AER = []
            count_S, count_P, len_A, len_S = 0., 0., 0., 0.
            for idx, (cand, gold) in enumerate(zip(f_att, f_gold)):
                aer, count_s, count_p, len_a, len_s = calc_aer(cand, gold)
                AER.append(aer)
                count_S += count_s
                count_P += count_p
                len_A += len_a
                len_S += len_s
            ave_AER = numpy.average(AER)
            overall_AER = 1 - (count_S + count_P) / (len_A + len_S)
            print 'ave_AER ', ave_AER
            print 'overall_AER ', overall_AER

    # ipdb.set_trace()  # debugging breakpoint, disabled
    print 'Done'
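# --- parser sketch for the .att format written above (helper name is ours) ---
# Each line holds space-separated "tgt:src" links, 1-based, where src is the
# attention argmax for that target position.
def read_att_line(line):
    links = []
    for tok in line.split():
        t, s = tok.split(':')
        links.append((int(t), int(s)))  # (target position, source position)
    return links

# e.g. read_att_line("1:1 2:3 3:2") -> [(1, 1), (2, 3), (3, 2)]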
def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function: extract log-likelihood features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order

    # cost is the per-sentence negative log-likelihood
    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x), list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
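# --- loader sketch for the .logprob feature files (helper name is ours) ---
# One sentence-level negative log-likelihood per line; a length-normalized
# variant (logprob / target length) is a common QE feature but is not
# computed in this file.
def load_logprob(path='features/test.es-en.logprob'):
    with open(path) as fp:
        return numpy.array([float(line) for line in fp], dtype='float32')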
def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function: extract quality-vector features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # large enough to keep every sentence
        sort_by_length=False)   # keep corpus order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            # sentence-level feature: mean of the word-level quality vectors
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/train.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            qv_ = numpy.array(qv_)
            # sentence-level feature: mean of the word-level quality vectors
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/test.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
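# --- loader sketch for the .qv feature files (helper name is ours) ---
# One mean quality vector per line, tab-separated, ready to feed a
# downstream HTER regressor.
def load_qv(path='features/test.nmt.qv'):
    with open(path) as fp:
        feats = [[float(v) for v in line.split('\t')] for line in fp]
    return numpy.array(feats, dtype='float32')  # (n_sentences, dim_word)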