def init_params(options):
    params = OrderedDict()

    # load the NMT model parameters
    nmt_options = load_config(options['nmt_model'])
    nmt_params = nmt.init_params(nmt_options)
    nmt_params = load_params(options['nmt_model'], nmt_params)
    for key in nmt_params:
        params[key] = nmt_params[key]

    # load the final RNN model parameters
    final_options = load_config(options['final_model'])
    final_params = rnn.init_params(final_options)
    final_params = load_params(options['final_model'], final_params)
    for key in final_params:
        params[key] = final_params[key]

    return params

def build_model(tparams, options):
    """
    @first: obtain the NMT graph's tt output (the f_tt function),
            then build the HTER regressor on top of it
    """
    old_options = load_config(options['nmt_model'])
    params = nmt.init_params(old_options)
    params = load_params(options['nmt_model'], params)
    old_tparams = init_theano_params(params)
    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = nmt.build_model(old_tparams, old_options)

    hter = tensor.matrix('hter', dtype='float32')
    Wt = old_tparams['ff_logit_W']
    #w2v = tensor.matrix('w2v', dtype='float32')

    n_timesteps = y.shape[0]
    n_samples = y.shape[1]

    # look up the output embedding of each target word and gate it with tt
    emb = Wt.T[y.flatten()]
    emb = emb.reshape([n_timesteps, n_samples, 500])  # 500 = hard-coded dim_word
    emb = emb * tt

    # whether to use dropout
    if options['use_dropout']:
        retain_probability_emb = 1 - options['dropout_embedding']
        retain_probability_hidden = 1 - options['dropout_hidden']
        retain_probability_source = 1 - options['dropout_source']
        if options['model_version'] < 0.1:
            scaled = False
        else:
            scaled = True
        rec_dropout = shared_dropout_layer(
            (2, n_samples, options['dim']),
            use_noise, trng, retain_probability_hidden, scaled)
        emb_dropout = shared_dropout_layer(
            (2, n_samples, options['dim_word']),
            use_noise, trng, retain_probability_emb, scaled)
        source_dropout = shared_dropout_layer(
            (n_timesteps, n_samples, 1),
            use_noise, trng, retain_probability_source, scaled)
        source_dropout = tensor.tile(source_dropout,
                                     (1, 1, options['dim_word']))
    else:
        rec_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))
        emb_dropout = theano.shared(numpy.array([1.] * 2, dtype='float32'))

    #if options['use_dropout']:
    #    emb *= source_dropout

    #emb = get_qv_w2c(emb, y, w2v, dim=500)
    proj = gru_layer(tparams, emb, options,
                     prefix='final_encoder',
                     mask=y_mask,
                     emb_dropout=emb_dropout,
                     rec_dropout=rec_dropout,
                     profile=False)
    hh = proj[0][-1, :, :]

    y_pred = tensor.dot(hh, tparams['final_W'])  # this already gives decent results
    #y_pred = tensor.nnet.sigmoid(tensor.dot(hh, tparams['W']))

    # mean absolute error against the gold HTER scores
    cost = tensor.abs_(y_pred - hter).mean(axis=0)[0]

    return trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost

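# --- Usage sketch (illustrative, not part of the original pipeline) ---
# A minimal example of how build_model's outputs can be compiled into Theano
# functions for the HTER regressor. The helper name
# `_example_compile_qe_functions` and the variable names `f_pred` / `f_cost`
# are assumptions, not names from the original code.
def _example_compile_qe_functions(model_path='model/model.npz.best_bleu'):
    options = load_config(model_path)
    params = load_params(model_path, init_params(options))
    tparams = init_theano_params(params)
    trng, use_noise, x, x_mask, y, y_mask, hter, y_pred, cost = \
        build_model(tparams, options)
    # f_pred scores a prepared batch; f_cost additionally takes gold HTER
    f_pred = theano.function([x, x_mask, y, y_mask], y_pred)
    f_cost = theano.function([x, x_mask, y, y_mask, hter], cost)
    return f_pred, f_cost
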
def encoder_hidden(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract encoder hidden-state features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, decoderh = build_model(tparams, options)

    # load the data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    f_ctx = theano.function([x, x_mask], ctx, name='f_ctx')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        # ctx stacks [forward; backward] GRU states per source position:
        # concatenate the backward state at the first timestep with the
        # forward state at the last one to get a fixed-size sentence vector
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/train.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x),
                                            list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        encoderh = f_ctx(x, x_mask)
        encoderh = numpy.concatenate(
            [encoderh[0, :, 1024:], encoderh[-1, :, :1024]], axis=1)
        with open('features/hidden/test.en-es.encoderh', 'a+') as fp:
            for hh_data in encoderh:
                fp.writelines('\t'.join(map(lambda x: str(x),
                                            list(hh_data))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

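# --- Sketch of the sentence-vector trick used above (toy NumPy shapes) ---
# The bidirectional encoder's ctx concatenates forward and backward GRU
# states per position, so a fixed-size sentence vector is the backward state
# at the first timestep joined with the forward state at the last timestep.
# Sizes below are illustrative; the real model uses dim = 1024.
def _example_sentence_vector():
    n_steps, n_batch, dim = 7, 2, 4
    ctx = numpy.random.rand(n_steps, n_batch, 2 * dim).astype('float32')
    sent_vec = numpy.concatenate(
        [ctx[0, :, dim:],      # backward state at t = 0
         ctx[-1, :, :dim]],    # forward state at t = n_steps - 1
        axis=1)                # shape: (n_batch, 2 * dim)
    return sent_vec
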
def get_qv(model='model/model.npz.best_bleu'):
    """
    @function: extract quality vectors (one per target word)
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt = build_model(tparams, options)

    # load the data
    train = TextIterator(
        options['datasets'][0], options['datasets'][1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['batch_size'],
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    dev = TextIterator(
        options['valid_datasets'][0], options['valid_datasets'][1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=options['valid_batch_size'],
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')
    #print tparams['ff_logit_W'].get_value().shape  # (500, 40000)

    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    # element-wise product of the decoder state for sample j
                    # and the output-embedding column of word y[i][j]
                    qv = tt_[i, j, :].T * Wt[:, index]
                    qv_.append(list(qv))
            with open('qv/train/' + str(n_samples + j) + '.qv.pkl', 'wb') as fp:
                pkl.dump(qv_, fp)
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

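# --- Worked sketch of the quality-vector computation (toy NumPy sizes) ---
# For target word y_i with decoder pre-softmax state tt_i of shape
# (dim_word,), the quality vector is the element-wise product
# tt_i * Wt[:, y_i], i.e. the per-dimension contributions to that word's
# logit. Sizes here are toy stand-ins for the real (500, 40000) ff_logit_W.
def _example_quality_vector():
    dim_word, n_vocab = 4, 10
    tt_i = numpy.random.rand(dim_word).astype('float32')
    Wt = numpy.random.rand(dim_word, n_vocab).astype('float32')
    index = 7                    # token id of y_i
    qv = tt_i * Wt[:, index]     # element-wise product, shape: (dim_word,)
    # summing the quality vector recovers the word's (pre-bias) logit
    logit = qv.sum()
    return qv, logit
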
def alignment(model='model/model.npz.best_bleu',
              train=['test/train.bpe.en', 'test/train.bpe.es'],
              test=['test/test.bpe.en', 'test/test.bpe.es'],
              batch_size=10):
    """
    @function: extract word-alignment (attention weight) features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    f_align = theano.function([x, x_mask, y, y_mask], opt_ret, name='f_align')

    #################### train #######################
    """
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # zero out padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))   # target length of sample j
            col_ = int(numpy.sum(x_mask[:, j]))   # source length of sample j
            align_data = align[:row_, j, :col_]   # word-alignment matrix
            with open('features/alignment/train.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.writelines('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.writelines('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'
    """
    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        align = f_align(x, x_mask, y, y_mask)['dec_alphas']  # (y, batch_size, x)
        align = align * y_mask[:, :, None]  # zero out padded target positions
        align_shp = align.shape
        for j in range(align_shp[1]):
            row_ = int(numpy.sum(y_mask[:, j]))   # target length of sample j
            col_ = int(numpy.sum(x_mask[:, j]))   # source length of sample j
            align_data = align[:row_, j, :col_]   # word-alignment matrix
            with open('features/alignment/test.en-es.word.align', 'a+') as fp:
                for data in align_data:
                    fp.writelines('\t'.join(map(lambda x: str(x), data)) + '\n')
                fp.writelines('\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

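# --- Sketch (illustrative): reading the alignment file written above ---
# Each sentence is a block of tab-separated rows (one row per target word,
# one column per source word), and a blank line separates sentences. The
# helper name and return layout are assumptions, not part of the original.
def _example_read_alignments(path='features/alignment/test.en-es.word.align'):
    matrices, current = [], []
    with open(path) as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line:                 # blank line ends one sentence
                if current:
                    matrices.append(numpy.array(current, dtype='float32'))
                    current = []
            else:
                current.append([float(v) for v in line.split('\t')])
    if current:                          # file may lack a trailing blank line
        matrices.append(numpy.array(current, dtype='float32'))
    return matrices
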
def word_embedding(model='model/model.npz.best_bleu',
                   train=['test/train.bpe.en', 'test/train.bpe.es'],
                   dev=['test/dev.bpe.en', 'test/dev.bpe.es'],
                   test=['test/test.bpe.en', 'test/test.bpe.es'],
                   batch_size=10):
    """
    @function: extract word embeddings
    """
    options = load_config(model)         # load the saved hyper-parameters
    params = init_params(options)
    params = load_params(model, params)  # load the trained model parameters

    # load the data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    dev = TextIterator(
        dev[0], dev[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    # encoder / decoder embedding tables
    Wemb = params['Wemb']
    Wemb_dec = params['Wemb_dec']

    #################### train #######################
    n_samples = 0
    for x, y in train:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/train.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), x_row)) + '\n')
        with open('features/emb/train.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/test.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), x_row)) + '\n')
        with open('features/emb/test.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'

    ################### dev ########################
    n_samples = 0
    for x, y in dev:
        x_emb = get_emb(x, Wemb)
        y_emb = get_emb(y, Wemb_dec)
        with open('features/emb/dev.es-en.es.emb', 'a+') as fp:
            for x_row in x_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), x_row)) + '\n')
        with open('features/emb/dev.es-en.en.emb', 'a+') as fp:
            for y_row in y_emb:
                fp.writelines('\t'.join(map(lambda x: str(x), y_row)) + '\n')
        n_samples += len(x)
        print 'processed:', n_samples, 'samples ...'

def extract_logprob(model='model/model.npz.best_bleu',
                    train=['test/train.bpe.en', 'test/train.bpe.es'],
                    test=['test/test.bpe.en', 'test/test.bpe.es'],
                    batch_size=10):
    """
    @function: extract log-likelihood (per-sentence NMT cost) features
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    f_cost = theano.function([x, x_mask, y, y_mask], cost, name='f_cost')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/train.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x),
                                        list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        logprob = f_cost(x, x_mask, y, y_mask)
        with open('features/test.es-en.logprob', 'a+') as fp:
            fp.writelines('\n'.join(map(lambda x: str(x),
                                        list(logprob))) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

def extract_qv(model='model/model.npz.best_bleu',
               train=['test/train.bpe.en', 'test/train.bpe.es'],
               test=['test/test.bpe.en', 'test/test.bpe.es'],
               batch_size=10):
    """
    @function: extract sentence-level quality-vector features
               (quality vectors averaged over target words)
    """
    options = load_config(model)
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_theano_params(params)

    trng, use_noise, x, x_mask, y, y_mask, \
        opt_ret, cost, ctx, tt, _ = build_model(tparams, options)

    # load the data
    train = TextIterator(
        train[0], train[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order
    test = TextIterator(
        test[0], test[1],
        options['dictionaries'][0], options['dictionaries'][1],
        n_words_source=options['n_words_src'],
        n_words_target=options['n_words_tgt'],
        batch_size=batch_size,
        maxlen=1000,            # effectively no length limit
        sort_by_length=False)   # keep the original sentence order

    f_tt = theano.function([x, x_mask, y, y_mask], tt, name='f_tt')

    #################### train #######################
    n_samples = 0
    for x, y in train:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            # average the word-level quality vectors over the sentence
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/train.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

    ################### test ########################
    n_samples = 0
    for x, y in test:
        # prepare the batch
        x, x_mask, y, y_mask = prepare_data(x, y, maxlen=1000,
                                            n_words_src=options['n_words_src'],
                                            n_words=options['n_words_tgt'])
        tt_ = f_tt(x, x_mask, y, y_mask)
        Wt = tparams['ff_logit_W'].get_value()
        for j in range(y.shape[1]):
            qv_ = []
            for i in range(y.shape[0]):
                if y_mask[i][j] == 1:
                    index = y[i][j]
                    qv = Wt[:, index].T * tt_[i, j, :]
                    qv_.append(list(qv))
            # average the word-level quality vectors over the sentence
            qv_ = numpy.array(qv_)
            qv_ = list(map(lambda x: str(x), qv_.mean(axis=0)))
            with open('features/test.nmt.qv', 'a+') as fp:
                fp.writelines('\t'.join(qv_) + '\n')
        n_samples += y.shape[1]
        print 'processed:', n_samples, 'samples ...'

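# --- Minimal driver sketch (hypothetical) ---
# Runs each extractor with its default paths; guarded so importing this
# module stays side-effect free. Which extractors to run, and in what order,
# is an assumption, not something the original file specifies.
if __name__ == '__main__':
    extract_logprob()    # per-sentence log-likelihood features
    extract_qv()         # averaged quality-vector features
    alignment()          # attention-based word-alignment features
    encoder_hidden()     # encoder hidden-state sentence features
    word_embedding()     # source/target word embeddings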