def evaluate():
    # Load model
    weight_path = 'model/09031344_epoch_4_train_loss_3.7933.h5'

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    model = TransformerModel(in_vocab_len=len(idx2de),
                             out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(len(X) // hp.batch_size):
        x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
        sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
        targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]
        preds = model.translate(x, idx2en)
        for source, target, pred in zip(sources, targets, preds):
            print('source:', source)
            print('expected:', target)
            print('pred:', pred)
            print()
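# The `translate` method used above is not shown here; a minimal greedy-decoding
# sketch of what such a method typically does follows. `predict_tokens` is a
# hypothetical call that returns per-position argmax token ids for (x, y); the
# real TransformerModel API may differ.
def greedy_translate(model, x, idx2en, maxlen):
    preds = np.zeros((len(x), maxlen), np.int32)
    for j in range(maxlen):
        step_preds = model.predict_tokens(x, preds)  # hypothetical: (N, maxlen) token ids
        preds[:, j] = step_preds[:, j]
    # Convert ids to text and cut at the end-of-sentence marker.
    return [" ".join(idx2en[idx] for idx in row).split("</S>")[0].strip()
            for row in preds]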
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    # X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)  # e.g. (32, 10)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds):  # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
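# corpus_bleu above is NLTK's corpus-level BLEU. With short references the
# default settings can return 0 because higher-order n-gram counts are empty;
# a smoothing function (also referenced in commented-out code further below)
# avoids that. A minimal, self-contained usage sketch:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def bleu_percent(list_of_refs, hypotheses):
    # list_of_refs: one list of reference token lists per hypothesis;
    # hypotheses: one token list per translated sentence.
    smoothie = SmoothingFunction().method2
    return 100 * corpus_bleu(list_of_refs, hypotheses, smoothing_function=smoothie)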
def evaluate_train():
    # Load model
    weight_path = 'model/09031925_epoch_0_train_loss_5.9855.h5'

    # Load data
    Sources, Targets = load_train_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    batch_size = 5

    model = TransformerModel(in_vocab_len=len(idx2de),
                             out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(5 // batch_size):
        x = Sources[i * batch_size:(i + 1) * batch_size]
        sources = Sources[i * batch_size:(i + 1) * batch_size]
        targets = Targets[i * batch_size:(i + 1) * batch_size]
        preds = model.translate_with_ans(sources, targets, idx2en)
        # preds = model.translate(x, idx2en)
        for source, target, pred in zip(sources, targets, preds):
            print('source:', ' '.join(idx2de[idx] for idx in source))
            print('expected:', ' '.join(idx2en[idx] for idx in target))
            print('pred:', pred)
            print()
def syn_train_api():
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train")
    print("Graph loaded")

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(model_path))
            print("Restored!")

            # Start training (the should_stop check needs an enclosing epoch loop,
            # as in the __main__ training blocks elsewhere in this file)
            for epoch in range(1, hp.num_epochs + 1):
                if sv.should_stop():
                    break
                for step in tqdm(range(g.num_batch), total=g.num_batch,
                                 ncols=70, leave=False, unit='b'):
                    sess.run(g.train_op)
                    loss = sess.run(g.mean_loss)
                    print("============loss=========: %f" % loss)
                    gs = sess.run(g.global_step)

                sv.saver.save(sess, tf.train.latest_checkpoint(model_path))
                print(sess.run(g.acc))

    print("Done")
def test(config):
    _config_test(config)
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    model = ConvSeq2Seq(config)
    graph_handler = GraphHandler(config)
    inferencer = Inferencer(config, model)
    sess = tf.Session()
    graph_handler.initialize(sess)
    global_step = 0

    refs = []
    hypotheses = []
    with codecs.open(os.path.join(config.eval_dir, config.model_name), "w", "utf-8") as fout:
        for i, batch in tqdm(enumerate(get_batch_for_test())):
            preds = inferencer.run(sess, batch)
            sources = batch['source']
            targets = batch['target']
            for source, target, pred in zip(sources, targets, preds):
                got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                fout.write("- source: " + source + "\n")
                fout.write("- expected: " + target + "\n")
                fout.write("- got: " + got + "\n\n")
                fout.flush()

                ref = target.split()
                hypothesis = got.split()
                if len(ref) > 3 and len(hypothesis) > 3:
                    refs.append([ref])
                    hypotheses.append(hypothesis)

        score = corpus_bleu(refs, hypotheses)
        fout.write("Bleu Score = " + str(100 * score))
def train():
    current_batches = 0
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    enc_voc = len(de2idx)
    dec_voc = len(en2idx)
    writer = SummaryWriter()

    # Load data
    X, Y = load_train_data()
    # Calc total batch count
    num_batch = len(X) // hp.batch_size

    model = AttModel(hp, enc_voc, dec_voc)
    model.train()
    model.cuda()
    torch.backends.cudnn.benchmark = True

    if not os.path.exists(hp.model_dir):
        os.makedirs(hp.model_dir)
    if hp.preload is not None and os.path.exists(hp.model_dir + '/history.pkl'):
        with open(hp.model_dir + '/history.pkl', 'rb') as in_file:
            history = pickle.load(in_file)
    else:
        history = {'current_batches': 0}
    current_batches = history['current_batches']

    optimizer = optim.Adam(model.parameters(), lr=hp.lr, betas=(0.9, 0.98), eps=1e-8)
    if hp.preload is not None and os.path.exists(hp.model_dir + '/optimizer.pth'):
        optimizer.load_state_dict(torch.load(hp.model_dir + '/optimizer.pth'))
    if hp.preload is not None and os.path.exists(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload):
        model.load_state_dict(torch.load(hp.model_dir + '/model_epoch_%02d.pth' % hp.preload))

    startepoch = int(hp.preload) if hp.preload is not None else 1
    for epoch in range(startepoch, hp.num_epochs + 1):
        current_batch = 0
        for index, current_index in get_batch_indices(len(X), hp.batch_size):
            tic = time.time()
            x_batch = torch.LongTensor(X[index]).cuda()
            y_batch = torch.LongTensor(Y[index]).cuda()
            toc = time.time()

            tic_r = time.time()
            torch.cuda.synchronize()
            optimizer.zero_grad()
            loss, _, acc = model(x_batch, y_batch)
            loss.backward()
            optimizer.step()
            torch.cuda.synchronize()
            toc_r = time.time()

            current_batches += 1
            current_batch += 1
            if current_batches % 10 == 0:
                writer.add_scalar('./loss', loss.item(), current_batches)
                writer.add_scalar('./acc', acc.item(), current_batches)
            if current_batches % 5 == 0:
                print('epoch %d, batch %d/%d, loss %f, acc %f' %
                      (epoch, current_batch, num_batch, loss.item(), acc.item()))
                print('batch loading used time %f, model forward used time %f' %
                      (toc - tic, toc_r - tic_r))
            if current_batches % 100 == 0:
                writer.export_scalars_to_json(hp.model_dir + '/all_scalars.json')
                history['current_batches'] = current_batches  # keep the resume counter up to date
                with open(hp.model_dir + '/history.pkl', 'wb') as out_file:
                    pickle.dump(history, out_file)

        checkpoint_path = hp.model_dir + '/model_epoch_%02d' % epoch + '.pth'
        torch.save(model.state_dict(), checkpoint_path)
        torch.save(optimizer.state_dict(), hp.model_dir + '/optimizer.pth')
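# `get_batch_indices` is not defined in this snippet; below is a plausible
# reconstruction (an assumption, not the repository's actual helper) that
# matches how it is consumed above: it yields an index array plus the running
# offset for each mini-batch.
def get_batch_indices(total_length, batch_size):
    indices = np.arange(total_length)
    np.random.shuffle(indices)
    current_index = 0
    while current_index + batch_size <= total_length:
        yield indices[current_index:current_index + batch_size], current_index
        current_index += batch_size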
def eval(): # Load data X, Sources, Targets = load_test_data() de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() enc_voc = len(de2idx) dec_voc = len(en2idx) # load model model = AttModel(hp, enc_voc, dec_voc) model.load_state_dict( torch.load(hp.model_dir + '/model_epoch_%02d' % hp.eval_epoch + '.pth')) print('Model Loaded.') model.eval() model.cuda() # Inference if not os.path.exists('results'): os.mkdir('results') with codecs.open('results/model%d.txt' % hp.eval_epoch, 'w', 'utf-8') as fout: list_of_refs, hypotheses = [], [] for i in range(len(X) // hp.batch_size): # Get mini-batches x = X[i * hp.batch_size:(i + 1) * hp.batch_size] sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size] targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size] # Autoregressive inference x_ = torch.LongTensor(x).cuda() preds_t = torch.LongTensor( np.zeros((hp.batch_size, hp.maxlen), np.int32)).cuda() preds = preds_t for j in range(hp.maxlen): _, _preds, _ = model(x_, preds) preds_t[:, j] = _preds.data[:, j] preds = preds_t.long() preds = preds.data.cpu().numpy() # Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source + "\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") fout.flush() # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) # Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Bleu Score = " + str(100 * score))
def __init__(self, transformerModel, output_dir):
    self.transformerModel = transformerModel
    self.output_dir = output_dir
    self.Sources, self.Targets = load_train_data()
    _, self.idx2de = load_de_vocab()
    _, self.idx2en = load_en_vocab()
    os.makedirs(self.output_dir, exist_ok=True)
def eval2():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data1()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/eval2", "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds):  # sentence-wise
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        fout.write("- source: " + source + "\n")
                        # fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        print("- source: " + source + "\n")
                        # print("- expected: " + target + "\n")
                        print("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)
def train(): # Load graph g = Graph(is_training=True) de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() print("Graph loaded") # Load data X, Image_index, _, X_target = load_cap_data(set="en") images = np.load(image_path.format("train")) # x_val,Image_index_val,_,Targets = load_test_cap_data(set="test",language="en") # val_images = np.load("../image/task1_ResNet50_res4fx_test2017.fp16.npy") # num_batch_val = len(x_val)//hp.batch_size # smoothie = SmoothingFunction().method2 # Start session num_batch = int(math.ceil(len(X) / hp.batch_size)) if not os.path.exists(hp.logdir_cap_en): os.mkdir(hp.logdir_cap_en) with g.graph.as_default(): saver = tf.train.Saver(var_list=g.value_list, max_to_keep=40) with tf.Session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: sess.run(tf.global_variables_initializer()) # saver.restore(sess, tf.train.latest_checkpoint("logdir_en2")) # print("Restored!") ## train for epoch in range(hp.num_epochs): for i in range(num_batch): lr = hp.lr_cap * pow(0.95, epoch) step = epoch * num_batch + i ### Get mini-batches image = images[Image_index[i * hp.batch_size:(i + 1) * hp.batch_size]] x = X[i * hp.batch_size:(i + 1) * hp.batch_size] x_target = X_target[i * hp.batch_size:(i + 1) * hp.batch_size] feed_dict = { g.x: x, g.image: image, g.dropout_rate: hp.dropout_rate, g.lstm_drop_rate: hp.lstm_drop_rate, g.lr: lr, g.x_target: x_target } if i % 1000 == 0: _, loss, preds = sess.run( [g.train_op, g.loss, g.preds_list], feed_dict) with open("en.txt", "a+") as f: f.write("loss {}".format(i) + " " + str(loss)) else: sess.run(g.train_op, feed_dict) if (step + 1) % 1000 == 0: saver.save(sess, save_path=hp.logdir_cap_en + '/model_step_%d' % (step))
def __init__(self, is_training):
    self.de2idx, _idx2de = load_de_vocab()
    self.en2idx, _idx2en = load_en_vocab()
    self.is_training = is_training
    self.graph = tf.Graph()
    with self.graph.as_default():
        if self.is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        self.x_len = tf.reduce_sum(self.x, axis=-1)
        self.y_len = tf.reduce_sum(self.y, axis=-1)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.batch_size = tf.shape(self.x)[0]
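# Note: tf.reduce_sum over the raw id tensor above sums token *ids*, not token
# counts. If x_len / y_len are meant to be true sequence lengths of the
# zero-padded batches, the usual formulation counts non-padding positions:
#
#     x_len = tf.reduce_sum(tf.to_int32(tf.not_equal(self.x, 0)), axis=-1)
#     y_len = tf.reduce_sum(tf.to_int32(tf.not_equal(self.y, 0)), axis=-1)
#
# (This mirrors the istarget mask used in the other graphs in this file; whether
# the downstream code actually relies on x_len as a length is an assumption.)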
def eval():
    g = train_Graph(is_training=False)
    print('Graph loaded')

    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print('Restored')
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open('results/' + mname, 'w', 'utf-8') as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    preds = np.zeros((hp.batch_size, hp.max_seq_len), np.int32)
                    for j in range(hp.max_seq_len):
                        # Predict one token at a time, so each later token can condition on
                        # everything decoded so far: loop hp.max_seq_len times, feeding the
                        # partial translation back in as the decoder input at each step.
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    for source, target, pred in zip(sources, targets, preds):
                        got = ' '.join(idx2en[idx] for idx in pred).split('</S>')[0].strip()
                        fout.write('-source:' + source + '\n')
                        fout.write('-expected:' + target + '\n')
                        fout.write('-got:' + got + '\n\n')
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write('Bleu Score = ' + str(100 * score))
def create_data(input_sent):
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    x = []
    y = []
    if len(input_sent) < (hp.maxlen - 1):
        x.append(de2idx.get("<S>", 1))
        for each in input_sent:
            x.append(de2idx.get(each, 1))
        x.append(de2idx.get("</S>", 1))
        y.append(np.array(x))

    y = np.array(y)
    print(y.shape)

    Input = []
    Input.append(input_sent)

    X = np.zeros([len(y), hp.maxlen], np.int32)
    print(X.shape)
    X[0] = np.lib.pad(y[0], [0, hp.maxlen - len(y[0])], 'constant', constant_values=0)
    print(X.shape)
    return X, Input
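# A sketch of how create_data is presumably consumed for single-sentence
# inference with the TF graph used elsewhere in this file (g.x, g.y, g.preds);
# the restored-session setup is assumed to match the eval() functions above.
def translate_sentence(sess, g, input_sent, idx2en):
    X, Input = create_data(input_sent)
    preds = np.zeros((len(X), hp.maxlen), np.int32)
    for j in range(hp.maxlen):
        _preds = sess.run(g.preds, {g.x: X, g.y: preds})
        preds[:, j] = _preds[:, j]
    return " ".join(idx2en[idx] for idx in preds[0]).split("</S>")[0].strip()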
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data( ) # shape=[batch_size, max_seq_len] else: self.x = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len)) self.y = tf.placeholder(tf.int32, shape=(None, hp.max_seq_len)) # decoder_inputs '''decoder_inputs和self.y相比,去掉了最后一个句子结束符,而在每句话最前面加了一个初始化为2的id,即<S> ,代表开始。''' self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), axis=-1) # load_vocab de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # encoder with tf.variable_scope('encoder'): # input - word embedding self.enc = embedding(self.x, vocab_size=len(de2idx), d_model=hp.d_model, scale=True, scope='enc_embed') # input - positional encoding self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.max_seq_len, d_model=hp.d_model, zero_pad=False, scale=False, scope='enc_pe') # Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) # 3. num_layers multi-head attention for i in range(hp.num_layers): with tf.variable_scope('num_layers_{}'.format(i)): # multi head attention + Add and Norm self.enc = multihead_attention( queries=self.enc, keys=self.enc, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) # feed forward + Add and Norm self.enc = feedforward( self.enc, dff=[4 * hp.d_model, hp.d_model]) # decoder with tf.variable_scope('decoder'): self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), d_model=hp.d_model, scale=True, scope='dec_embed') self.dec += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.max_seq_len, d_model=hp.d_model, zero_pad=False, scale=False, scope='dec_pe') self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) for i in range(hp.num_layers): with tf.variable_scope('num_layers_{}'.format(i)): # masked multi-head attention self.dec = multihead_attention( queries=self.dec, keys=self.dec, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope='self-attention') # multi-head attention self.dec = multihead_attention( queries=self.dec, keys=self.enc, d_model=hp.d_model, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope='vanilla-attention') self.dec = feedforward( self.dec, dff=[4 * hp.d_model, hp.d_model ]) # shape=[batch_size, seq_len, d_model] # final linear projection self.logits = tf.layers.dense( self.dec, len(en2idx)) # shape=[batch_size, seq_len, target_vocab_size] self.preds = tf.to_int32(tf.arg_max( self.logits, dimension=-1)) # 预测值 shape=[batch_size, seq_len] self.istarget = tf.to_float(tf.not_equal( self.y, 0)) # 真实值 shape=[batch_size, seq_len] # pad 部分不参与准确率计算 self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget) tf.summary.scalar('acc', self.acc) if is_training: # loss self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) # pad 部分不参与损失计算 self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) # training scheme self.global_step = tf.Variable(0, name='global_step', 
trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) # summary tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() else: self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) # define decoder inputs # id = 2代表<S>,是decoder的初始输入,这一步把正常的y向量做转换,比如y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]]修改为 # [["<s>", "i", "love", "china"], ["<s>, "can", "you", "speak"]], 这部分将在decoder阶段,最先输入self-attention部分 # 在训练阶段,decoder_inputs如上,在inference阶段,由于无法获知真正的y,所以y输入的是shape=[batch_size, max_length]的全0向量。 # 处理之后旧变成[["<s>", 0, 0, 0]]这样子,每次值取第一个预测结果,循环输入再取前两个结果 self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() with tf.variable_scope("encoder"): # Embedding self.enc = embedding( self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, zero_pad= True, # id为0的行表示padding的embedding, true表示将这一行置0(随机初始化出来的可能不是0) scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope='enc_pe') else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ## Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks, 叠加block,6个 for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### MultiHead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) with tf.variable_scope("decoder"): # Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") # Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") # Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) # Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection, 分类任务,分类数量是词表长度 self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = 
tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget / (tf.reduce_sum(self.istarget))) if is_training: # Loss # 将one_hot中的0改成了一个很小的数,1改成了一个比较接近于1的数。 self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_fun(y_true, y_pred):
    # Mask out the zero-padded positions before averaging the per-token loss.
    mask = tf.math.logical_not(tf.math.equal(y_true, 0))
    loss_ = loss_object(y_true, y_pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Metrics for tracking loss and accuracy
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

de2index, index2de = load_de_vocab()
en2index, index2en = load_en_vocab()
input_vocab_size = len(de2index)
target_vocab_size = len(en2index)

transformer = Transformer(hp.d_model, hp.num_layers, hp.num_heads, hp.dff,
                          input_vocab_size, target_vocab_size,
                          hp.max_seq_len, hp.dropout_rate)

# Create the checkpoint manager
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, hp.ckpt_path, max_to_keep=3)
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Loaded the latest checkpoint')
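# The loss above divides the masked sum by *all* positions (tf.reduce_mean),
# so padded positions still dilute the average. The usual masked mean divides
# by the number of real tokens instead -- shown here as an alternative sketch,
# not a claim about what this particular snippet intends:
def masked_loss(y_true, y_pred):
    mask = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)
    loss_ = loss_object(y_true, y_pred) * mask
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)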
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    # X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                print("open results success\n")
                list_of_refs, hypotheses = [], []
                print("length of batch is " + str(len(X) // hp.batch_size))
                for i in range(len(X) // hp.batch_size):
                    print('translating')
                    ### Get mini-batches
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference
                    # After this loop finishes, the translations for this batch are stored in preds.
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    for source, target, pred in zip(sources, targets, preds):  # sentence-wise
                        # Convert each id in pred to its English word, join the words with spaces,
                        # and cut at the end-of-sentence marker to get the translated sentence.
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        # Write the source, the expected translation, and the actual translation.
                        fout.write("- source: " + source + "\n")
                        print('\n' + '\n' + '\n' + source + '\n' + '\n' + '\n')
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score: keep only pairs where both sides have more than 3 tokens.
                        ref = target.split()
                        hypothesis = got.split()
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate the BLEU score over the collected pairs and append it to the file.
                str_hyp = ",".join(" ".join(h) for h in hypotheses)
                print("len of hypotheses is: " + str(len(hypotheses)))
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
def eval(): # Load graph g = Graph(is_training=False) print("Graph loaded") # Load data X, Sources, Targets = load_test_data() en2idx, idx2en = load_en_vocab() de2idx, idx2de = load_de_vocab() # Start session with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: # Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") # Get model name mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name # Inference if not os.path.exists('results'): os.mkdir('results') with codecs.open("results/" + mname, "w", "utf-8") as fout: list_of_refs, hypotheses = [], [] for i in range(len(X) // hp.batch_size): # Get mini-batches x = X[i * hp.batch_size:(i + 1) * hp.batch_size] sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size] targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size] # Autoregressive inference preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) for j in range(hp.maxlen): tensors = [g.preds] + list( g.tensors_of_interest.values()) tensors_out = sess.run(tensors, {g.x: x, g.y: preds}) _preds = tensors_out[0] preds[:, j] = _preds[:, j] print([idx2de[idx] for idx in preds[0]]) # For the first few batches, we save figures giving the attention structure in the encoder. if j == 0 and i < batches_to_visualize: tensor_keys = [None] + list( g.tensors_of_interest.keys() ) # Add a null key at the start so it lines up with the tensors_out list visualizeEncoderAttention( sources=sources, idx2en=idx2en, tensors_of_interest={ key: value for key, value in zip( tensor_keys, tensors_out) }, batch_index=i) # Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join( idx2de[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source + "\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") fout.flush() # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) # Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Bleu Score = " + str(100 * score))
self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                    labels=self.y_smoothed)
self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

# Training Scheme
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

# Summary
tf.summary.scalar('mean_loss', self.mean_loss)
self.merged = tf.summary.merge_all()


if __name__ == '__main__':
    # Load vocabulary
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train")
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            for step in tqdm(range(g.num_batch), total=g.num_batch,
                             ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)
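# These graphs drive Adam(beta1=0.9, beta2=0.98, epsilon=1e-8) with a fixed
# hp.lr. The original Transformer recipe pairs those Adam settings with a
# "Noam" warmup-then-inverse-sqrt-decay schedule; a sketch in the same TF1
# style is below. hp.warmup_steps is an assumed hyperparameter, not one
# defined in this file.
def noam_learning_rate(global_step, d_model, warmup_steps=4000):
    step = tf.to_float(global_step) + 1.0
    return (d_model ** -0.5) * tf.minimum(step ** -0.5, step * (warmup_steps ** -1.5))

# Usage sketch inside the training-scheme block:
#   lr = noam_learning_rate(self.global_step, hp.hidden_units, hp.warmup_steps)
#   self.optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.98, epsilon=1e-8)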
def eval(): # Load graph g = Graph(is_training=False) print("Graph loaded") # Load data X, Sources, Targets = load_test_data() de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # X, Sources, Targets = X[:33], Sources[:33], Targets[:33] # Start session with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: ## Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") ## Get model name mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name ## Inference if not os.path.exists('results'): os.mkdir('results') with codecs.open("results/" + mname, "w", "utf-8") as fout: list_of_refs, hypotheses = [], [] for i in range(len(X) // hp.batch_size): ### Get mini-batches x = X[i * hp.batch_size:(i + 1) * hp.batch_size] sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size] targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size] predx = np.zeros((hp.batch_size, hp.maxlen, hp.beam_width), np.int32) predx_prob = np.zeros_like(predx, np.float64) logits = np.zeros((hp.batch_size, hp.maxlen, len(en2idx)), np.float64) print(x[1:2, :]) for j in range( hp.batch_size ): #For testing, the range will be changed to accelerate the testing for j in range(hp.maxlen) print(j) preds_sent = np.zeros((1, hp.maxlen, hp.beam_width)) probs_sent = np.zeros_like(preds_sent, np.float64) #probs_ref = np.zeros_like(preds_sent, np.float64) x_a = x[j:j + 1, :] #input one sentence each time sent_len = x_a[0, :].tolist().index(0) #print(x_a) preds = np.zeros((1, hp.maxlen), np.int32) preds_prob = np.zeros_like(preds, np.float64) _logits = np.array( sess.run(g.logits, { g.x: x_a, g.y: preds })) sent_j = _logits[0, 0] #print(sent_j) sos = sent_j.argsort( )[-1:] #retrieve the token of first character (Start of sentence) preds[ 0, 0] = sos #settle the sos token at the beginning of preds sos_prob = sent_j[sos] preds_prob[0, 0] = sos_prob #print(preds[0,0]) for bw in range(hp.beam_width): preds_sent[0, 0, bw] = preds[0, 0] probs_sent[0, 0, bw] = preds_prob[0, 0] #print(probs_sent) _logits = np.array( sess.run(g.logits, { g.x: x_a, g.y: preds })) sent_j = _logits[0] word_1 = sent_j[1] word_1 = word_1 + preds_prob[0, 0] top_bw_idx = word_1.argsort()[-hp.beam_width:] #print(top_bw_idx) top_bw_probs = word_1[top_bw_idx] #print(top_bw_probs) for bw in range(hp.beam_width): preds_sent[0, 1, bw] = np.copy(top_bw_idx[bw]) #print(top_bw_probs[bw]) probs_sent[0, 1, bw] = top_bw_probs[bw] #print(probs_sent) #settle top_bw tokens for the second character (first word) #print(probs_sent) for k in range( 2, hp.maxlen): #this part need special design added_probs = [] paths_candidate = [] preds_prob_list = [] for bw in range(hp.beam_width): preds[0, :] = preds_sent[0, :, bw].copy() preds_prob[0, :] = probs_sent[0, :, bw].copy() #print(preds_prob) if (preds_sent[0, k - 1, bw] == 3): preds_sent[0, k, bw] = 3 current_path = preds_sent[0, :, bw] new_path = np.copy(current_path) new_path[k] = 3 paths_candidate.append(new_path) preds_prob[0, k] = 0 current_preds_prob = np.copy(preds_prob) print(current_preds_prob) added_probs = np.concatenate( (added_probs, [np.sum(current_preds_prob[0])]), 0) preds_prob_list.append(current_preds_prob) if (preds_sent[0, k - 1, bw] != 3): current_path = preds_sent[0, :, bw] _logits = np.array( sess.run(g.logits, { g.x: x_a, g.y: preds })) sent_j = _logits[0] word_k = sent_j[ k] #+np.sum(preds_prob[0]) #log(a*b) = log a + log b top_bw_idx = word_k.argsort( 
)[-hp.beam_width:] top_bw_probs = sent_j[k][top_bw_idx] for bmw in range(hp.beam_width): new_path = np.copy(current_path) new_path[k] = top_bw_idx[bmw] current_step_probs = top_bw_probs[bmw] current_path_probs = np.copy( preds_prob[0]) current_path_probs[ k] = current_step_probs added_probs = np.concatenate( (added_probs, [np.sum(current_path_probs)]), 0) #print(new_path) paths_candidate.append(new_path) preds_prob_list.append( current_path_probs) #print("what hell is going on") #print(sub_candidates) #print("this is a =========") a_idx = np.array( added_probs).argsort()[-hp.beam_width:] a_prob = added_probs[a_idx] #print(a_prob) print(preds_prob_list) for bw in range(hp.beam_width): preds_sent[0, :, bw] = np.copy( paths_candidate[a_idx[bw]]) #print(paths_candidate[a_idx[bw]]) #print(preds_sent[0, :, bw]) probs_sent[0, :, bw] = np.copy(preds_prob_list[bw]) print(probs_sent) #print("probs_sent:") #print(probs_sent) predx[j, :, :] = preds_sent predx_prob[j, :, :] = probs_sent #print("checkpoint") #sys.exit() ### Write to file print("done") for source, target, pred, prob in zip( sources, targets, predx, predx_prob): # sentence-wise candits = [] candits_probs = [] for i in range(hp.beam_width): pres = pred[:, i] pros = prob[:, i] got = "".join( idx2en[idx] for idx in pres).split("</S>")[0].strip() candits.append(got) candits_probs.append(pros) fout.write("- source: " + source + "\n") fout.write("- expected: " + target + "\n") print(candits) for i in range(len(candits)): fout.write("- got: " + candits[i] + "\n") m = len(candits[i]) fout.write(' '.join( str(each) for each in candits_probs[i].tolist() [:m - 2])) #each for each in fout.write("\n") fout.write("\n") fout.flush() # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis)
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() else: # x: (32,10) y:(32,10) 一个batch32个句子,每个句子长度为10 self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) """ 定义decoder部分的input 假设真实翻译后的输出为 i am a student </S> decoder部分的input应为: <S> i am a student """ self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) # 2代表<S>,是decoder的初始输入 # 词典 de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() with tf.variable_scope("encoder"): # Embedding self.enc = embedding( self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, zero_pad=True, # 让padding一直是0 scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope='enc_pe') else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ##Drop out self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### MultiHead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) with tf.variable_scope("decoder"): # Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") # Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection self.logits = tf.layers.dense(self.dec, len(en2idx)) self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1)) self.istarget = tf.to_float(tf.not_equal(self.y, 0)) self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget / (tf.reduce_sum(self.istarget))) if is_training: # Loss # 将one_hot中的0改成了一个很小的数,1改成了一个比较接近于1的数。 self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len(en2idx))) self.loss = 
tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) self.mean_loss = tf.reduce_sum( self.loss * self.istarget) / (tf.reduce_sum(self.istarget)) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
def __init__(self, is_training=True): self.graph = tf.Graph() with self.graph.as_default(): if is_training: self.x, self.y, self.num_batch = get_batch_data() # (N, T) else: # inference self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) # define decoder inputs self.decoder_inputs = tf.concat( (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1) # 2:<S> # Load vocabulary de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # Encoder with tf.variable_scope("encoder"): ## Embedding self.enc = embedding(self.x, vocab_size=len(de2idx), num_units=hp.hidden_units, scale=True, scope="enc_embed") ## Positional Encoding if hp.sinusoid: self.enc += positional_encoding(self.x, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") else: self.enc += embedding(tf.tile( tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0), [tf.shape(self.x)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="enc_pe") ## Dropout self.enc = tf.layers.dropout( self.enc, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ### Multihead Attention self.enc = multihead_attention( queries=self.enc, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False) ### Feed Forward self.enc = feedforward( self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Decoder with tf.variable_scope("decoder"): ## Embedding self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx), num_units=hp.hidden_units, scale=True, scope="dec_embed") ## Positional Encoding if hp.sinusoid: self.dec += positional_encoding(self.decoder_inputs, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") else: self.dec += embedding(tf.tile( tf.expand_dims( tf.range(tf.shape(self.decoder_inputs)[1]), 0), [tf.shape(self.decoder_inputs)[0], 1]), vocab_size=hp.maxlen, num_units=hp.hidden_units, zero_pad=False, scale=False, scope="dec_pe") ## Dropout self.dec = tf.layers.dropout( self.dec, rate=hp.dropout_rate, training=tf.convert_to_tensor(is_training)) ## Blocks for i in range(hp.num_blocks): with tf.variable_scope("num_blocks_{}".format(i)): ## Multihead Attention ( self-attention) self.dec = multihead_attention( queries=self.dec, keys=self.dec, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=True, scope="self_attention") ## Multihead Attention ( vanilla attention) self.dec = multihead_attention( queries=self.dec, keys=self.enc, num_units=hp.hidden_units, num_heads=hp.num_heads, dropout_rate=hp.dropout_rate, is_training=is_training, causality=False, scope="vanilla_attention") ## Feed Forward self.dec = feedforward( self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units]) # Final linear projection # 对最后一维做线性变换成词库这么长,对应每个单词的logits,然后将logits最大的索引记录下来,即预测值 self.logits = tf.layers.dense(self.dec, len(en2idx)) #(N, T, vocab_len) self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1)) # (N, T) # 把y中所有不是<PAD>出来的都由True转化为1.0 self.istarget = tf.to_float(tf.not_equal(self.y, 0)) # acc表示的是 (一个batch中所有的非<PAD>的单词,预测对的数量求和)/(一个batch中所有的非<PAD>单词数量) # tips:tf.reduce_sum()未指定axis,即把所有维度都加起来 self.acc = tf.reduce_sum( tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / (tf.reduce_sum(self.istarget)) # 计算acc给summary监督学习过程。 tf.summary.scalar('acc', self.acc) if 
is_training: # Loss # tf.one_hot(tensor, int),构造一个len(tensor)*int的tensor,tensor的值变成索引,对应位置为1.,其他为0. # 如果索引值大于int大小,则整行都是0. self.y_smoothed = label_smoothing( tf.one_hot(self.y, depth=len( en2idx))) #y_smoothed因为one_hot变成了(N, T, vocab_len) # tf.nn.softmax_cross_entropy_with_logits实际上做的事情是: # 1.先对logits求softmax 2.再将vocab_len上的分布和y_label做交叉熵,得到一个(N, T)的向量 # 即每一单词有一个loss self.loss = tf.nn.softmax_cross_entropy_with_logits( logits=self.logits, labels=self.y_smoothed) # (N, T) # 将<PAD>出来的部分的loss去掉,再求mean_loss self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / ( tf.reduce_sum(self.istarget)) #标量scale # Training Scheme self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8) self.train_op = self.optimizer.minimize( self.mean_loss, global_step=self.global_step) # Summary tf.summary.scalar('mean_loss', self.mean_loss) self.merged = tf.summary.merge_all()
def eval(hp): # Load graph g = Graph(hp=hp, is_training=False) print("Graph loaded") # Load data X, X_image, X_length, Y, Sources, Targets, X_turn_number, SRC_emotion, TGT_emotion, Speakers, A = load_test_data( hp) #print(X) de2idx, idx2de = load_de_vocab(hp) en2idx, idx2en = load_en_vocab(hp) # Start session with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto( allow_soft_placement=True)) as sess: ## Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") ## Get model name mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name #fftmp=open("tmp.txt","w") ## Inference if not os.path.exists('results'): os.mkdir('results') with codecs.open("results/" + mname, "w", "utf-8") as fout: list_of_refs, hypotheses, test_loss = [], [], [] for i in range(len(X) // hp.batch_size): ### Get mini-batches x = X[i * hp.batch_size:(i + 1) * hp.batch_size] x_length = X_length[i * hp.batch_size:(i + 1) * hp.batch_size] y = Y[i * hp.batch_size:(i + 1) * hp.batch_size] x_emotion = SRC_emotion[i * hp.batch_size:(i + 1) * hp.batch_size] speaker = Speakers[i * hp.batch_size:(i + 1) * hp.batch_size] x_image = X_image[i * hp.batch_size:(i + 1) * hp.batch_size] a = A[i * hp.batch_size:(i + 1) * hp.batch_size] sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size] targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size] eval_bath = sess.run( g.mean_loss, { g.x: x, g.x_image: x_image, g.x_length: x_length, g.y: y, g.x_emotion: x_emotion, g.speaker: speaker, g.A: a, g.x_turn_number: x_turn_number }) test_loss.append(eval_bath) ### Autoregressive inference preds = np.zeros((hp.batch_size, hp.maxlen), np.int32) for j in range(hp.maxlen): _preds = sess.run(g.preds, { g.x: x, g.x_length: x_length, g.y: preds }) preds[:, j] = _preds[:, j] ### Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join( idx2en[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source + "\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") fout.flush() # bleu score #ref = target.split() ref = target.split(u"</d>")[1].split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) ## Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Test Bleu Score = " + str(100 * score)) print("Test Bleu Score = " + str(100 * score)) print("eval PPL = %.5lf" % (round(math.exp(np.mean(test_loss)), 4))) print("eval loss = %.5lf" % (np.mean(test_loss))) # Distinct-1, Distinct-2 candidates = [] for line in hypotheses: candidates.extend(line) distinct_1, distinct_2 = cal_Distinct(candidates) print('Distinct-1:' + str(round(distinct_1, 4)) + 'Distinct-2:' + str(round(distinct_2, 4)))
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

# Summary
tf.summary.scalar('mean_loss', self.mean_loss)
self.merged = tf.summary.merge_all()


if __name__ == '__main__':
    # Load vocabulary
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Construct graph
    g = Graph("train")
    print("Graph loaded")

    # Start session
    sv = tf.train.Supervisor(graph=g.graph, logdir=hp.logdir, save_model_secs=0)
    with sv.managed_session() as sess:
        for epoch in range(1, hp.num_epochs + 1):
            if sv.should_stop():
                break
            for step in tqdm(range(g.num_batch), total=g.num_batch,
                             ncols=70, leave=False, unit='b'):
                sess.run(g.train_op)
def eval(): # Load graph g = Graph(is_training=False) print("Graph loaded") # Load data X, Sources, Targets = load_test_data() de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() # X, Sources, Targets = X[:33], Sources[:33], Targets[:33] gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) # Start session with g.graph.as_default(): sv = tf.train.Supervisor() with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess: ## Restore parameters sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)) print("Restored!") ## Get model name mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name ## Inference totalTransNum = 0 if not os.path.exists('results'): os.mkdir('results') with codecs.open('results/'+mname+'.trans', 'w', 'utf8') as tfout: with codecs.open("results/" + mname, "w", "utf-8") as fout: list_of_refs, hypotheses = [], [] for i in range((len(X) // hp.batch_size) + 1): ### Get mini-batches batchEnd = (i+1)*hp.batch_size readlBatchSize = hp.batch_size if batchEnd > len(X): readlBatchSize = hp.batch_size - (batchEnd - len(X)) batchEnd = len(X) x = X[i*hp.batch_size: batchEnd] sources = Sources[i*hp.batch_size: batchEnd] targets = Targets[i*hp.batch_size: batchEnd] totalTransNum += len(sources) ### Autoregressive inference preds = np.zeros((readlBatchSize, hp.maxlen), np.int32) for j in range(hp.maxlen): _preds = sess.run(g.preds, {g.x: x, g.y: preds}) preds[:, j] = _preds[:, j] ### Write to file for source, target, pred in zip(sources, targets, preds): # sentence-wise got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip() fout.write("- source: " + source +"\n") fout.write("- expected: " + target + "\n") fout.write("- got: " + got + "\n\n") tfout.write(got) tfout.write('\n') # bleu score ref = target.split() hypothesis = got.split() if len(ref) > 3 and len(hypothesis) > 3: list_of_refs.append([ref]) hypotheses.append(hypothesis) ## Calculate bleu score score = corpus_bleu(list_of_refs, hypotheses) fout.write("Bleu Score = " + str(100*score)) fout.write('\n') print('totalTransNum', totalTransNum, 'Bleu', str(100*score))
def __init__(self, is_training): self.is_training = is_training self.graph = tf.Graph() with self.graph.as_default(): self._selector = True self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.x_target = tf.placeholder(tf.int32, shape=(None, hp.maxlen)) self.image = tf.placeholder(tf.float32, shape=[None, 196, 1024]) self.dropout_rate = tf.placeholder(tf.float32) self.lstm_drop_rate = tf.placeholder(tf.float32) self.lr = tf.placeholder(tf.float32, shape=[]) batch_size = tf.shape(self.image)[0] de2idx, idx2de = load_de_vocab() en2idx, idx2en = load_en_vocab() self.batch_size = batch_size self.en2idx = en2idx self.de2idx = de2idx self.weight_initializer = tf.contrib.layers.xavier_initializer() self.istarget = tf.to_float(tf.not_equal(self.x_target, 0)) with tf.variable_scope("en_caption"): with tf.variable_scope("embedding"): self.lookup_table = tf.get_variable( 'lookup_table', dtype=tf.float32, shape=[len(self.en2idx), hp.hidden_units_cap], initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0)) with tf.variable_scope("lstm"): lstm_cell = tf.nn.rnn_cell.LSTMCell(hp.lstm_units) lstm = tf.nn.rnn_cell.DropoutWrapper( lstm_cell, input_keep_prob=1.0 - self.lstm_drop_rate, output_keep_prob=1.0 - self.lstm_drop_rate) self.lstm = lstm self.feature = tf.contrib.layers.batch_norm( inputs=self.image, decay=0.95, center=True, scale=True, updates_collections=None, is_training=False) #self.is_training with tf.variable_scope("initialize"): context_mean = tf.reduce_mean(self.feature, axis=1) initial_memory, initial_output = self.initial(context_mean) initial_state = initial_memory, initial_output last_state = initial_state last_output = initial_output self.last_state, self.last_output = initial_state, initial_output logit_list, self.preds_list, alpha_list = [], [], [] sentence = tf.nn.embedding_lookup( self.lookup_table, tf.ones(batch_size, dtype=tf.int32) * 2) if not is_training: beam_width = 5 self.feature = tf.tile( tf.expand_dims(self.feature, axis=1), [1, beam_width, 1, 1]) self.preds = self.beam_search(sentence, beam_width=beam_width, num_classes=len(en2idx)) else: for i in range(hp.maxlen): #batch_size x embed_dim alpha = self.attention(last_output) #batch_size x 196 mask_alpha = tf.tile( tf.expand_dims(self.istarget[:, i], 1), [1, 196]) alpha_list.append(alpha * mask_alpha) image_attention = tf.reduce_sum( self.feature * tf.expand_dims(alpha, 2), axis=1) #batch_size x 1024 if self._selector: image_attention = self.selector( image_attention, last_output) inputs = tf.concat((image_attention, sentence), axis=1) output, state = lstm(inputs, last_state) #!! 
temp = tf.layers.dropout(output, rate=self.dropout_rate) expanded_output = tf.concat( [temp, sentence, image_attention], axis=1) logits = self.decode(expanded_output) prediction = tf.argmax(logits, 1) self.preds_list.append(prediction) logit_list.append(logits) sentence = tf.nn.embedding_lookup( self.lookup_table, self.x[:, i]) last_state = state last_output = output if is_training: self.preds_list = tf.stack(self.preds_list, axis=1) logits = tf.stack(logit_list, axis=1) alpha_list = tf.stack(alpha_list, axis=1) attentions = tf.reduce_sum(alpha_list, axis=1) diffs = tf.ones_like(attentions) - attentions attention_loss = hp.attention_loss_factor \ * tf.nn.l2_loss(diffs) \ / tf.cast((batch_size * 196),dtype=tf.float32) self.loss = tf.nn.softmax_cross_entropy_with_logits_v2( labels=tf.one_hot(self.x_target, len(en2idx)), logits=logits) self.loss = tf.reduce_sum( self.loss * self.istarget) / tf.reduce_sum( self.istarget) + attention_loss self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr, beta1=0.9, beta2=0.98, epsilon=1e-9) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.train_op = self.optimizer.minimize( self.loss, global_step=self.global_step) self.train_op = tf.contrib.layers.optimize_loss( loss=self.loss, global_step=self.global_step, learning_rate=self.lr, optimizer=self.optimizer, clip_gradients=hp.clip_gradients) self.value_list = slim.get_variables_to_restore()
def build_network(self):
    # import ipdb; ipdb.set_trace()
    config = self.config
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x, len(de2idx),
                             num_units=config.hidden_dim,
                             scale=True, scope='enc_embed')
        ## plus position embedding
        self.enc += embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
            config.maxlen, config.hidden_dim,
            zero_pad=False, scale=False, scope="enc_pe")
        self.enc = dropout(self.enc, config.keep_rate, is_train=self.is_train)
        self.enc_ = self.enc

        for block_idx in range(config.num_enc_block_1):
            scope = "encoder_block_{}".format(block_idx)
            enc_out = conv2d(self.enc,
                             kernel_shape=(config.enc_kernel_width, 1),
                             scope=scope)
            enc_out = batch_norm(enc_out, is_training=self.is_train,
                                 scope="lm" + scope)
            self.enc = enc_out

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decode_input, len(en2idx),
                             config.hidden_dim,
                             scale=True, scope='dec_embed')
        ## plus position embedding
        self.dec += embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(self.decode_input)[1]), 0),
                    [tf.shape(self.decode_input)[0], 1]),
            config.maxlen, config.hidden_dim,
            zero_pad=False, scale=False, scope='dec_pe')
        self.dec_ = self.dec

        for block_idx in range(config.num_dec_block_1):
            scope = "decoder_block_conv_{}".format(block_idx)
            attention_scope = "decoder_block_att_{}".format(block_idx)
            dec_out = conv2d(self.dec,
                             kernel_shape=(config.dec_kernel_width, 1),
                             causal=True, scope=scope)
            dec_out = attention_pool(self.enc_, self.dec, enc_out, dec_out,
                                     scope=attention_scope)
            dec_out = dec_out + self.dec
            dec_out = batch_norm(dec_out, is_training=self.is_train,
                                 scope="lm" + scope)
            self.dec = dec_out

    with tf.variable_scope('encoder'):
        for block_idx in range(config.num_enc_block_2):
            scope = "encoder_block_{}".format(config.num_enc_block_1 + block_idx)
            enc_out = conv2d(self.enc,
                             kernel_shape=(config.enc_kernel_width, 1),
                             num_outputs=config.hidden_dim_2,
                             scope=scope)
            enc_out = batch_norm(enc_out, is_training=self.is_train,
                                 scope="lm" + scope)
            self.enc = enc_out

    with tf.variable_scope('decoder'):
        for block_idx in range(config.num_dec_block_2):
            scope = "decoder_block_conv_{}".format(config.num_dec_block_1 + block_idx)
            attention_scope = "decoder_block_att_{}".format(
                config.num_dec_block_1 + block_idx)
            dec_out = conv2d(self.dec,
                             kernel_shape=(config.dec_kernel_width, 1),
                             num_outputs=config.hidden_dim_2,
                             causal=True, scope=scope)
            dec_out = attention_pool(self.enc_, self.dec, enc_out, dec_out,
                                     scope=attention_scope)
            dec_out = dec_out + self.dec
            dec_out = batch_norm(dec_out, is_training=self.is_train,
                                 scope="lm" + scope)
            self.dec = dec_out

    with tf.variable_scope("softmax_layer"):
        w = tf.get_variable('w', [config.hidden_dim, len(en2idx)])
        b = tf.get_variable('b', [len(en2idx)])
        w = tf.tile(tf.expand_dims(w, 0), [config.batch_size, 1, 1])
        self.logits = tf.matmul(dec_out, w) + b

    self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
    self.istarget = tf.to_float(tf.not_equal(self.y, 0))
    self.acc = tf.reduce_sum(
        tf.to_float(tf.equal(self.preds, self.y)) *
        self.istarget) / tf.reduce_sum(self.istarget)
    tf.summary.scalar('acc', self.acc)

    if self.is_train:
        self.y_smoothed = label_smoothing(
            tf.one_hot(self.y, depth=len(en2idx)))
        self.loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.y_smoothed)
        self.mean_loss = tf.reduce_mean(self.loss)
        tf.summary.scalar('mean_loss', self.mean_loss)

    self.tensors = {
        'source_sentence': self.enc_,
        'target_sentence': self.dec_,
        'enc_out': enc_out,
        'dec_out': dec_out,
        'predictions': self.preds,
        'logits': self.logits
    }
    if self.is_train:
        self.tensors['loss'] = self.loss

    for key, value in self.tensors.items():
        tf.summary.histogram(key, value)
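The softmax_layer above tiles w across the batch so that tf.matmul can project every timestep at once. The NumPy sketch below, using made-up sizes in place of config.batch_size, config.maxlen, config.hidden_dim and len(en2idx), checks that this tiled batch-matmul is equivalent to a single einsum over the last axis (which is also what tf.layers.dense does in the transformer Graph further down).

import numpy as np

# Hypothetical sizes: batch, time, hidden dim, vocab size
B, T, H, V = 4, 10, 8, 50
dec_out = np.random.randn(B, T, H)
w = np.random.randn(H, V)
b = np.random.randn(V)

# What the graph does: tile w per batch element, then batch-matmul.
w_tiled = np.tile(w[None, :, :], (B, 1, 1))        # (B, H, V)
logits_tiled = np.matmul(dec_out, w_tiled) + b     # (B, T, V)

# Equivalent without tiling: one einsum (or a dense layer) over the last axis.
logits_einsum = np.einsum('bth,hv->btv', dec_out, w) + b
print(np.allclose(logits_tiled, logits_einsum))    # True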
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Load vocabulary
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x, vocab_size=len(de2idx),
                                 num_units=hp.hidden_units, scale=True,
                                 scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc, keys=self.enc,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx),
                                 num_units=hp.hidden_units, scale=True,
                                 scope="dec_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                vocab_size=hp.maxlen,
                                                num_units=hp.hidden_units,
                                                zero_pad=False, scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="dec_pe")

            ## Dropout
            self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec, keys=self.dec,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=True, scope="self_attention")

                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec, keys=self.enc,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=False, scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
        ) / tf.reduce_sum(self.istarget)
        tf.summary.scalar('acc', self.acc)

        if is_training:
            # Loss
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
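To make the decoder_inputs construction above concrete, here is a small NumPy sketch with a made-up target batch, using 0 as <PAD> and 2 as <S> exactly as the comment in the graph states: the targets are shifted one step to the right, so at step t the decoder only ever conditions on tokens before t.

import numpy as np

y = np.array([[ 4,  9,  7,  3,  0],
              [11,  5,  3,  0,  0]], dtype=np.int32)

# tf.concat((tf.ones_like(y[:, :1]) * 2, y[:, :-1]), -1): prepend <S>, drop the last token.
decoder_inputs = np.concatenate([np.full_like(y[:, :1], 2), y[:, :-1]], axis=-1)
print(decoder_inputs)
# [[ 2  4  9  7  3]
#  [ 2 11  5  3  0]]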
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()  # (N, T)
        else:  # inference
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Load vocabulary
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x, vocab_size=len(de2idx),
                                 num_units=hp.hidden_units, scale=True,
                                 scope="enc_embed")
            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="enc_pe")
            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc, keys=self.enc,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx),
                                 num_units=hp.hidden_units, scale=True,
                                 scope="dec_embed")
            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                vocab_size=hp.maxlen,
                                                num_units=hp.hidden_units,
                                                zero_pad=False, scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen, num_units=hp.hidden_units,
                    zero_pad=False, scale=False, scope="dec_pe")
            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec, keys=self.dec,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=True, scope="self_attention")

                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec, keys=self.enc,
                        num_units=hp.hidden_units, num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate, is_training=is_training,
                        causality=False, scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.arg_max(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget
        ) / tf.reduce_sum(self.istarget)
        tf.summary.scalar('acc', self.acc)

        if is_training:
            # Loss
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            # Training Scheme
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=hp.lr, beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            # Summary
            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
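The only change from the previous Graph is the key_masks step: after the positional encoding is added, positions that were <PAD> are zeroed out again. The toy NumPy sketch below illustrates the effect; it assumes, as in the code above, that the embedding table is zero-padded so that <PAD> rows come out as all zeros.

import numpy as np

# Toy embeddings: (batch=1, T=4, units=3); the last two timesteps are <PAD>.
emb = np.array([[[0.5, -0.2, 0.1],
                 [0.3,  0.4, -0.7],
                 [0.0,  0.0,  0.0],
                 [0.0,  0.0,  0.0]]])

# key_masks = sign(sum(|emb|, axis=-1)): 1 for real tokens, 0 for <PAD>.
key_masks = np.sign(np.abs(emb).sum(axis=-1, keepdims=True))   # (1, 4, 1)

pos = np.random.randn(*emb.shape) * 0.01    # stand-in for the positional encoding
enc = (emb + pos) * key_masks               # positional signal is wiped at <PAD> slots
print(enc[0, 2:])                           # rows of zeros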
def eval():
    # Load graph
    g = Graph(is_training=False)
    print("Graph loaded")

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    # X, Sources, Targets = X[:33], Sources[:33], Targets[:33]

    # Start session
    with g.graph.as_default():
        sv = tf.train.Supervisor()
        gpu_options = tf.GPUOptions(allow_growth=True)
        with sv.managed_session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
            ## Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir))
            print("Restored!")

            ## Get model name
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1]  # model name

            ## Inference
            if not os.path.exists('results'):
                os.mkdir('results')
            with codecs.open("results/" + mname, "w", "utf-8") as fout:
                list_of_refs, hypotheses = [], []
                for i in range(len(X) // hp.batch_size):
                    ### Get mini-batches by slicing
                    x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
                    sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
                    targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]

                    ### Autoregressive inference
                    preds = np.zeros((hp.batch_size, hp.maxlen), np.int32)
                    for j in range(hp.maxlen):
                        # Run g.preds with g.y fed as the all-zero preds defined above;
                        # each pass predicts one more word for every sentence in the batch.
                        # Because multi-head attention applies the various masks, predicting
                        # the j-th word of y is not influenced by later words (sequence mask),
                        # and the decoder-encoder attention is not influenced by the 0 <PAD>
                        # tokens (query mask), so decoding one word at a time is valid.
                        _preds = sess.run(g.preds, {g.x: x, g.y: preds})
                        preds[:, j] = _preds[:, j]

                    ### Write to file
                    # zip pulls each sentence's source, target and pred out of the batch
                    for source, target, pred in zip(sources, targets, preds):  # sentence-wise
                        # " ".join rebuilds the whole sentence; keep only the part before </S>
                        got = " ".join(idx2en[idx] for idx in pred).split("</S>")[0].strip()
                        print(got)
                        fout.write("- source: " + source + "\n")
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()

                        # bleu score
                        ref = target.split()
                        hypothesis = got.split()
                        # Skip sentences of 3 tokens or fewer: BLEU scores very short
                        # sentences misleadingly high.
                        if len(ref) > 3 and len(hypothesis) > 3:
                            list_of_refs.append([ref])
                            hypotheses.append(hypothesis)

                ## Calculate bleu score
                # list_of_refs has shape (num sentences longer than 3) x 1 x (sentence length);
                # there is no batch dimension, since batching is only a training-time detail.
                score = corpus_bleu(list_of_refs, hypotheses)
                fout.write("Bleu Score = " + str(100 * score))
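The BLEU bookkeeping at the end of eval() can be checked in isolation. The sketch below assumes that corpus_bleu is NLTK's nltk.translate.bleu_score.corpus_bleu (the import is not shown in this section) and uses two made-up sentence pairs; each entry of list_of_refs is itself a list because corpus_bleu allows several references per hypothesis.

from nltk.translate.bleu_score import corpus_bleu

list_of_refs = [
    [["the", "cat", "sat", "on", "the", "mat"]],          # one reference per sentence
    [["there", "is", "a", "book", "on", "the", "desk"]],
]
hypotheses = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["there", "is", "a", "book", "near", "the", "desk"],
]
score = corpus_bleu(list_of_refs, hypotheses)
print("Bleu Score =", 100 * score)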