def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
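# The snippets in this section call label_smoothing() without defining it. A minimal
# sketch of the usual implementation (epsilon=0.1 as in "Attention Is All You Need");
# treat it as an assumption about the helper these codebases share:
import tensorflow as tf  # TF 1.x API assumed throughout

def label_smoothing(inputs, epsilon=0.1):
    '''Smooths one-hot labels: each 1 becomes 1 - epsilon + epsilon/V and each 0
    becomes epsilon/V, where V is the number of classes (last dimension).'''
    V = inputs.get_shape().as_list()[-1]  # number of channels
    return ((1 - epsilon) * inputs) + (epsilon / V)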
def loss_function(self, inputs):
    logits, label = inputs
    istarget = tf.to_float(tf.not_equal(label, 0))
    y_smoothed = label_smoothing(tf.one_hot(label, depth=self.out_vocab_len))
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_smoothed)
    mean_loss = tf.reduce_sum(loss * istarget) / tf.reduce_sum(istarget)
    return mean_loss
def train(self, xs, ys):
    # forward
    loss_weight = ys[-1]
    ys = ys[:-1]
    memory, sents1 = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # Up-weight the per-token cross entropy by the (broadcast) loss weight.
    a = ce * nonpadding * (1 + loss_weight)
    b = nonpadding
    loss = tf.reduce_sum(a) / (tf.reduce_sum(b) + 1e-7)

    # y and preds are both (N, T2) id tensors, so accuracy is element-wise equality
    # over non-padding positions. (The original compared tf.argmax(y, 1) with
    # tf.argmax(preds, 1), which compares positions of the largest ids, not tokens.)
    correct_prediction = tf.to_float(tf.equal(preds, y))
    accuracy = tf.reduce_sum(correct_prediction * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9,
                                       beta2=0.997, epsilon=1e-9)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("accuracy", accuracy)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def init(self):
    with self.graph.as_default():
        self.build_model()
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        norm_len = tf.reduce_sum(self.istarget)
        equal_val = tf.to_float(tf.equal(self.preds, self.y))
        self.acc = tf.reduce_sum(equal_val * self.istarget) / norm_len
        tf.summary.scalar('acc', self.acc)
        tf.summary.scalar('target_norm_len', norm_len)

        if self.is_training:
            # Loss
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(self.en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.logits, labels=self.y_smoothed)
            self.loss = tf.reduce_sum(self.loss * self.istarget) / norm_len

            # Training Scheme
            self.get_train_op()

            # Summary
            tf.summary.scalar('mean_loss', self.loss)
            self.merged = tf.summary.merge_all()
def train(self, xs, decode_inputs, y):
    '''
    Returns
    loss: per-token cross entropy, shape (N, T2); padding is not masked here.
    outputs: list of intermediate outputs, with the loss appended.
    scopes: list of the variable scope names that were used.
    '''
    # forward
    memory, src_masks, outputs, scopes = self.encode(xs)
    dec, outputs1, scopes1 = self.decode(decode_inputs, memory, src_masks)

    # Final linear projection (embedding weights are shared)
    outputs = outputs + outputs1
    scopes = scopes + scopes1
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
        # nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
        # loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(loss)
    return loss, outputs, scopes
def train(self, xs, ys):
    # Forward
    memory, sents1 = self.encode(xs)
    logits, preds, y, sent2 = self.decode(ys, memory)

    # Train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx['<PAD>']))
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar('loss', loss)
    tf.summary.scalar('global_step', global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
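# noam_scheme() is used throughout these snippets but never defined here. A minimal
# sketch of the Noam learning-rate schedule from "Attention Is All You Need" (linear
# warmup, then inverse square-root decay), hedged as an assumption about the shared helper:
import tensorflow as tf

def noam_scheme(init_lr, global_step, warmup_steps=4000.):
    '''lr rises linearly for warmup_steps, then decays proportionally to step^-0.5.'''
    step = tf.cast(global_step + 1, dtype=tf.float32)
    return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5,
                                                      step ** -0.5)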
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # Orthogonality regularization on the (tied) embedding matrix.
    if self.hp.io_tie and self.hp.ortho_embedding:
        lmb = self.hp.ortho_lambda
        normlevel = self.hp.ortho_reg_norm
        if not self.hp.fac_embed:
            real_embedding = self.embeddings[1:, :]  # drop the <pad> row
            if not (self.hp.norm_embedding or self.embedding_normalization):
                # Penalize ||W^T W - 2I||^2.
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(tf.transpose(real_embedding), real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                    ord=normlevel) ** 2) * lmb
            else:
                # Penalize only the off-diagonal entries of W^T W.
                wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
        else:
            loss = loss + (tf.norm(
                tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                    tf.eye(self.hp.d_embed)),
                ord=normlevel) ** 2) * lmb
            # loss = loss + tf.norm(tf.subtract(
            #     tf.matmul(tf.transpose(self.embeddings1), self.embeddings1),
            #     tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys, x_paraphrased_dict, synonym_label=None):
    # forward
    memory, sents1 = self.encode(xs)
    _, _, synonym_label_loss = self.labeling(synonym_label, memory)
    logits, preds, y, sents2 = self.decode(ys, x_paraphrased_dict, memory)

    # train scheme
    # generation loss
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # multi-task loss: blend the generation and synonym-labeling objectives
    tloss = self.hp.l_alpha * loss + (1.0 - self.hp.l_alpha) * synonym_label_loss

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(tloss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("tloss", tloss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def build_model(self, inputs, labels=None):
    batch_size = tf.shape(inputs)[0]
    # forward
    memory = self.encode(inputs, self.train_mode)
    if self.train_mode or labels is not None:
        decoder_inputs = labels[:, :-1]
        decoder_targets = labels[:, 1:]
        logits, self.preds = self.decode(decoder_inputs, memory,
                                         self.train_mode)  # self.preds is the argmax of the logits
        # train scheme
        if self.hp.label_smoothing:
            y_ = label_smoothing(tf.one_hot(decoder_targets, depth=self.hp.VOCAB_SIZE))
            ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)  # labels (one-hot)
            nonpadding = tf.to_float(tf.not_equal(decoder_targets, self.token2idx["<pad>"]))  # 0: <pad>
            self.loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        else:
            weights = tf.ones(shape=[batch_size, self.hp.OUTPUT_LENGTH])
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=logits, targets=decoder_targets,
                weights=weights)  # targets (not one-hot)
    else:
        init_decoder_inputs = tf.ones((batch_size, 1), tf.int32) * self.token2idx["<sos>"]
        decoder_inputs = init_decoder_inputs
        for _ in range(self.hp.OUTPUT_LENGTH):
            logits, y_hat = self.decode(decoder_inputs, memory, training=False)
            # Keep only the last generated token, or the whole sequence?
            # Here the whole generated sequence is fed back in behind <sos>.
            decoder_inputs = tf.concat([init_decoder_inputs, y_hat], axis=-1)
        self.preds = y_hat
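# Note on the sequence_loss branch above: all-ones weights make <pad> positions count
# towards the loss, unlike the label-smoothing branch. A hedged sketch of a masked
# alternative, assuming pad id 0 (a drop-in replacement for the two lines above):
weights = tf.to_float(tf.not_equal(decoder_targets, 0))  # 1.0 for real tokens, 0.0 for <pad>
loss = tf.contrib.seq2seq.sequence_loss(logits=logits,
                                        targets=decoder_targets,
                                        weights=weights)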
def __init__(self, hp):
    self.hp = hp
    self.token2idx, self.idx2token = load_vocab(hp.vocab)
    self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model,
                                           zero_pad=True)

    self.input_x = tf.placeholder(dtype=tf.int32, shape=(None, None), name="input_x")
    self.decoder_input = tf.placeholder(dtype=tf.int32, shape=(None, None),
                                        name="decoder_input")
    self.target = tf.placeholder(dtype=tf.int32, shape=(None, None), name="target")
    self.is_training = tf.placeholder(dtype=tf.bool, name="is_training")

    # encoder
    self.encoder_hidden = self.encode(self.input_x, training=self.is_training)
    # decoder
    self.logits = self.decode(self.decoder_input, self.encoder_hidden,
                              training=self.is_training)
    self.y_hat = tf.to_int32(tf.argmax(self.logits, axis=-1), name="y_predict_v2")

    # loss
    self.smoothing_y = label_smoothing(
        tf.one_hot(self.target, depth=self.hp.vocab_size))
    self.ce_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=self.logits, labels=self.smoothing_y)
    nonpadding = tf.to_float(tf.not_equal(self.target, self.token2idx["<pad>"]))
    self.loss = tf.reduce_sum(self.ce_loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # optimize
    self.global_step = tf.train.get_or_create_global_step()
    self.lr = noam_scheme(self.hp.lr, self.global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.minimize(self.loss, global_step=self.global_step)

    # tensorboard
    tf.summary.scalar('lr', self.lr)
    tf.summary.scalar("loss", self.loss)
    tf.summary.scalar("global_step", self.global_step)
    self.summaries = tf.summary.merge_all()

    # predict part
    self.y_predict = tf.identity(self.greedy_search(), name="y_predict")
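# get_token_embeddings() is assumed rather than shown. A minimal sketch of the common
# implementation, where zero_pad pins row 0 (the <pad> id) to the zero vector:
import tensorflow as tf

def get_token_embeddings(vocab_size, num_units, zero_pad=True):
    with tf.variable_scope("shared_weight_matrix"):
        embeddings = tf.get_variable('weight_mat',
                                     dtype=tf.float32,
                                     shape=(vocab_size, num_units))
        if zero_pad:
            # Replace the first row with zeros so <pad> embeds to the zero vector.
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings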
def _loss_op(self, l2_lambda=0.0001):
    with tf.name_scope('cost'):
        self.y_smoothed = label_smoothing(self.y)
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.y_smoothed, logits=self.logits)
        loss = tf.reduce_mean(losses, name='loss_val')
        weights = [v for v in tf.trainable_variables()
                   if ('w' in v.name) or ('kernel' in v.name)]
        l2_loss = tf.add_n([tf.nn.l2_loss(w) for w in weights]) * l2_lambda
        loss += l2_loss
    return loss
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward (commented-out tf.Print/print debugging has been removed)
    memory, sents1 = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
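# The snippet above originally experimented with element-wise tf.clip_by_value on the
# gradients before settling on plain minimize(). If clipping is wanted, global-norm
# clipping is the more common choice; a hedged sketch (the clip norm 5.0 is an assumed
# value, not taken from the original):
grads_and_vars = optimizer.compute_gradients(loss)
grads, variables = zip(*grads_and_vars)
clipped_grads, _ = tf.clip_by_global_norm(grads, 5.0)  # rescale if total norm > 5.0
train_op = optimizer.apply_gradients(zip(clipped_grads, variables),
                                     global_step=global_step)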
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    # y: (N, T2), e.g. [[5768 7128 7492 7128 7492 4501 7128 7128 14651], [5768 ...]]
    # y_: (N, T2, vocab_size); each row is a smoothed one-hot vector like [0, 0.999, 0, ..., 0]
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    # Cross entropy between predictions and labels gives the per-token loss.
    # logits: predicted id scores, (N, T2, vocab_size)
    # ce: (N, T2), e.g. (4, 42) with values like [[6.825, 6.602, ..., 10.001], [6.850, ...]]
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    # nonpadding: (N, T2), e.g. (4, 42) with values like [[1., 1., 1., 0., 0., 0.], ...]
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # tf.reduce_sum sums over a given axis (all axes if none is specified).
    # ce * nonpadding keeps only the loss of real tokens and drops the padded ones;
    # tf.reduce_sum(nonpadding) counts those tokens so the sum becomes a mean.
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    # The learning rate changes dynamically with the training step.
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    # Define the optimizer.
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):  # trains the model
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # forward
    # Call encode() and decode() to obtain the outputs of each part.
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    # one_hot marks each word's index within the vocabulary, which builds the training
    # target: in the vocab_size-sized logits vector, the value at the target word's
    # position (index) should be as large as possible and all other positions as small
    # as possible. With outputs and labels constructed, training uses
    # tf.nn.softmax_cross_entropy_with_logits().
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))  # label_smoothing smooths the one-hot labels
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    # Before computing the loss, one more step is needed: sentences shorter than
    # maxlen were padded at the start, so the error at those positions is zeroed out.
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    # The loss is the cross entropy, computed with the padding contribution removed.
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # Adjust the learning rate with warmup: lr rises in the initial phase and
    # decays in later iterations.
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)
    # Finally, the loss is optimized with the AdamOptimizer.
    # tf.summary.scalar() stores values as key-value pairs so they can be
    # visualized in TensorBoard.
    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def train(self, xs, ys):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # Build the encoder and decoder.
    memory, sents1 = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))  # batch_size * T * vocab_size
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=y_)  # logits are pre-softmax; the function applies softmax internally
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))
    # 0: <pad>. This acts as a mask: for variable-length sequences the padded
    # positions must be excluded when computing the loss.
    # Average the loss over the whole batch; 1e-7 guards against a zero denominator.
    # ce * nonpadding computes the loss only over non-padding tokens, and dividing
    # by tf.reduce_sum(nonpadding) turns the sum into a batch-wide mean.
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)
    train_op = optimizer.minimize(loss, global_step=global_step)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
def batch_split_train(self, xs, ys, split_num=4):
    '''
    Returns
    loss: scalar.
    train_op: training operation
    global_step: scalar.
    summaries: training summary node
    '''
    # Gradients are accumulated over split_num sub-batches and applied together,
    # so the learning-rate schedule advances once per effective batch.
    # (Dead commented-out batch-splitting scaffolding has been removed.)
    global_step = tf.train.get_or_create_global_step()
    lr = noam_scheme(self.hp.lr, global_step // split_num, self.hp.warmup_steps)
    optimizer = tf.train.AdamOptimizer(lr)

    # forward
    memory, sents1, src_masks = self.encode(xs)
    logits, preds, y, sents2 = self.decode(ys, memory, src_masks)

    # train scheme
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # Orthogonality regularization on the (tied) embedding matrix, as in train().
    if self.hp.io_tie and self.hp.ortho_embedding:
        lmb = self.hp.ortho_lambda
        normlevel = self.hp.ortho_reg_norm
        if not self.hp.fac_embed:
            real_embedding = self.embeddings[1:, :]
            if not (self.hp.norm_embedding or self.embedding_normalization):
                loss = loss + (tf.norm(
                    tf.subtract(
                        tf.matmul(tf.transpose(real_embedding), real_embedding),
                        tf.scalar_mul(tf.constant(2.0, dtype=tf.float32),
                                      tf.eye(self.hp.d_model))),
                    ord=normlevel) ** 2) * lmb
            else:
                wtw = tf.matmul(tf.transpose(real_embedding), real_embedding)
                wtw_diag = tf.linalg.diag(tf.linalg.diag_part(wtw))
                loss = loss + (tf.norm(tf.subtract(wtw, wtw_diag)) ** 2) * lmb
        else:
            loss = loss + (tf.norm(
                tf.subtract(
                    tf.matmul(self.embeddings2, tf.transpose(self.embeddings2)),
                    tf.eye(self.hp.d_embed)),
                ord=normlevel) ** 2) * lmb
            # loss = loss + tf.norm(tf.subtract(
            #     tf.matmul(tf.transpose(self.embeddings1), self.embeddings1),
            #     tf.eye(self.hp.d_embed)), ord=normlevel) * lmb  # if not good, delete this

    grads = optimizer.compute_gradients(loss)
    self.steps.append((loss, grads))
    if len(self.steps) == split_num:
        tower_losses, tower_grads = zip(*self.steps)
        train_op = optimizer.apply_gradients(
            average_gradients(tower_grads), global_step=global_step)
        self.steps = []
    else:
        # apply_gradients([]) raises "No variables provided" in TF1, so just
        # advance the step counter while gradients are still being accumulated.
        train_op = tf.assign_add(global_step, 1)

    tf.summary.scalar('lr', lr)
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("global_step", global_step)
    summaries = tf.summary.merge_all()

    return loss, train_op, global_step, summaries
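# average_gradients() is referenced above but not defined in this snippet. A minimal
# sketch following the classic multi-tower pattern: average each variable's gradient
# across the accumulated sub-batches (assumes no None gradients):
import tensorflow as tf

def average_gradients(tower_grads):
    '''tower_grads: list over towers of lists of (gradient, variable) pairs.
    Returns a single list of averaged (gradient, variable) pairs.'''
    average_grads = []
    for grad_and_vars in zip(*tower_grads):  # iterate variable-wise across towers
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        average_grads.append((grad, grad_and_vars[0][1]))  # all towers share the variable
    return average_grads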
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs
        # id = 2 stands for <S>, the decoder's initial input. This step shifts the
        # target y: e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]]
        # becomes [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]],
        # which is what the decoder's self-attention sees first.
        # During training decoder_inputs is built as above; at inference the true y
        # is unknown, so y starts as an all-zero tensor of shape [batch_size, max_length].
        # It then looks like [["<s>", 0, 0, 0]]: each step keeps the first prediction,
        # feeds it back in, takes the first two results, and so on.
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                zero_pad=True,  # row id 0 is the padding embedding; True zeroes it out (random init may not be 0)
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks: stack num_blocks (6) of them
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")
                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")
                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection: a classification task whose number of classes
        # is the vocabulary size.
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)

        if is_training:
            # Loss
            # Label smoothing turns the 0s of the one-hot vector into a small
            # number and the 1 into a number slightly below 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def __init__(self, training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if training:
            self.x, self.y, self.num_batch = get_batch()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.max_len))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.max_len))

        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_data.load_vocab('./preprocessed/de.vocab.tsv')
        en2idx, idx2en = load_data.load_vocab('./preprocessed/en.vocab.tsv')

        self.embedding = get_token_embeddings(len(de2idx), hp.hidden_units,
                                              zero_pad=True)

        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            self.enc = tf.nn.embedding_lookup(self.embedding, self.x)
            # scale
            self.enc *= hp.hidden_units ** 0.5
            # positional encoding
            self.enc += positional_encoding(self.enc)
            self.enc = tf.layers.dropout(self.enc, hp.dropout_rate, training=training)

            for i in range(hp.num_blocks):
                with tf.variable_scope('num_blocks_{}'.format(i), reuse=tf.AUTO_REUSE):
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        values=self.enc,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        training=training,
                        causality=False)
                    self.enc = ff(self.enc, num_units=[hp.d_ff, hp.hidden_units])

        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            self.dec = tf.nn.embedding_lookup(self.embedding, self.decoder_inputs)
            self.dec *= hp.hidden_units ** 0.5
            self.dec += positional_encoding(self.dec)
            self.dec = tf.layers.dropout(self.dec, hp.dropout_rate, training=training)

            for i in range(hp.num_blocks):
                with tf.variable_scope('num_block_{}'.format(i), reuse=tf.AUTO_REUSE):
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        values=self.dec,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope='self_attention')
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        values=self.enc,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope='vanilla_attention')
                    self.dec = ff(self.dec, num_units=[hp.d_ff, hp.hidden_units])

        self.logits = tf.layers.dense(self.dec, len(en2idx))
        # tf.arg_max(..., dimension=...) is deprecated; use tf.argmax with axis.
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)
        tf.summary.scalar('acc', self.acc)

        if training:
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
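# positional_encoding() appears with several signatures across these snippets (taking
# id tensors in some, embedded tensors in others). A minimal sketch of the sinusoidal
# table from Vaswani et al., taking an embedded (N, T, E) tensor as in the snippet
# above; maxlen is an assumed static upper bound on T:
import numpy as np
import tensorflow as tf

def positional_encoding(inputs, maxlen=512):
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]
    E = inputs.get_shape().as_list()[-1]  # static embedding size
    # PE(pos, 2i) = sin(pos / 10000^(2i/E)), PE(pos, 2i+1) = cos(pos / 10000^(2i/E))
    position_enc = np.array([
        [pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
        for pos in range(maxlen)])
    position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
    position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])
    lookup_table = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen, E)
    position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])  # (N, T)
    return tf.nn.embedding_lookup(lookup_table, position_ind)       # (N, T, E)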
def build(self):
    position_encoding_outputs = modules.position_encoding(self.x_input,
                                                          args.position_size)
    if args.position_encoding_type == 'add':
        outputs = position_encoding_outputs + self.x_input
    elif args.position_encoding_type == 'concat':
        outputs = tf.concat([self.x_input, position_encoding_outputs], axis=2)

    for i in range(6):
        sublayer1 = modules.multi_head_attention(
            outputs, outputs, outputs,
            args.head_num, args.head_size,
            self.dropout, self.training,
            type=args.attention_unit_type)
        self.mhas.append(sublayer1)
        outputs = modules.residual_connection(outputs, sublayer1, self.training)
        sublayer2 = modules.feed_forward(outputs, args.feed_forward_size,
                                         self.dropout, self.training)
        outputs = modules.residual_connection(outputs, sublayer2, self.training)

    outputs = tf.layers.dense(outputs, 1, use_bias=True, name='last_output')
    outputs = tf.squeeze(outputs, -1)  # (batch_size, seqlen)
    outputs = tf.layers.dense(outputs, args.nlabel, name='output_logit')
    self.logits = outputs
    self.logits_softmax = tf.nn.softmax(outputs, name='output_logit_softmax')

    if self.training is not None:
        util.params_usage(tf.trainable_variables())

        y = tf.one_hot(self.y_true, args.nlabel)
        self.y_smooth = modules.label_smoothing(y)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=self.y_smooth,
                                                          logits=self.logits)
        self.loss = tf.reduce_mean(loss)

        self.global_step = tf.train.get_or_create_global_step()
        self.lr = modules.noam_scheme(args.eta,
                                      global_step=self.global_step,
                                      warmup_steps=args.warmup)
        train_op = tf.compat.v1.train.AdamOptimizer(self.lr).minimize(
            self.loss, global_step=self.global_step)
        # Run the batch-norm (and other) update ops together with the train op.
        # update_ops is a list, so splice it in rather than nesting it inside
        # another list (nested lists only flatten in newer TF versions).
        update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
        self.train_op = tf.group([train_op] + update_ops)

        self.y_pred = tf.argmax(self.logits_softmax, axis=1, name="y_pred")
        pred_prob = tf.equal(tf.cast(self.y_pred, tf.int32), self.y_true)
        self.accuracy = tf.reduce_mean(tf.cast(pred_prob, tf.float32),
                                       name="accuracy")

        tf.compat.v1.summary.scalar('accuracy', self.accuracy)
        tf.compat.v1.summary.scalar('loss', self.loss)
        tf.compat.v1.summary.scalar('learning rate', self.lr)
        self.merged_summary_op = tf.compat.v1.summary.merge_all()
def eval(self, xs, ys):
    '''Predicts autoregressively.
    At inference, input ys is ignored.
    Returns
    y_hat: (N, T2)
    '''
    decoder_inputs, y, y_seqlen, sents2 = ys

    # decoder_inputs (N, 1): start every sequence with <s>
    decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1),
                             tf.int32) * self.token2idx["<s>"]
    ys = (decoder_inputs, y, y_seqlen, sents2)

    memory, sents1 = self.encode(xs, False)

    logging.info("Inference graph is being built. Please be patient.")
    # Greedy decoding: feed the tokens generated so far back into the decoder, one
    # position per step, up to maxlen2. (Early stopping on <pad>/<s> was attempted
    # with tf.cond and tf.while_loop in commented-out code, but the `tensor == int`
    # test it relied on is always False in TF1, so both branches were identical;
    # that dead code has been removed.)
    for _ in tqdm(range(self.hp.maxlen2)):
        logits, y_hat, y, sents2 = self.decode(ys, memory, False)
        _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
        ys = (_decoder_inputs, y, y_seqlen, sents2)

    # loss
    y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
    ce = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits[:, :tf.shape(y_)[1], :], labels=y_)
    nonpadding = tf.to_float(tf.not_equal(y, self.token2idx["<pad>"]))  # 0: <pad>
    loss = tf.reduce_sum(ce * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)

    # monitor a random sample
    n = tf.random_uniform((), 0, tf.shape(y_hat)[0] - 1, tf.int32)
    sent1 = sents1[n]
    pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token)
    sent2 = sents2[n]

    tf.summary.text("sent1", sent1)
    tf.summary.text("pred", pred)
    tf.summary.text("sent2", sent2)
    summaries = tf.summary.merge_all()

    return y_hat, summaries, loss
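# convert_idx_to_token_tensor() is referenced above but not defined. A plausible
# sketch modeled on the usual py_func-based helper (treat the exact behavior as an
# assumption): maps a vector of ids to a single space-joined string tensor.
import tensorflow as tf

def convert_idx_to_token_tensor(inputs, idx2token):
    '''inputs: 1-D int32 tensor of token ids. Returns a scalar string tensor.'''
    def my_func(inputs):
        return " ".join(idx2token[elem] for elem in inputs)
    return tf.py_func(my_func, [inputs], tf.string)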
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            # x: (32, 10), y: (32, 10) - a batch of 32 sentences, each of length 10
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        """
        Define the decoder inputs.
        If the true translated output is: i am a student </S>
        then the decoder input should be: <S> i am a student
        """
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
            -1)  # 2 stands for <S>, the decoder's initial input

        # vocabularies
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                zero_pad=True,  # keep the padding embedding at 0
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                            [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            ## Positional Encoding (the stray vocab_size argument has been dropped
            ## to match the encoder-side call)
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(
                    tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                            [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")
                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")
                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget) / tf.reduce_sum(self.istarget)

        if is_training:
            # Loss
            # Label smoothing turns the 0s of the one-hot vector into a small
            # number and the 1 into a number slightly below 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / tf.reduce_sum(self.istarget)

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def construct_network(self):
    """
    Constructs a variant of the multi-head attention labeller (MHAL)
    that does not use keys, queries and values, but only a simple form
    of additive attention, as proposed by Yang et al. (2016).
    """
    self.word_ids = tf.placeholder(tf.int32, [None, None], name="word_ids")
    self.char_ids = tf.placeholder(tf.int32, [None, None, None], name="char_ids")
    self.sentence_lengths = tf.placeholder(tf.int32, [None], name="sentence_lengths")
    self.word_lengths = tf.placeholder(tf.int32, [None, None], name="word_lengths")
    self.sentence_labels = tf.placeholder(tf.float32, [None], name="sentence_labels")
    self.word_labels = tf.placeholder(tf.float32, [None, None], name="word_labels")
    self.word_objective_weights = tf.placeholder(
        tf.float32, [None, None], name="word_objective_weights")
    self.sentence_objective_weights = tf.placeholder(
        tf.float32, [None], name="sentence_objective_weights")
    self.learning_rate = tf.placeholder(tf.float32, name="learning_rate")
    self.is_training = tf.placeholder(tf.int32, name="is_training")
    self.loss = 0.0

    if self.config["initializer"] == "normal":
        self.initializer = tf.random_normal_initializer(stddev=0.1)
    elif self.config["initializer"] == "glorot":
        self.initializer = tf.glorot_uniform_initializer()
    elif self.config["initializer"] == "xavier":
        self.initializer = tf.glorot_normal_initializer()
    zeros_initializer = tf.zeros_initializer()

    self.word_embeddings = tf.get_variable(
        name="word_embeddings",
        shape=[len(self.word2id), self.config["word_embedding_size"]],
        initializer=(zeros_initializer if self.config["emb_initial_zero"]
                     else self.initializer),
        trainable=(True if self.config["train_embeddings"] else False))
    word_input_tensor = tf.nn.embedding_lookup(self.word_embeddings, self.word_ids)

    if self.config["char_embedding_size"] > 0 and self.config["char_recurrent_size"] > 0:
        with tf.variable_scope("chars"), tf.control_dependencies([
                tf.assert_equal(tf.shape(self.char_ids)[2],
                                tf.reduce_max(self.word_lengths),
                                message="Char dimensions don't match")]):
            self.char_embeddings = tf.get_variable(
                name="char_embeddings",
                shape=[len(self.char2id), self.config["char_embedding_size"]],
                initializer=self.initializer,
                trainable=True)
            char_input_tensor = tf.nn.embedding_lookup(self.char_embeddings,
                                                       self.char_ids)

            char_input_tensor_shape = tf.shape(char_input_tensor)
            char_input_tensor = tf.reshape(
                char_input_tensor,
                shape=[char_input_tensor_shape[0] * char_input_tensor_shape[1],
                       char_input_tensor_shape[2],
                       self.config["char_embedding_size"]])
            _word_lengths = tf.reshape(
                self.word_lengths,
                shape=[char_input_tensor_shape[0] * char_input_tensor_shape[1]])

            char_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
                self.config["char_recurrent_size"],
                use_peepholes=self.config["lstm_use_peepholes"],
                state_is_tuple=True,
                initializer=self.initializer,
                reuse=False)
            char_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
                self.config["char_recurrent_size"],
                use_peepholes=self.config["lstm_use_peepholes"],
                state_is_tuple=True,
                initializer=self.initializer,
                reuse=False)

            # Concatenate the final forward and the backward character contexts
            # to obtain a compact character representation for each word.
            _, ((_, char_output_fw), (_, char_output_bw)) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=char_lstm_cell_fw,
                cell_bw=char_lstm_cell_bw,
                inputs=char_input_tensor,
                sequence_length=_word_lengths,
                dtype=tf.float32,
                time_major=False)
            char_output_tensor = tf.concat([char_output_fw, char_output_bw], axis=-1)
            char_output_tensor = tf.reshape(
                char_output_tensor,
                shape=[char_input_tensor_shape[0],
                       char_input_tensor_shape[1],
                       2 * self.config["char_recurrent_size"]])

            # Include a char-based language modelling loss, LMc.
            if self.config["lm_cost_char_gamma"] > 0.0:
                self.loss += self.config["lm_cost_char_gamma"] * \
                    self.construct_lm_cost(
                        input_tensor_fw=char_output_tensor,
                        input_tensor_bw=char_output_tensor,
                        sentence_lengths=self.sentence_lengths,
                        target_ids=self.word_ids,
                        lm_cost_type="separate",
                        name="lm_cost_char_separate")

            if self.config["lm_cost_joint_char_gamma"] > 0.0:
                self.loss += self.config["lm_cost_joint_char_gamma"] * \
                    self.construct_lm_cost(
                        input_tensor_fw=char_output_tensor,
                        input_tensor_bw=char_output_tensor,
                        sentence_lengths=self.sentence_lengths,
                        target_ids=self.word_ids,
                        lm_cost_type="joint",
                        name="lm_cost_char_joint")

            if self.config["char_hidden_layer_size"] > 0:
                char_output_tensor = tf.layers.dense(
                    inputs=char_output_tensor,
                    units=self.config["char_hidden_layer_size"],
                    activation=tf.tanh,
                    kernel_initializer=self.initializer)

            if self.config["char_integration_method"] == "concat":
                word_input_tensor = tf.concat(
                    [word_input_tensor, char_output_tensor], axis=-1)
            elif self.config["char_integration_method"] == "none":
                word_input_tensor = word_input_tensor
            else:
                raise ValueError("Unknown char integration method")

    if self.config["dropout_input"] > 0.0:
        dropout_input = (self.config["dropout_input"]
                         * tf.cast(self.is_training, tf.float32)
                         + (1.0 - tf.cast(self.is_training, tf.float32)))
        word_input_tensor = tf.nn.dropout(word_input_tensor, dropout_input,
                                          name="dropout_word")

    word_lstm_cell_fw = tf.nn.rnn_cell.LSTMCell(
        self.config["word_recurrent_size"],
        use_peepholes=self.config["lstm_use_peepholes"],
        state_is_tuple=True,
        initializer=self.initializer,
        reuse=False)
    word_lstm_cell_bw = tf.nn.rnn_cell.LSTMCell(
        self.config["word_recurrent_size"],
        use_peepholes=self.config["lstm_use_peepholes"],
        state_is_tuple=True,
        initializer=self.initializer,
        reuse=False)

    with tf.control_dependencies([
            tf.assert_equal(tf.shape(self.word_ids)[1],
                            tf.reduce_max(self.sentence_lengths),
                            message="Sentence dimensions don't match")]):
        (lstm_outputs_fw, lstm_outputs_bw), ((_, lstm_output_fw), (_, lstm_output_bw)) = \
            tf.nn.bidirectional_dynamic_rnn(
                cell_fw=word_lstm_cell_fw,
                cell_bw=word_lstm_cell_bw,
                inputs=word_input_tensor,
                sequence_length=self.sentence_lengths,
                dtype=tf.float32,
                time_major=False)

    lstm_output_states = tf.concat([lstm_output_fw, lstm_output_bw], axis=-1)

    if self.config["dropout_word_lstm"] > 0.0:
        dropout_word_lstm = (self.config["dropout_word_lstm"]
                             * tf.cast(self.is_training, tf.float32)
                             + (1.0 - tf.cast(self.is_training, tf.float32)))
        lstm_outputs_fw = tf.nn.dropout(
            lstm_outputs_fw, dropout_word_lstm,
            noise_shape=tf.convert_to_tensor(
                [tf.shape(self.word_ids)[0], 1, self.config["word_recurrent_size"]],
                dtype=tf.int32))
        lstm_outputs_bw = tf.nn.dropout(
            lstm_outputs_bw, dropout_word_lstm,
            noise_shape=tf.convert_to_tensor(
                [tf.shape(self.word_ids)[0], 1, self.config["word_recurrent_size"]],
                dtype=tf.int32))
        lstm_output_states = tf.nn.dropout(lstm_output_states, dropout_word_lstm)

    # The forward and backward states are concatenated at every token position.
    lstm_outputs_states = tf.concat([lstm_outputs_fw, lstm_outputs_bw], axis=-1)

    if self.config["whidden_layer_size"] > 0:
        lstm_outputs_states = tf.layers.dense(
            lstm_outputs_states, self.config["whidden_layer_size"],
            activation=tf.tanh, kernel_initializer=self.initializer)

    if self.config["model_type"] == "last":
        processed_tensor = lstm_output_states
        token_scores = tf.layers.dense(
            lstm_outputs_states,
            units=len(self.label2id_tok),
            kernel_initializer=self.initializer,
            name="token_scores_last_lstm_outputs_ff")
        if self.config["hidden_layer_size"] > 0:
            processed_tensor = tf.layers.dense(
                processed_tensor,
                units=self.config["hidden_layer_size"],
                activation=tf.tanh,
                kernel_initializer=self.initializer)
        sentence_scores = tf.layers.dense(
            processed_tensor,
            units=len(self.label2id_sent),
            kernel_initializer=self.initializer,
            name="sentence_scores_last_lstm_outputs_ff")
    else:
        with tf.variable_scope("attention"):
            token_scores_list = []
            sentence_scores_list = []
            for i in range(len(self.label2id_tok)):
                keys = tf.layers.dense(
                    lstm_outputs_states,
                    units=self.config["attention_evidence_size"],
                    activation=tf.tanh,
                    kernel_initializer=self.initializer)
                values = tf.layers.dense(
                    lstm_outputs_states,
                    units=self.config["attention_evidence_size"],
                    activation=tf.tanh,
                    kernel_initializer=self.initializer)

                token_scores_head = tf.layers.dense(
                    keys, units=1,
                    kernel_initializer=self.initializer)  # [B, M, 1]
                token_scores_head = tf.reshape(
                    token_scores_head, shape=tf.shape(self.word_ids))  # [B, M]
                token_scores_list.append(token_scores_head)

                if self.config["attention_activation"] == "sharp":
                    attention_weights_unnormalized = tf.exp(token_scores_head)
                elif self.config["attention_activation"] == "soft":
                    attention_weights_unnormalized = tf.sigmoid(token_scores_head)
                elif self.config["attention_activation"] == "linear":
                    attention_weights_unnormalized = token_scores_head
                else:
                    raise ValueError(
                        "Unknown/unsupported token scoring method: %s"
                        % self.config["attention_activation"])
                attention_weights_unnormalized = tf.where(
                    tf.sequence_mask(self.sentence_lengths),
                    attention_weights_unnormalized,
                    tf.zeros_like(attention_weights_unnormalized))
                attention_weights = attention_weights_unnormalized / tf.reduce_sum(
                    attention_weights_unnormalized, axis=1, keep_dims=True)  # [B, M]

                processed_tensor = tf.reduce_sum(
                    values * attention_weights[:, :, numpy.newaxis], axis=1)  # [B, E]
                if self.config["hidden_layer_size"] > 0:
                    processed_tensor = tf.layers.dense(
                        processed_tensor,
                        units=self.config["hidden_layer_size"],
                        activation=tf.tanh,
                        kernel_initializer=self.initializer)

                sentence_score_head = tf.layers.dense(
                    processed_tensor, units=1,
                    kernel_initializer=self.initializer,
                    name="output_ff_head_%d" % i)  # [B, 1]
                sentence_score_head = tf.reshape(
                    sentence_score_head,
                    shape=[tf.shape(processed_tensor)[0]])  # [B]
                sentence_scores_list.append(sentence_score_head)

            token_scores = tf.stack(token_scores_list, axis=-1)  # [B, M, H]
            all_sentence_scores = tf.stack(sentence_scores_list, axis=-1)  # [B, H]

            if len(self.label2id_tok) != len(self.label2id_sent):
                if len(self.label2id_sent) == 2:
                    default_sentence_score = tf.gather(
                        all_sentence_scores, indices=[0], axis=1)  # [B, 1]
                    maximum_non_default_sentence_score = tf.gather(
                        all_sentence_scores,
                        indices=list(range(1, len(self.label2id_tok))),
                        axis=1)  # [B, num_heads-1]
                    maximum_non_default_sentence_score = tf.reduce_max(
                        maximum_non_default_sentence_score,
                        axis=1, keep_dims=True)  # [B, 1]
                    sentence_scores = tf.concat(
                        [default_sentence_score,
                         maximum_non_default_sentence_score],
                        axis=-1,
name="sentence_scores_concatenation") # [B, 2] else: sentence_scores = tf.layers.dense( all_sentence_scores, units=len(self.label2id_sent), kernel_initializer=self.initializer ) # [B, num_sent_labels] else: sentence_scores = all_sentence_scores # Mask the token scores that do not fall in the range of the true sentence length. # Do this for each head (change shape from [B, M] to [B, M, num_heads]). tiled_sentence_lengths = tf.tile( input=tf.expand_dims(tf.sequence_mask(self.sentence_lengths), axis=-1), multiples=[1, 1, len(self.label2id_tok)]) self.token_probabilities = tf.nn.softmax(token_scores, axis=-1) self.token_probabilities = tf.where( tiled_sentence_lengths, self.token_probabilities, tf.zeros_like(self.token_probabilities)) self.token_predictions = tf.argmax(self.token_probabilities, axis=2) self.sentence_probabilities = tf.nn.softmax(sentence_scores) self.sentence_predictions = tf.argmax(self.sentence_probabilities, axis=1) if self.config["word_objective_weight"] > 0: word_objective_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=token_scores, labels=tf.cast(self.word_labels, tf.int32)) word_objective_loss = tf.where( tf.sequence_mask(self.sentence_lengths), word_objective_loss, tf.zeros_like(word_objective_loss)) self.loss += self.config["word_objective_weight"] * tf.reduce_sum( self.word_objective_weights * word_objective_loss) if self.config["sentence_objective_weight"] > 0: self.loss += self.config[ "sentence_objective_weight"] * tf.reduce_sum( self.sentence_objective_weights * tf.nn.sparse_softmax_cross_entropy_with_logits( logits=sentence_scores, labels=tf.cast(self.sentence_labels, tf.int32))) max_over_token_heads = tf.reduce_max(self.token_probabilities, axis=1) # [B, H] one_hot_sentence_labels = tf.one_hot(tf.cast(self.sentence_labels, tf.int32), depth=len(self.label2id_sent)) if self.config["enable_label_smoothing"]: one_hot_sentence_labels_smoothed = label_smoothing( one_hot_sentence_labels, epsilon=self.config["smoothing_epsilon"]) else: one_hot_sentence_labels_smoothed = one_hot_sentence_labels # At least one token has a label corresponding to the true sentence label. # This loss also pushes the maximums over the other heads towards 0 (but smoothed). if self.config["type1_attention_objective_weight"] > 0: this_max_over_token_heads = max_over_token_heads if len(self.label2id_tok) != len(self.label2id_sent): if len(self.label2id_sent) == 2: max_default_head = tf.gather(max_over_token_heads, indices=[0], axis=-1) # [B, 1] max_non_default_head = tf.reduce_max( tf.gather(max_over_token_heads, indices=list(range(1, len(self.label2id_tok))), axis=-1), axis=1, keep_dims=True) # [B, 1] this_max_over_token_heads = tf.concat( [max_default_head, max_non_default_head], axis=-1) # [B, 2] else: raise ValueError( "Unsupported attention loss for num_heads != num_sent_lables " "and num_sentence_labels != 2.") self.loss += self.config["type1_attention_objective_weight"] * ( tf.reduce_sum(self.sentence_objective_weights * tf.reduce_sum( tf.square(this_max_over_token_heads - one_hot_sentence_labels_smoothed), axis=-1))) # The predicted distribution over the token labels (heads) should be similar to the # predicted distribution over the sentence representations. 
if self.config["type2_attention_objective_weight"] > 0: all_sentence_scores_probabilities = tf.nn.softmax( all_sentence_scores) # [B, H] self.loss += self.config["type2_attention_objective_weight"] * ( tf.reduce_sum(self.sentence_objective_weights * tf.reduce_sum( tf.square(max_over_token_heads - all_sentence_scores_probabilities), axis=-1))) # At least one token has a label corresponding to the true sentence label. if self.config["type3_attention_objective_weight"] > 0: this_max_over_token_heads = max_over_token_heads if len(self.label2id_tok) != len(self.label2id_sent): if len(self.label2id_sent) == 2: max_default_head = tf.gather(max_over_token_heads, indices=[0], axis=-1) # [B, 1] max_non_default_head = tf.reduce_max( tf.gather(max_over_token_heads, indices=list(range(1, len(self.label2id_tok))), axis=-1), axis=1, keep_dims=True) # [B, 1] this_max_over_token_heads = tf.concat( [max_default_head, max_non_default_head], axis=-1) # [B, 2] else: raise ValueError( "Unsupported attention loss for num_heads != num_sent_lables " "and num_sentence_labels != 2.") self.loss += self.config["type3_attention_objective_weight"] * ( tf.reduce_sum( self.sentence_objective_weights * tf.reduce_sum(tf.square( (this_max_over_token_heads * one_hot_sentence_labels) - one_hot_sentence_labels_smoothed), axis=-1))) # A sentence that has a default label, should only contain tokens labeled as default. if self.config["type4_attention_objective_weight"] > 0: default_head = tf.gather(self.token_probabilities, indices=[0], axis=-1) # [B, M, 1] default_head = tf.squeeze(default_head, axis=-1) # [B, M] self.loss += self.config["type4_attention_objective_weight"] * ( tf.reduce_sum( self.sentence_objective_weights * tf.cast(tf.equal(self.sentence_labels, 0.0), tf.float32) * tf.reduce_sum( tf.square(default_head - tf.ones_like(default_head)), axis=-1))) # Every sentence has at least one default label. if self.config["type5_attention_objective_weight"] > 0: default_head = tf.gather(self.token_probabilities, indices=[0], axis=-1) # [B, M, 1] max_default_head = tf.reduce_max(tf.squeeze(default_head, axis=-1), axis=-1) # [B] self.loss += self.config["type5_attention_objective_weight"] * ( tf.reduce_sum(self.sentence_objective_weights * tf.square(max_default_head - tf.ones_like(max_default_head)))) # Include a word-based language modelling loss, LMw. if self.config["lm_cost_lstm_gamma"] > 0.0: self.loss += self.config[ "lm_cost_lstm_gamma"] * self.construct_lm_cost( input_tensor_fw=lstm_outputs_fw, input_tensor_bw=lstm_outputs_bw, sentence_lengths=self.sentence_lengths, target_ids=self.word_ids, lm_cost_type="separate", name="lm_cost_lstm_separate") if self.config["lm_cost_joint_lstm_gamma"] > 0.0: self.loss += self.config[ "lm_cost_joint_lstm_gamma"] * self.construct_lm_cost( input_tensor_fw=lstm_outputs_fw, input_tensor_bw=lstm_outputs_bw, sentence_lengths=self.sentence_lengths, target_ids=self.word_ids, lm_cost_type="joint", name="lm_cost_lstm_joint") self.train_op = self.construct_optimizer( opt_strategy=self.config["opt_strategy"], loss=self.loss, learning_rate=self.learning_rate, clip=self.config["clip"]) print("Notwork built.")