def forward(self, context, question, con_lens, qu_lens, step=0):
    context_t = context.transpose(0, 1).contiguous()  # (batch, T, 2d)
    context_size = context.size()
    question_size = question.size()
    context_len = context_size[0]
    question_len = question_size[0]
    S = Variable(torch.zeros(context_size[1], context_len, question_len))  # (batch, T, J)
    if config['USE_CUDA']:
        S = S.cuda(context.get_device())
    for t in range(context_len):
        for j in range(question_len):
            c = context[t, :, :]  # (batch, 2d)
            q = question[j, :, :]  # (batch, 2d)
            cat = torch.cat([c, q, c * q], dim=-1)
            att = torch.mm(cat, self.att_w)  # (batch, 1)
            S[:, t, j] = att.squeeze(1)  # (batch,)
    S = exp_mask(S, con_lens, qu_lens)
    if config['sigmoid']:
        c2q_att_w = torch.sigmoid(
            S.view(-1, question_len).contiguous()).view(
                context_size[1], context_len, question_len).contiguous()  # (batch, T, J)
    else:
        c2q_att_w = self.softmax(
            S.view(-1, question_len).contiguous()).view(
                context_size[1], context_len, question_len).contiguous()  # (batch, T, J)
    c2q_att = torch.bmm(c2q_att_w, question.transpose(0, 1).contiguous())  # U~: (batch, T, 2d)
    value, index = torch.max(S, 2)  # value: (batch, T)
    # if config['sigmoid']:
    #     value = torch.sigmoid(value)
    # else:
    value = self.softmax(value)
    value = value.unsqueeze(1).expand(
        context_size[1], context_size[0], context_size[0])  # value: (batch, T, T)
    q2c_att = torch.bmm(value, context_t)  # H~: (batch, T, 2d)
    G = torch.cat(
        [context_t, c2q_att, context_t * c2q_att, context_t * q2c_att], dim=2)
    mask = Variable(
        get_source_mask(context_size[1], context_size[2] * 4, context_size[0],
                        con_lens))
    mask = mask.transpose(0, 1)
    if config['USE_CUDA']:
        mask = mask.cuda(context.get_device())
    G = G * mask
    # logger.histo_summary('AttentionFlowLayer/output', to_np(G), step)
    if config['gate']:
        gate = torch.matmul(G, self.gate_weight)
        gate = torch.sigmoid(gate)
        G = gate * G
    return G  # (batch, T, 8d)
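# `exp_mask` (used above) and `exp_mask_2d` (used in the output layer further
# down) are project helpers that are not defined in this section. A minimal
# sketch, assuming they add a large negative constant to positions beyond the
# true context/question lengths so that the following softmax/sigmoid drives
# them toward zero. The bodies are an assumption, not the repository's code.
import torch
from torch.autograd import Variable

VERY_NEGATIVE = -1e30


def exp_mask(S, con_lens, qu_lens):
    # S: (batch, T, J) similarity logits; con_lens/qu_lens: lists of true lengths
    batch, T, J = S.size(0), S.size(1), S.size(2)
    pad = torch.zeros(batch, T, J)
    for b in range(batch):
        if con_lens[b] < T:
            pad[b, con_lens[b]:, :] = 1
        if qu_lens[b] < J:
            pad[b, :, qu_lens[b]:] = 1
    if S.is_cuda:
        pad = pad.cuda(S.get_device())
    return S + Variable(pad) * VERY_NEGATIVE


def exp_mask_2d(logits, con_lens):
    # logits: (batch, T)
    batch, T = logits.size(0), logits.size(1)
    pad = torch.zeros(batch, T)
    for b in range(batch):
        if con_lens[b] < T:
            pad[b, con_lens[b]:] = 1
    if logits.is_cuda:
        pad = pad.cuda(logits.get_device())
    return logits + Variable(pad) * VERY_NEGATIVE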
def forward(self, h_0, G, con_lens, step=0):
    G_t = G.transpose(0, 1).contiguous()
    M, h_n = self.rnn(G_t, h_0)  # M: (T, batch, 2d)
    # logger.histo_summary('ModelingOutLayer/rnn_output', to_np(M), step)
    size = M.size()
    mask = Variable(get_source_mask(size[1], size[2], size[0], con_lens))
    if config['USE_CUDA']:
        mask = mask.cuda(M.get_device())
    M = M * mask
    M = self.dropout(M)
    return M.transpose(0, 1).contiguous()  # M: (batch, T, 2d)
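# `get_source_mask(batch, dim, max_len, lengths)` is called throughout this
# section (including the attention layer above). From the call sites it appears
# to return a (max_len, batch, dim) float mask that is 1 for valid timesteps
# and 0 for padding. A minimal sketch under that assumption, not the
# repository's actual implementation:
import torch


def get_source_mask(batch, dim, max_len, lengths):
    mask = torch.zeros(max_len, batch, dim)
    for b, l in enumerate(lengths):
        mask[:l, b, :] = 1.0
    return mask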
def forward(self, l_ctx_emb, r_ctx_emb, l_ctx_lens, r_ctx_lens):
    """
    args:
        l_ctx_emb: (B, S, word_emb)
        r_ctx_emb: (B, S, word_emb)
        l_ctx_lens: list
        r_ctx_lens: list
    :return:
    """
    batch_size = l_ctx_emb.size(0)
    h_0 = Variable(
        torch.zeros(fg_config['lstm_layers'] * 2, batch_size,
                    fg_config['hidden_size']))
    c_0 = Variable(
        torch.zeros(fg_config['lstm_layers'] * 2, batch_size,
                    fg_config['hidden_size']))
    if fg_config['USE_CUDA']:
        h_0 = h_0.cuda(fg_config['cuda_num'])
        c_0 = c_0.cuda(fg_config['cuda_num'])
    # (S, B, hidden_size*2)
    l_ctx_lstm, _ = self.l_lstm(l_ctx_emb.transpose(0, 1), (h_0, c_0))
    # (S, B, hidden_size*2)
    r_ctx_lstm, _ = self.r_lstm(r_ctx_emb.transpose(0, 1), (h_0, c_0))
    l_mask = Variable(
        get_source_mask(batch_size, fg_config['hidden_size'] * 2,
                        fg_config['ctx_window_size'], l_ctx_lens))
    r_mask = Variable(
        get_source_mask(batch_size, fg_config['hidden_size'] * 2,
                        fg_config['ctx_window_size'], r_ctx_lens))
    if fg_config['USE_CUDA']:
        l_mask = l_mask.cuda(fg_config['cuda_num'])
        r_mask = r_mask.cuda(fg_config['cuda_num'])
    l_ctx_lstm = l_ctx_lstm * l_mask
    r_ctx_lstm = r_ctx_lstm * r_mask
    l_ctx_lstm = self.dropout(l_ctx_lstm)
    r_ctx_lstm = self.dropout(r_ctx_lstm)
    # (S, B, hidden_size*2)
    return l_ctx_lstm, r_ctx_lstm
def forward(self, input, h_0, seq_length, step=0, name='Q'):
    input_size = input.size()
    word_slice = input[:, :, 0]
    word_emb = self.word_embedding(word_slice)  # (batch, seq_length, word_emb)
    trans_emb = nn.functional.relu(self.linear(word_emb))
    out = self.dropout(trans_emb).transpose(0, 1)  # (seq_length, batch, 2*d)
    mask = Variable(
        get_source_mask(input_size[0], config['hidden_size'] * 2,
                        input_size[1], seq_length))
    if config['USE_CUDA']:
        mask = mask.cuda(input.get_device())
    out = out * mask
    return out
def forward(self, M, G, h_0, con_lens):
    """
    :param M: (batch, T, 2d)
    :param G: (batch, T, 8d)
    :param h_0: (num_layers * num_directions, batch, hidden_size)
    :return: (batch, T)
    """
    M_t = M.transpose(0, 1)
    M_2, h_n = self.rnn(M_t, h_0)  # M_2: (T, batch, 2d)
    size = M_2.size()
    mask = Variable(get_source_mask(size[1], size[2], size[0], con_lens))
    if config['USE_CUDA']:
        mask = mask.cuda(M.get_device())
    M_2 = M_2 * mask
    M_2 = self.dropout(M_2)
    cat = torch.cat([G, M_2.transpose(0, 1)], 2)  # cat: (batch, T, 10d)
    logits = self.W(cat)  # (batch, T, 1)
    logits = self.dropout(logits)
    logits = logits.squeeze(2)
    logits = exp_mask_2d(logits, con_lens)
    probs = self.logSoftmax(logits)  # (batch, T)
    return probs
def crf_eval_one_sen(config, encoder, bidencoder, decoder, this_batch):
    this_batch_num = len(this_batch[3])
    this_batch_max_seq = max(this_batch[3])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(
        torch.zeros(config['decoder_layers'] * 2, this_batch_num,
                    config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    data = Variable(this_batch[0], volatile=True)
    # target = Variable(this_batch[1], volatile=True)
    length = Variable(torch.LongTensor(this_batch[3]), volatile=True)
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2),
                   volatile=True)  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        # target = target.cuda(config['cuda_num'])
        length = length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])
    encoder_outputs = encoder(0, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    lst_decode = decoder(encoder_outputs, crf_mask)
    return lst_decode
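# `get_target_mask(batch, max_len, lengths)` builds the byte mask handed to the
# CRF above (and in the training/eval functions later in this section). A
# minimal sketch, assuming a (max_len, batch) layout with 1 for real tokens and
# 0 for padding, to match the time-major emissions passed alongside it; the
# actual helper may differ.
import torch


def get_target_mask(batch, max_len, lengths):
    mask = torch.zeros(max_len, batch)
    for b, l in enumerate(lengths):
        mask[:l, b] = 1
    return mask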
def train_iteration(logger, config, my_arg, step, encoder, bidencoder, decoder,
                    encoder_optimizer, bidencoder_optimizer, decoder_optimizer,
                    this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    bidencoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(
        torch.zeros(config['decoder_layers'] * 2, this_batch_num,
                    config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print 'seq_length', max(this_batch[3]), 'label_length', this_batch_max_target  # (output_size, B, 1)
    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2))  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])
    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])
    seq_label_prob = Variable(
        torch.zeros(this_batch_max_target, this_batch_num,
                    config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])
    rate = schedule_samp_rate(step)
    # rate = 0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden,
                                           encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label)
        # not teacher-forcing
        # word_input = label
        # teacher-forcing
        if my_arg == 0:
            word_input = target[:, time_step]
        else:
            # value, label = label_logits.data.topk(1)
            # decoder_out_label.append(label)
            # word_input = Variable(label)  # Chosen word is next input
            # if config['USE_CUDA']:
            #     word_input = word_input.cuda(config['cuda_num'])
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                # decoder_out_label.append(label)
                word_input = Variable(label)  # Chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])
    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print 'loss: ', loss.data[0]
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_before_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    decoder_optimizer.step()
    encoder_optimizer.step()
    bidencoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_after_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(b_before_step, b_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
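# `schedule_samp_rate(step)` and `random_pick(values, probs)` drive the
# scheduled-sampling branch in the training loops above and below: with
# probability `rate` the gold label is fed back, otherwise the model's own
# prediction. The bodies below are a minimal sketch matching the call sites,
# assuming an inverse-sigmoid decay of the teacher-forcing rate; they are not
# the repository's actual implementation.
import math
import random


def schedule_samp_rate(step, k=1000.0):
    # inverse-sigmoid decay from ~1.0 toward 0.0 as training proceeds
    return k / (k + math.exp(step / k))


def random_pick(values, probs):
    # return one element of `values`, chosen with the given probabilities
    r = random.random()
    acc = 0.0
    for v, p in zip(values, probs):
        acc += p
        if r <= acc:
            return v
    return values[-1]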
def train_iteration(step, encoder, decoder, encoder_optimizer,
                    decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_seq = max(this_batch[2])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2))  # encoder gru initial hidden state
    print 'seq_length', max(this_batch[3]), 'label_length', this_batch_max_seq  # (output_size, B, 1)
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['multi_cuda'][0])
        word_input = word_input.cuda(config['multi_cuda'][0])
        data = data.cuda(config['multi_cuda'][0])
        target = target.cuda(config['multi_cuda'][0])
        length = length.cuda(config['multi_cuda'][0])
        h_0 = h_0.cuda(config['multi_cuda'][0])
    step = Variable(torch.ones(len(config['multi_cuda']), 1) * step)
    h_0 = h_0.transpose(0, 1)
    batch_seq_len = Variable(torch.LongTensor(this_batch[3])).unsqueeze(1)
    if config['use_multi']:
        step = step.cuda(config['multi_cuda'][0])
        batch_seq_len = batch_seq_len.cuda(config['multi_cuda'][0])
    encoder_outputs = encoder(step, data, h_0, batch_seq_len)
    encoder_outputs = encoder_outputs.transpose(0, 1)
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['multi_cuda'][0])
    encoder_outputs = encoder_outputs * source_mask
    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # target = Variable(torch.LongTensor([[1,0,1,0,1,0,1,0],[0,1,0,1,0,1,0,1]]))  # (batch, max_label_length)
    # length = Variable(torch.LongTensor([5,7]))
    # decoder.cuda(config['cuda_num'])

    # train
    decoder_out_label = []
    encoder_outputs = encoder_outputs.transpose(0, 1)
    seq_label_prob = Variable(
        torch.zeros(this_batch_max_seq, this_batch_num,
                    config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['multi_cuda'][0])
    for time_step in range(this_batch_max_seq):
        last_hidden = last_hidden.transpose(0, 1)
        label_prob, cur_hidden, attn_weights = decoder(step, word_input,
                                                       last_hidden,
                                                       encoder_outputs)
        cur_hidden = cur_hidden.transpose(0, 1)
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_prob
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label.data)
        # not teacher-forcing
        # word_input = label
        # teacher-forcing
        word_input = target[:, time_step]
    decoder_prob = Variable(
        torch.FloatTensor([[[0, 1], [1, 0]], [[0, 1], [1, 0]],
                           [[1, 0], [0, 1]], [[1, 0], [0, 1]],
                           [[0, 1], [1, 0]], [[0, 1], [1, 0]],
                           [[1, 0], [0, 1]], [[1, 0], [0, 1]]]))
    if config['USE_CUDA']:
        decoder_prob = decoder_prob.cuda(config['multi_cuda'][0])
    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print 'loss: ', loss.data[0]
    # logger.scalar_summary('loss', loss.data[0], step.data[0, 0])
    loss.backward()
    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     tag = tag.replace('module', 'encoder')
    #     logger.histo_summary(tag, to_np(value), step.data[0, 0])
    #     logger.histo_summary(tag + '/grad', to_np(value.grad), step.data[0, 0])
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     tag = tag.replace('module', 'decoder')
    #     logger.histo_summary(tag, to_np(value), step.data[0, 0])
    #     logger.histo_summary(tag + '/grad', to_np(value.grad), step.data[0, 0])
    decoder_optimizer.step()
    encoder_optimizer.step()
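# `masked_cross_entropy(logits, target, length)` is the training criterion used
# throughout this section. A minimal sketch of the usual formulation: per-token
# cross entropy, padding positions zeroed out, normalised by the number of real
# tokens. It assumes `logits` are unnormalised scores of shape
# (batch, max_len, num_classes) and `length` is a LongTensor Variable of true
# lengths; this is an assumption about the helper, not the repository's code.
import torch
import torch.nn.functional as F
from torch.autograd import Variable


def masked_cross_entropy(logits, target, length):
    # logits: (batch, max_len, num_classes), target: (batch, max_len)
    batch, max_len, num_classes = logits.size()
    log_probs = F.log_softmax(logits.view(-1, num_classes), dim=-1)
    target_flat = target.contiguous().view(-1, 1)
    losses = -log_probs.gather(1, target_flat).view(batch, max_len)
    # (batch, max_len) mask: 1 for real tokens, 0 for padding
    positions = torch.arange(0, max_len).long().unsqueeze(0).expand(batch, max_len)
    if logits.is_cuda:
        positions = positions.cuda(logits.get_device())
    positions = Variable(positions)
    mask = (positions < length.unsqueeze(1).expand_as(positions)).float()
    losses = losses * mask
    return losses.sum() / mask.sum()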
def forward(self, input, h_0, seq_length, word_embedding, gemb, char_emb,
            char_conv, highway, step=0, name='Q'):
    input_size = input.size()
    outs = []
    word_slice = input[:, :, 0]
    word_emb = word_embedding(word_slice)  # (batch, seq_length, word_emb)
    outs.append(word_emb)
    curr_end = 1
    if self._use_gaz:
        gazStart = curr_end
        gazEnd = gazStart + self.gazsize
        if config['USE_CUDA']:
            a = input[:, :, gazStart:gazEnd]
            b = a.type(torch.cuda.FloatTensor)
            c = b.view(-1, self.gazsize)
            d = gemb(c)
            e = d.view(input_size[0], input_size[1], -1)
            f = e.contiguous()
            outs.append(f)
            # outs.append(self.gemb(input[:, :, gazStart:gazEnd].type(torch.cuda.FloatTensor).view(-1, self.gazsize)).view(input_size[0], input_size[1], -1).contiguous())
        else:
            outs.append(
                gemb(input[:, :, gazStart:gazEnd].type(
                    torch.FloatTensor).view(-1, self.gazsize)).view(
                        input_size[0], input_size[1], -1).contiguous())
        curr_end = gazEnd
    if self._use_char_conv:
        chars = input[:, :, curr_end:curr_end + self.char_len].contiguous()
        chars_mask = input[:, :, (curr_end + self.char_len):(curr_end + 2 * self.char_len)]
        if config['USE_CUDA']:
            chars_mask = chars_mask.type(torch.cuda.FloatTensor)
        else:
            chars_mask = chars_mask.type(torch.FloatTensor)
        chars_size = chars.size()
        char_view = chars.view(-1, self.char_len)  # (B*seq_length, max_char_len)
        char_emb_out = char_emb(char_view)  # (B*seq_length, max_char_len, char_emb)
        chars_mask = chars_mask.view(-1, self.char_len)  # (B*seq_length, max_char_len)
        char_emb_out = char_emb_out * chars_mask.unsqueeze(2).expand_as(char_emb_out)
        # char_shape = char_emb_out.shape
        # char_emb_out = char_emb_out.reshape((char_shape[0] * char_shape[1], char_shape[2], 1, char_shape[3]))
        # char_conv_out = self.char_conv.apply(char_emb_out)
        # char_conv_out = self.conv_active.apply(char_conv_out)
        # char_conv_out = char_conv_out.reshape(char_shape)
        # char_conv_out = char_conv_out * chars_mask.dimshuffle(0, 1, 2, 'x')
        # char_conv_out = tensor.max(char_conv_out, axis=2)
        char_emb_out = char_emb_out.transpose(1, 2)  # (B*seq_length, char_emb, char_len)
        char_conv_out = char_conv(char_emb_out)  # (B*seq_length, out_channel, char_len)
        char_conv_out = self.conv_active(char_conv_out)
        char_conv_out = char_conv_out.transpose(1, 2)  # (B*seq_length, char_len, out_channel)
        char_conv_out = char_conv_out * chars_mask.unsqueeze(2).expand_as(char_conv_out)  # (B*seq_length, char_len, out_channel)
        char_conv_out, _ = torch.max(char_conv_out, 1)
        char_conv_out = char_conv_out.view(chars_size[0], chars_size[1], -1)  # (B, seq_length, out_channel)
        outs.append(char_conv_out)
    output = torch.cat(outs, dim=-1)
    mask = Variable(
        get_source_mask(input_size[0], self.out_dim, input_size[1], seq_length))
    if config['USE_CUDA']:
        mask = mask.cuda(input.get_device())
    mask = mask.transpose(0, 1)
    embedded = output * mask  # embedded: (batch, seq_length, emb_size)
    embedded = embedded.view(-1, self.out_dim).contiguous()
    embedded = highway(embedded)
    embedded = embedded.view(input_size[0], input_size[1], -1).contiguous()
    embedded = embedded * mask
    embedded = embedded.transpose(0, 1).contiguous()  # embedded: (seq_length, batch, emb_size)
    embedded = self.question_trans(embedded)
    # rnn_output, h_n = self.encode_rnn(embedded, h_0)
    rnn_mask = Variable(
        get_source_mask(input_size[0], self.out_dim * 2, input_size[1],
                        seq_length))
    if config['USE_CUDA']:
        rnn_mask = rnn_mask.cuda(input.get_device())
    rnn_output = embedded * rnn_mask
    if config['use_dropout']:
        rnn_output = self.dropout(rnn_output)
    # logger.histo_summary('EmbeddingLayer/output', to_np(rnn_output), step)
    return rnn_output  # (seq_len, batch, hidden_size(100+100=d) * num_directions(2))
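# The `highway` module applied to the flattened embeddings above is not shown
# in this section. Below is a minimal single-layer highway network in the
# standard formulation y = t * H(x) + (1 - t) * x, which preserves the input
# dimension as the call site requires; the class name and details are an
# assumption about the project's module, not its actual code.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    def __init__(self, dim):
        super(Highway, self).__init__()
        self.transform = nn.Linear(dim, dim)  # H(x)
        self.gate = nn.Linear(dim, dim)       # t(x)

    def forward(self, x):
        # x: (N, dim) -> (N, dim)
        h = F.relu(self.transform(x))
        t = torch.sigmoid(self.gate(x))
        return t * h + (1 - t) * x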
def train_iteration(logger, step, embedding_layer, q_word_embedding, q_emb_layer,
                    att_layer, model_layer, ner_hw_layer, ner_out_layer, crf,
                    emb_opt, q_emb_opt, att_opt, model_opt, ner_hw_opt,
                    ner_out_opt, crf_opt, this_batch):
    if not config['freeze']:
        emb_opt.zero_grad()
        att_opt.zero_grad()
        model_opt.zero_grad()
        if config['question_alone']:
            q_emb_opt.zero_grad()
    ner_out_opt.zero_grad()
    crf_opt.zero_grad()
    ner_hw_opt.zero_grad()
    d = config['hidden_size']
    this_batch_num = len(this_batch[2])
    question = Variable(this_batch[4])
    question_lengths = this_batch[5]
    context = Variable(this_batch[0])  # (batch, T, 51)
    context_lengths = this_batch[3]  # list
    target = Variable(this_batch[1])  # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_out_h_0 = Variable(
        torch.zeros(2 * model_layer.num_layers, this_batch_num, d))
    con_lens_var = Variable(torch.LongTensor(context_lengths))
    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])
    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, name='C')
    if config['question_alone']:
        q_emb = q_emb_layer(question, emb_h_0, question_lengths, step, name='Q')
    else:
        q_emb = embedding_layer(question, emb_h_0, question_lengths, step,
                                q_word_embedding, 'Q')
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths, step)
    M = model_layer(model_out_h_0, G, context_lengths, step)
    if config['not_pretrain']:
        M_trans = M
        G_trans = G
    else:
        M_trans, G_trans = ner_hw_layer(M, G)
    prob = ner_out_layer(M_trans, G_trans, context_lengths)
    prob_size = prob.size()
    mask = Variable(
        get_source_mask(prob_size[0], prob_size[2], prob_size[1],
                        context_lengths))
    mask = mask.transpose(0, 1)
    if config['USE_CUDA']:
        mask = mask.cuda(context.get_device())
    prob = prob * mask
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(context_lengths), context_lengths))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
        crf_mask = crf_mask.cuda(config['cuda_num'])
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    loss = crf.neg_log_likelihood(
        prob.transpose(0, 1).contiguous(), target.transpose(0, 1), crf_mask,
        context_lengths)
    # loss = masked_cross_entropy(prob, target, con_lens_var)
    if step % 100 == 0:
        print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    # e_before_step = [(tag, to_np(value)) for tag, value in embedding_layer.named_parameters()]
    # a_before_step = [(tag, to_np(value)) for tag, value in att_layer.named_parameters()]
    # m_before_step = [(tag, to_np(value)) for tag, value in model_layer.named_parameters()]
    # h_before_step = [(tag, to_np(value)) for tag, value in ner_hw_layer.named_parameters()]
    # n_before_step = [(tag, to_np(value)) for tag, value in ner_out_layer.named_parameters()]
    # c_before_step = [(tag, to_np(value)) for tag, value in crf.named_parameters()]
    # q_before_step = [(tag, to_np(value)) for tag, value in q_emb_layer.named_parameters()]
    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_layer.parameters(), config['clip_norm'])
    clip_grad_norm(ner_hw_layer.parameters(), config['clip_norm'])
    clip_grad_norm(ner_out_layer.parameters(), config['clip_norm'])
    clip_grad_norm(crf.parameters(), config['clip_norm'])
    if config['question_alone']:
        clip_grad_norm(q_emb_layer.parameters(), config['clip_norm'])
    # for tag, value in embedding_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in att_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in model_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in ner_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    #
    # for tag, value in crf.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    if not config['freeze']:
        emb_opt.step()
        att_opt.step()
        model_opt.step()
    ner_hw_opt.step()
    ner_out_opt.step()
    crf_opt.step()
    if config['question_alone']:
        q_emb_opt.step()
    grad_ratio_lst = []
def train_iteration(logger, config, my_arg, step, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print 'seq_length', max(this_batch[3]), 'label_length', this_batch_max_target  # (output_size, B, 1)
    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2))  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # target = Variable(torch.LongTensor([[1,0,1,0,1,0,1,0],[0,1,0,1,0,1,0,1]]))  # (batch, max_label_length)
    # length = Variable(torch.LongTensor([5,7]))
    # decoder.cuda(config['cuda_num'])

    # train
    decoder_out_label = []
    seq_label_prob = Variable(
        torch.zeros(this_batch_max_target, this_batch_num,
                    config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])
    rate = schedule_samp_rate(step)
    # rate = 0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden,
                                           encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label)
        # not teacher-forcing
        # word_input = label
        # teacher-forcing
        if my_arg == 0:
            word_input = target[:, time_step]
        else:
            # value, label = label_logits.data.topk(1)
            # decoder_out_label.append(label)
            # word_input = Variable(label)  # Chosen word is next input
            # if config['USE_CUDA']:
            #     word_input = word_input.cuda(config['cuda_num'])
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                # decoder_out_label.append(label)
                word_input = Variable(label)  # Chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])
    # decoder_prob = Variable(torch.FloatTensor([[[0,1],[1,0]],[[0,1],[1,0]],[[1,0],[0,1]],[[1,0],[0,1]], [[0,1],[1,0]],[[0,1],[1,0]],[[1,0],[0,1]],[[1,0],[0,1]]]))
    # if config['USE_CUDA']:
    #     decoder_prob = decoder_prob.cuda(config['cuda_num'])
    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print 'loss: ', loss.data[0]
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    # e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    # d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    decoder_optimizer.step()
    encoder_optimizer.step()
def forward(self, step, input, h_0, seq_length):
    self.encoder_gru.flatten_parameters()
    input_size = input.size()
    outs = []
    for idx in range(self.embeddings):
        # outs.append(emb.apply(input_[:,:,idx:(idx+1)]))
        emb = getattr(self, 'embedding' + str(idx))
        word_slice = input[:, :, idx]
        word_emb = emb(word_slice)
        outs.append(word_emb)
    curr_end = self.embeddings
    if self._use_gaz:
        gazStart = curr_end
        gazEnd = gazStart + self.gazsize
        if config['USE_CUDA']:
            outs.append(
                self.gemb(input[:, :, gazStart:gazEnd].type(
                    torch.cuda.FloatTensor).view(-1, self.gazsize)).view(
                        input_size[0], input_size[1], -1).contiguous())
        else:
            outs.append(
                self.gemb(input[:, :, gazStart:gazEnd].type(
                    torch.FloatTensor).view(-1, self.gazsize)).view(
                        input_size[0], input_size[1], -1).contiguous())
        curr_end = gazEnd
    if self._use_char_conv:
        chars = input[:, :, curr_end:curr_end + self.char_len].contiguous()
        chars_mask = input[:, :, (curr_end + self.char_len):(curr_end + 2 * self.char_len)]
        if config['USE_CUDA']:
            chars_mask = chars_mask.type(torch.cuda.FloatTensor)
        else:
            chars_mask = chars_mask.type(torch.FloatTensor)
        chars_size = chars.size()
        char_view = chars.view(-1, self.char_len)
        char_emb_out = self.char_emb(char_view)
        # char_shape = char_emb_out.shape
        # char_emb_out = char_emb_out.reshape((char_shape[0] * char_shape[1], char_shape[2], 1, char_shape[3]))
        # char_conv_out = self.char_conv.apply(char_emb_out)
        # char_conv_out = self.conv_active.apply(char_conv_out)
        # char_conv_out = char_conv_out.reshape(char_shape)
        # char_conv_out = char_conv_out * chars_mask.dimshuffle(0, 1, 2, 'x')
        # char_conv_out = tensor.max(char_conv_out, axis=2)
        char_emb_out = char_emb_out.transpose(1, 2)
        char_conv_out = self.char_conv(char_emb_out)
        char_conv_out = self.conv_active(char_conv_out)
        char_conv_out = char_conv_out.transpose(1, 2)
        chars_mask = chars_mask.view(-1, self.char_len)
        char_conv_out = char_conv_out * chars_mask.unsqueeze(2).expand_as(char_conv_out)
        char_conv_out, _ = torch.max(char_conv_out, 1)
        char_conv_out = char_conv_out.view(chars_size[0], chars_size[1], -1)
        outs.append(char_conv_out)
    output = torch.cat(outs, dim=-1)
    mask = Variable(
        get_source_mask(input_size[0], self.out_dim, input_size[1], seq_length))
    if config['USE_CUDA']:
        mask = mask.cuda(config['cuda_num'])
    if config['use_multi']:
        mask = mask.cuda(input.get_device())
    mask = mask.transpose(0, 1)
    embedded = output * mask
    embedded = self.dropout(embedded)
    # logger.histo_summary('embedded', to_np(embedded), step)
    # embedded = self.embedding0(input)
    # embedded: (batch, seq_length, emb_size)
    output = embedded.transpose(1, 2)  # (batch, emb_size, seq_length)
    for i in range(3):
        conv = getattr(self, 'conv' + str(i))
        output = self.relu(conv(output))  # (batch, encoder_filter_num, seq_length)
        output = self.dropout(output)
    output = output.transpose(1, 2)
    output = output.transpose(0, 1).contiguous()  # (seq_length, batch, encoder_filter_num)
    # return self.dropout(output)
    gru_output, h_n = self.encoder_gru(output, h_0)
    return self.dropout(gru_output)  # (seq_len, batch, hidden_size * num_directions)
def evaluate_one(step, embedding_layer, q_word_embedding, q_emb_layer, att_layer,
                 model_layer, ner_hw_layer, ner_out_layer, crf, this_batch,
                 summary_emb=False, all_emb=None, all_metadata=None):
    d = config['hidden_size']
    this_batch_num = len(this_batch[2])
    question = Variable(this_batch[4])
    question_lengths = this_batch[5]
    context = Variable(this_batch[0], volatile=True)  # (batch, T, 51)
    context_lengths = this_batch[3]  # list
    target = Variable(this_batch[1], volatile=True)  # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d), volatile=True)
    model_out_h_0 = Variable(
        torch.zeros(2 * model_layer.num_layers, this_batch_num, d),
        volatile=True)
    con_lens_var = Variable(torch.LongTensor(context_lengths), volatile=True)
    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])
    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, name='C')
    if config['question_alone']:
        q_emb = q_emb_layer(question, emb_h_0, question_lengths, step, name='Q')
    else:
        q_emb = embedding_layer(question, emb_h_0, question_lengths, step,
                                q_word_embedding, 'Q')
    if summary_emb:
        for i in range(this_batch_num):
            sentence = ''
            metadata = []
            for tokenId, token in enumerate(context[i]):
                if tokenId >= context_lengths[i]:
                    break
                word = config['WordId'].getWord(token[0].data.cpu().numpy()[0])
                metadata.append(word)
                if word != '%PADDING%':
                    sentence += ' ' + word
            if step == 0 and i == 0:
                all_emb = c_emb.data.cpu()[:context_lengths[i], i, :]
            else:
                all_emb = torch.cat(
                    [all_emb, c_emb.data.cpu()[:context_lengths[i], i, :]], 0)
            metadata = ['_'.join([word, sentence]) for word in metadata]
            all_metadata.extend(metadata)
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths)
    M = model_layer(model_out_h_0, G, context_lengths, step)
    if config['not_pretrain']:
        M_trans = M
        G_trans = G
    else:
        M_trans, G_trans = ner_hw_layer(M, G)
    # M_trans, G_trans = ner_hw_layer(M, G)
    prob = ner_out_layer(M_trans, G_trans, context_lengths)
    # prob = ner_out_layer(M, G, context_lengths)
    prob_size = prob.size()
    mask = Variable(
        get_source_mask(prob_size[0], prob_size[2], prob_size[1],
                        context_lengths))
    mask = mask.transpose(0, 1)
    if config['USE_CUDA']:
        mask = mask.cuda(context.get_device())
    prob = prob * mask
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(context_lengths), context_lengths))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
        crf_mask = crf_mask.cuda(config['cuda_num'])
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    lst_decode = crf(prob.transpose(0, 1).contiguous(), crf_mask, context_lengths)
    # value, rec_label = torch.max(prob.data, 2)
    if summary_emb:
        return lst_decode, all_emb, all_metadata, q_emb[:, 0, :].data.cpu()
    else:
        return lst_decode
def eva_one_sentence_vib(encoder, decoder, this_batch):
    this_batch_num = len(this_batch[3])
    this_batch_max_seq = max(this_batch[3])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    data = Variable(this_batch[0], volatile=True)
    target = Variable(this_batch[1], volatile=True)
    length = Variable(torch.LongTensor(this_batch[3]), volatile=True)
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2),
                   volatile=True)  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        length = length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
    encoder_outputs = encoder(0, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['encoder_outputs_size']))
    # last_hidden = Variable(torch.randn(config['decoder_layers'], config['batch_size'], config['hidden_size']))
    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder.load_state_dict(torch.load('net_params.pkl'))
    # optimizer = torch.optim.Adadelta(decoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # if config['USE_CUDA']:
    #     encoder_outputs = encoder_outputs.cuda(config['cuda_num'])
    #     last_hidden = last_hidden.cuda(config['cuda_num'])
    #     word_input = word_input.cuda(config['cuda_num'])
    #     decoder.cuda(config['cuda_num'])

    # evaluate
    beam = [{
        'paths': [[], []],
        'prob': Variable(torch.zeros(this_batch_num, 1)),
        'hidden': Variable(
            torch.randn(config['decoder_layers'], this_batch_num,
                        config['hidden_size']))
    }, {}]  # beam_size * batch_size * ([path], hidden)
    beam = []
    tag_size = 18
    for beam_i in range(tag_size):
        prob_init = Variable(torch.zeros(this_batch_num, 1))
        hidden_init = Variable(
            torch.zeros(config['decoder_layers'], this_batch_num,
                        config['hidden_size']))
        if config['USE_CUDA']:
            prob_init = prob_init.cuda(config['cuda_num'])
            hidden_init = hidden_init.cuda(config['cuda_num'])
        one_beam = {'paths': [], 'prob': prob_init, 'hidden': hidden_init}
        for batch_i in range(this_batch_num):
            one_beam['paths'].append([0])
        beam.append(one_beam)
    # beam = [{'paths': [], 'tails': range(output_size)}, {'paths': [], 'tails': range(output_size)}]
    # print beam
    for time_step in range(this_batch_max_seq * 3):
        next_prob = []
        cur_hidden_lst = []
        for i, beam_i in enumerate(beam):
            word_input = Variable(torch.LongTensor(this_batch_num, 1).zero_())
            for batch_i in range(len(beam_i['paths'])):
                word_input[batch_i, 0] = beam_i['paths'][batch_i][-1]
            last_hidden = beam_i['hidden']
            if config['USE_CUDA']:
                word_input = word_input.cuda(config['cuda_num'])
                last_hidden = last_hidden.cuda(config['cuda_num'])
            # word_input: (batch, 1)  last_hidden: (layers * directions, batch, hidden)  encoder_outputs: (S, B, hidden)
            # label_prob: (B, output_size)  cur_hidden: (num_layers * num_directions, B, hidden)  att_weights: (B, 1, S)
            label_prob, cur_hidden, attn_weights = decoder(
                0, word_input, last_hidden, encoder_outputs)
            cur_hidden_lst.append(cur_hidden)
            log_label_prob = F.log_softmax(label_prob)
            next_prob.append(beam_i['prob'].expand_as(log_label_prob) +
                             log_label_prob)  # (batch_size, output_size)
        # cat = torch.cat(next_prob, 1)  # (batch, outputs_size*beam_size)
        batch_best_indices = []
        for batch_i in range(this_batch_num):
            cat = [prob[batch_i, :].unsqueeze(0) for prob in next_prob]
            cat = torch.cat(cat, 0)
            values, indices = cat.topk(1, 0)  # indices: (1, tag_size)
            batch_best_indices.append(indices.data)
        new_beam = []
        for beam_i in range(tag_size):
            prob_init = Variable(torch.zeros(this_batch_num, 1))
            hidden_init = Variable(
                torch.randn(config['decoder_layers'], this_batch_num,
                            config['hidden_size']))
            if config['USE_CUDA']:
                prob_init = prob_init.cuda(config['cuda_num'])
                hidden_init = hidden_init.cuda(config['cuda_num'])
            one_beam = {'paths': [], 'prob': prob_init, 'hidden': hidden_init}
            new_beam.append(one_beam)
        for i_batch in range(this_batch_num):
            for i_beam in range(tag_size):
                this_beam_num = batch_best_indices[i_batch][0, i_beam]
                a = beam[this_beam_num]['paths'][i_batch][:]
                a.append(i_beam)
                new_beam[i_beam]['paths'].append(a)
                new_beam[i_beam]['hidden'] = cur_hidden_lst[this_beam_num]
                new_beam[i_beam]['prob'][i_batch, 0] = next_prob[this_beam_num][i_batch, i_beam]
        beam = new_beam
        batch_best_path = []
        for i_batch in range(this_batch_num):
            this_batch_best_path = beam[0]['paths'][i_batch]
            this_batch_best_prob = beam[0]['prob'][i_batch, 0]
            for i_beam in range(tag_size):
                if beam[i_beam]['prob'][i_batch, 0] > this_batch_best_prob:
                    this_batch_best_path = beam[i_beam]['paths'][i_batch]
                    this_batch_best_prob = beam[i_beam]['prob'][i_batch, 0]
            batch_best_path.append(this_batch_best_path)
        top_path = batch_best_path[0]
        if top_path.count(config['X']) == this_batch_max_seq and top_path[-1] == config['EOS_token']:
            break
        if top_path.count(config['X']) > this_batch_max_seq:
            top_path[-1] = config['EOS_token']
            break
    # print beam[0]['paths']
    # (output_size, B, 1)
    return top_path
def train_iteration(logger, config, my_arg, step, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'] * 2, this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print 'seq_length', max(this_batch[3]), 'label_length', this_batch_max_target  # (output_size, B, 1)
    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] / 2))  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    seq_label_prob = decoder(last_hidden, encoder_outputs, this_batch[3])
    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print 'loss: ', loss.data[0]
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    decoder_optimizer.step()
    encoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)