def forward(self, query, key, value, mask=None):
    """Run multi-head attention over ``query``, ``key`` and ``value``.

    Shapes are taken from the original inline comments:

    :param query: (batch, num_query, d_embedding)
    :param key: (batch, num_key, d_embedding)
    :param value: (batch, num_value, d_embedding)
    :param mask: optional padding mask; an extra axis is inserted at dim 1
        so the same mask is applied to every head.
    :return: ``self.linears[-1]`` applied to the re-joined heads.

    Side effect: the second value returned by ``attention`` is stored on
    ``self.attn``.
    """
    if mask is not None:
        # Same padding mask applied to all h heads.
        mask = mask.unsqueeze(1)
    batch_size = query.size(0)

    # Linear projection for query, key and value from
    # (batch, num_word, d_embedding) to (batch, num_head, num_word, d_k).
    query = self.linears[0](query).view(
        batch_size, -1, self.head, self.d_k).transpose(1, 2)
    key = self.linears[1](key).view(
        batch_size, -1, self.head, self.d_k).transpose(1, 2)
    value = self.linears[2](value).view(
        batch_size, -1, self.head, self.d_k).transpose(1, 2)
    # NOTE: the per-call debug print of ``query.shape`` was removed; it
    # wrote to stdout on every forward pass.

    # Scaled dot-product attention for each batch (batch, heads, num_word, d_k).
    x, self.attn = attention(query, key, value, mask=mask,
                             dropout=self.dropout)

    # "Concatenate" heads: (batch, heads, num_query, d_k)
    # => (batch, num_query, heads * d_k), then apply the final linear.
    x = x.transpose(1, 2).contiguous().view(
        batch_size, -1, self.head * self.d_k)
    return self.linears[-1](x)
def forward(self, query, key, value, mask=None):
    """Implements Figure 2 (multi-head attention).

    @param query (tensor(float)): a tensor of size
        (batch_size, query_length, embed_size); here embed_size == d_model
    @param key (tensor(float)): a tensor of size
        (batch_size, key_length, embed_size)
    @param value (tensor(float)): a tensor of size
        (batch_size, value_length, embed_size)
    @param mask (tensor(int)): a tensor of size
        (batch_size, 1, sentence_length); the original author was not sure
        dim 1 is always of that size
    @returns the output of the last linear layer applied to the merged heads
    """
    if mask is not None:
        # One extra axis so the same mask broadcasts over all h heads.
        mask = mask.unsqueeze(1)
    nbatches = query.size(0)

    # 1) Project each input and split it into heads:
    #    d_model => h x d_k, giving (batch, h, sent_len, d_k).
    projected = []
    for layer, tensor in zip(self.linears, (query, key, value)):
        per_head = layer(tensor).view(nbatches, -1, self.h, self.d_k)
        projected.append(per_head.transpose(1, 2))
    query, key, value = projected

    # 2) Attention over all projected vectors in one batched call.
    x, self.attn = attention(query, key, value, mask=mask,
                             dropout=self.dropout)

    # 3) "Concat" the heads with a view, then apply the final linear.
    merged = x.transpose(1, 2).contiguous()
    merged = merged.view(nbatches, -1, self.h * self.d_k)
    return self.linears[-1](merged)
def forward(self, query, key, value, mask=None):
    """Multi-head attention with separate query/key/value projections.

    key and value must have the same size.

    :param query: input to ``self.query_linear``; assumed to project to
        (batch_size, q_len, h * d_k) -- TODO confirm (the original
        docstring listed only [batch_size, q_len])
    :param key: input to ``self.key_linear``; same assumption with k_len
    :param value: input to ``self.value_linear``; same assumption with k_len
    :param mask: [batch_size, q_len, k_len], or None
    :return: ``self.proj_linear`` applied to the re-joined heads

    Side effect: the second value returned by ``attention`` is stored on
    ``self.attn``.
    """
    batch_size = query.size(0)

    def split_heads(tensor):
        # (batch, len, h * d_k) -> (batch * h, len, d_k).
        # The transpose keeps each head's feature slice together; a plain
        # view to (batch * h, -1, d_k) would interleave positions and heads.
        return (tensor.view(batch_size, -1, self.h, self.d_k)
                .transpose(1, 2)
                .contiguous()
                .view(batch_size * self.h, -1, self.d_k))

    query = split_heads(self.query_linear(query))
    key = split_heads(self.key_linear(key))
    value = split_heads(self.value_linear(value))

    if mask is not None:
        # (batch, q_len, k_len) -> (batch * h, q_len, k_len): every head of
        # a sample gets that sample's mask, in the same (batch, head) order
        # produced by split_heads. ``mask.expand(self.h, -1, -1)`` only
        # worked when batch_size == 1.
        mask = (mask.unsqueeze(1)
                .expand(-1, self.h, -1, -1)
                .contiguous()
                .view(batch_size * self.h, mask.size(-2), mask.size(-1)))

    # context_vec: [batch_size * h, q_len, d_k]
    context_vec, self.attn = attention(query, key, value, mask, self.dropout)

    # Undo split_heads: (batch * h, q_len, d_k) -> (batch, q_len, h * d_k).
    # The original used ``self.double() * self.h`` here, which multiplies
    # the module itself by an int and raises; ``self.d_k`` was intended.
    context_vec = (context_vec.contiguous()
                   .view(batch_size, self.h, -1, self.d_k)
                   .transpose(1, 2)
                   .contiguous()
                   .view(batch_size, -1, self.d_k * self.h))
    return self.proj_linear(context_vec)
def forward(self, query, key, value, mask=None):
    """Apply multi-head attention and the output projection.

    Per the original comments, query, key and value all have size
    (batch_size, max_len, d_model).

    Side effect: the second value returned by ``attention`` is stored on
    ``self.attn``.
    """
    if mask is not None:
        mask = mask.unsqueeze(1)
    batch_size = query.size(0)

    def split_heads(layer, tensor):
        # Linear projection followed by a reshape to
        # (batch_size, self.h, max_len, self.d_k).
        projected = layer(tensor)
        projected = projected.view(batch_size, -1, self.h, self.d_k)
        return projected.transpose(1, 2)

    query, key, value = (
        split_heads(layer, tensor)
        for layer, tensor in zip(self.linears, (query, key, value))
    )

    # x.size(): (batch_size, h, max_len, d_v) according to the original
    # notes; the layout of self.attn is decided by ``attention``.
    x, self.attn = attention(query, key, value, mask=mask,
                             dropout=self.dropout)

    # The transpose is required before flattening the heads:
    # (batch_size, h, max_len, d_v) -> (batch_size, max_len, h, d_v)
    # -> (batch_size, max_len, h * d_v).
    heads_last = x.transpose(1, 2).contiguous()
    x = heads_last.view(batch_size, -1, self.h * self.d_k)

    # self.linears[-1] \in R^{hd_v \times d_{model}}
    output_projection = self.linears[-1]
    return output_projection(x)
def forward(self, query, key, value, mask=None):
    """Project the inputs into heads, attend, and merge the heads again.

    Side effect: the second value returned by ``attention`` is stored on
    ``self.attn``.
    """
    nbatches = query.size(0)
    if mask is not None:
        # Same mask applied to all h heads.
        mask = mask.unsqueeze(1)

    def to_heads(pair):
        # d_model => h x d_k for one (linear layer, input tensor) pair.
        layer, tensor = pair
        return layer(tensor).view(
            nbatches, -1, self.h, self.d_k).transpose(1, 2)

    # 1) Do all the linear projections in batch.
    query, key, value = map(to_heads, zip(self.linears, (query, key, value)))

    # 2) Apply attention on all the projected vectors in batch.
    x, self.attn = attention(query, key, value, mask=mask,
                             dropout=self.dropout)

    # 3) "Concat" using a view and apply a final linear.
    concatenated = x.transpose(1, 2).contiguous().view(
        nbatches, -1, self.h * self.d_k)
    return self.linears[-1](concatenated)
def alstm_layer(self, inputs, lengths, state_size, keep_prob=1.0,
                scope='lstm-layer', reuse=False):
    """Run an LSTM with output dropout over ``inputs``, then ``attention``.

    :param inputs: passed to ``tf.nn.dynamic_rnn`` as ``inputs``
    :param lengths: passed to ``tf.nn.dynamic_rnn`` as ``sequence_length``
    :param state_size: number of units of the ``LSTMCell``
    :param keep_prob: ``output_keep_prob`` of the ``DropoutWrapper``
    :param scope: variable scope name for the layer
    :param reuse: forwarded to both the variable scope and the LSTM cell
    :return: the result of ``attention`` over the RNN outputs, called with
        ``self.attention_size``, ``time_major=False`` and
        ``return_alphas=False``
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Fixed: the module is ``tf.contrib`` (the original spelled it
        # ``tf.contirb``, which raises AttributeError).
        cell = tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.LSTMCell(state_size, reuse=reuse),
            output_keep_prob=keep_prob,
        )
        # The final RNN state is not needed by this layer.
        outputs, _ = tf.nn.dynamic_rnn(
            inputs=inputs,
            cell=cell,
            sequence_length=lengths,
            dtype=tf.float32,
        )
        # Fixed: a stray trailing ':' after this call was a syntax error.
        outputs = attention(outputs, self.attention_size,
                            time_major=False, return_alphas=False)
        return outputs
def decoder(inputs, memory, is_training=True, scope="decoder"):
    """
    Content-based tanh attention decoder built from a stack of GRUs with
    vertical residual connections.

    The inputs go through a prenet and are then processed with attention
    over ``memory``; the attention output feeds the residual decoder RNNs.
    The decoder could target the raw spectrogram directly, but that is a
    highly redundant representation for learning the alignment between
    speech signal and text. The target is therefore an 80-band mel-scale
    spectrogram (fewer bands or more concise targets such as cepstrum
    could also be used).

    :param inputs: decoder inputs, fed to ``pre_net``
    :param memory: second argument of ``attention``
    :param is_training: forwarded to ``pre_net``
    :param scope: variable scope name
    :return: tuple ``(mel_hats, alignments)``
    """
    with tf.variable_scope(scope):
        # Prenet
        prenet_out = pre_net(inputs, is_training=is_training)

        # Attention over the memory
        outputs, attn_state = attention(prenet_out, memory,
                                        num_units=EMBED_SIZE)

        # Stack the alignment history and reorder its axes
        stacked_history = attn_state.alignment_history.stack()
        alignments = tf.transpose(stacked_history, [1, 2, 0])

        # Decoder RNNs - 2-layer residual GRU (256 cells)
        for rnn_scope in ("decoder_rnn1", "decoder_rnn2"):
            outputs += decoder_rnn(outputs, scope=rnn_scope)

        # An 80-band mel-scale spectrogram is the target
        mel_hats = tf.layers.dense(outputs, N_MELS * REDUCTION_FACTOR)

    return mel_hats, alignments
def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             attention_dim, rnn_size, num_rnn_layers, num_classes,
             max_grad_norm, dropout=1., l2_reg_lambda=0.0,
             adjust_weight=False, label_weight=None, is_training=True):
    """Build the bidirectional-LSTM + attention classifier graph.

    :param batch_size: stored on the instance (not used while building)
    :param num_unroll_steps: number of time steps of ``input_data``
    :param embeddings: initial embedding matrix, converted with
        ``tf.to_float`` and made trainable
    :param embedding_size: stored on the instance
    :param attention_dim: forwarded to ``attention``
    :param rnn_size: units per LSTM cell; the softmax layer expects
        ``2 * rnn_size`` input features
    :param num_rnn_layers: number of stacked cells per direction
    :param num_classes: number of output classes
    :param max_grad_norm: global-norm gradient clipping threshold
    :param dropout: keep probability, applied only when training and < 1
    :param l2_reg_lambda: forwarded to ``attention``; when > 0 the softmax
        weights and bias are also added to the weight decay
    :param adjust_weight: stored on the instance
    :param label_weight: stored on the instance; ``None`` means an empty
        list (avoids sharing one mutable default between instances)
    :param is_training: when false, graph construction stops after the
        summaries and no training ops are created
    """
    # define input variable
    self.keep_prob = dropout
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.attention_dim = attention_dim
    self.num_classes = num_classes
    self.adjust_weight = adjust_weight
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.l2_reg_lambda = l2_reg_lambda
    self.max_grad_norm = max_grad_norm
    self.is_training = is_training

    self.input_data = tf.placeholder(tf.int32, [None, self.num_unroll_steps])
    self.target = tf.placeholder(tf.int64, [None])
    self.mask_x = tf.placeholder(tf.float32, [self.num_unroll_steps, None])

    # build BiLSTM network
    # forward rnn
    fw_gru_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
    fw_cell = tf.nn.rnn_cell.MultiRNNCell(
        [fw_gru_cell] * self.num_rnn_layers, state_is_tuple=True)
    # backward rnn
    bw_gru_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
    bw_cell = tf.nn.rnn_cell.MultiRNNCell(
        [bw_gru_cell] * self.num_rnn_layers, state_is_tuple=True)

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        self.embeddings = tf.Variable(tf.to_float(self.embeddings),
                                      trainable=True, name="embeddings")
        inputs = tf.nn.embedding_lookup(self.embeddings, self.input_data)

    # dropout
    if self.is_training and self.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, self.keep_prob)

    # One tensor per time step, as expected by bidirectional_rnn.
    inputs = [
        tf.squeeze(step, [1])
        for step in tf.split(1, self.num_unroll_steps, inputs)
    ]
    out_put, _, _ = tf.nn.bidirectional_rnn(fw_cell, bw_cell, inputs,
                                            dtype=tf.float32)
    # (batch_size, steps, rnn_size*2)
    out_put = tf.transpose(out_put, perm=[1, 0, 2])

    output = attention(out_put, self.attention_dim, self.l2_reg_lambda)

    # dropout
    if self.is_training and self.keep_prob < 1:
        output = tf.nn.dropout(output, self.keep_prob)

    with tf.name_scope("Softmax_layer_and_output"):
        softmax_w = tf.get_variable(
            "softmax_w",
            initializer=tf.truncated_normal(
                [2 * self.rnn_size, self.num_classes], stddev=0.1))
        softmax_b = tf.get_variable(
            "softmax_b", initializer=tf.constant(0., shape=[1]))
        self.logits = tf.matmul(output, softmax_w) + softmax_b

        if self.l2_reg_lambda > 0:
            # Fixed: ``l2_loss`` was accumulated with ``+=`` without ever
            # being initialised, raising UnboundLocalError.
            l2_loss = tf.nn.l2_loss(softmax_w) + tf.nn.l2_loss(softmax_b)
            weight_decay = tf.mul(l2_loss, self.l2_reg_lambda,
                                  name='l2_loss')
            tf.add_to_collection('losses', weight_decay)

    with tf.name_scope("loss"):
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            self.logits, self.target)
        # Fixed: the per-example loss is averaged before joining the
        # 'losses' collection. tf.add_n needs tensors of one shape, so the
        # [batch] loss vector could not be summed with the scalar weight
        # decay. With only the data loss present, the cost is unchanged.
        tf.add_to_collection('losses', tf.reduce_mean(self.loss))
        total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
        self.cost = tf.reduce_mean(total_loss)

    with tf.name_scope("accuracy"):
        self.prediction = tf.argmax(self.logits, 1)
        correct_prediction = tf.equal(self.prediction, self.target)
        self.correct_num = tf.reduce_sum(
            tf.cast(correct_prediction, tf.float32))
        self.accuracy = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32), name="accuracy")

    # add summary
    loss_summary = tf.scalar_summary("loss", self.cost)
    accuracy_summary = tf.scalar_summary("accuracy_summary", self.accuracy)

    if not is_training:
        return

    self.globle_step = tf.Variable(0, name="globle_step", trainable=False)
    self.lr = tf.Variable(0.0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      self.max_grad_norm)

    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for g, v in zip(grads, tvars):
        if g is not None:
            grad_hist_summary = tf.histogram_summary(
                "{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.scalar_summary(
                "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    self.grad_summaries_merged = tf.merge_summary(grad_summaries)

    self.summary = tf.merge_summary(
        [loss_summary, accuracy_summary, self.grad_summaries_merged])

    optimizer = tf.train.AdamOptimizer(self.lr)
    # Fixed: apply_gradients used to be called twice, which created a
    # second, unused update op in the graph.
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # The learning rate is changed by running ``_lr_update`` with a value
    # fed through ``new_lr``.
    self.new_lr = tf.placeholder(tf.float32, shape=[],
                                 name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)
def test(args):
    """Evaluate the baseline model and log per-batch and overall F1 scores.

    Reads from ``args``: ``side``, ``model_root``, ``out_dir``,
    ``is_savemaps``, ``imageroot``, ``gtroot``, ``reasonroot`` and
    ``batch_size``. Results are printed and written to
    ``<out_dir>TestingLog.txt``; when ``args.is_savemaps`` is set, attention
    maps are saved under ``<out_dir>att_maps/``.

    When ``args.side`` is false the model returns only action predictions,
    so reason metrics are skipped (the original raised NameError on
    ``pred_reason`` in that case).
    """
    # Initialize the network
    model = baseline(args.side)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model.load_state_dict(torch.load(args.model_root))
    print(model)
    model.eval()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    outdir = args.out_dir

    if args.is_savemaps:
        hook_conv5 = SimpleHook(model.layer4)

    # Initialize DataLoader
    Dataset = BatchLoader(
        imageRoot=args.imageroot,
        gtRoot=args.gtroot,
        reasonRoot=args.reasonroot,
    )
    dataloader = DataLoader(Dataset, batch_size=int(args.batch_size),
                            num_workers=0, shuffle=False)

    AccuracyArr = []
    AccOverallArr = []
    RandomAcc = []
    ReasonAcc = []

    SaveFilename = (outdir + 'TestingLog.txt')
    print('Save to ', SaveFilename)

    # ``with`` guarantees the log is flushed and closed even if a batch fails.
    with open(SaveFilename, 'w') as TestingLog:
        TestingLog.write(str(args) + '\n')

        for i, dataBatch in enumerate(dataloader):
            # Read data
            img_cpu = dataBatch['img']
            imBatch = img_cpu.to(device)
            ori_img_cpu = dataBatch['ori_img']
            target_cpu = dataBatch['target']
            targetBatch = target_cpu.to(device)

            # Prediction; no gradients are needed for evaluation.
            with torch.no_grad():
                if args.side:
                    reason_cpu = dataBatch['reason']
                    pred, pred_reason = model(imBatch)
                else:
                    pred = model(imBatch)

            if args.is_savemaps:
                hooked_features = hook_conv5.output.data
                hooked_features = torch.mean(
                    torch.mean(hooked_features, dim=0), dim=0)
                new_img = attention(
                    ori_img_cpu.squeeze(0).data.numpy(),
                    hooked_features.cpu().data.numpy())
                plt.imsave((outdir + 'att_maps/' + str(i) + '.jpg'), new_img)

            # Calculate accuracy
            predict = torch.sigmoid(pred) > 0.5
            f1 = f1_score(target_cpu.data.numpy(),
                          predict.cpu().data.numpy(), average=None)
            f1_overall = f1_score(target_cpu.data.numpy(),
                                  predict.cpu().data.numpy(),
                                  average='samples')
            AccuracyArr.append(f1)
            AccOverallArr.append(f1_overall)

            if args.side:
                predict_reason = torch.sigmoid(pred_reason) > 0.5
                f1_reason = f1_score(reason_cpu.data.numpy(),
                                     predict_reason.cpu().data.numpy(),
                                     average='samples')
                ReasonAcc.append(f1_reason)

            # Random guess baseline. The row count follows the actual batch
            # so a smaller final batch (or a string-valued
            # ``args.batch_size``) no longer breaks f1_score.
            random_guess = torch.randint(0, 2, (target_cpu.shape[0], 5))
            random_guess[:, 4] = 0
            random_f1 = f1_score(target_cpu.data.numpy(),
                                 random_guess.cpu().data.numpy(),
                                 average=None)
            RandomAcc.append(random_f1)

            print('prediction logits:', pred)
            print('prediction action: \n {}'.format(predict))
            print('ground truth: \n', targetBatch.cpu().data.numpy())
            print('Iteration {}: F1 {} Accumulated F1 {}'.format(
                i, AccuracyArr[-1], np.mean(np.array(AccuracyArr), axis=0)))

            TestingLog.write('prediction logits:' + str(pred) + '\n')
            TestingLog.write(
                'prediction action: \n {}'.format(predict) + '\n')
            TestingLog.write('ground truth: \n' +
                             str(targetBatch.cpu().data.numpy()) + '\n')
            TestingLog.write('Iteration {}: F1 {} Accumulated F1 {}'.format(
                i, AccuracyArr[-1],
                np.mean(np.array(AccuracyArr), axis=0)) + '\n')
            TestingLog.write('\n')

        print("Random guess acc:{}".format(
            np.mean(np.array(RandomAcc), axis=0)))
        print("Overall acc:{}".format(
            np.mean(np.array(AccOverallArr), axis=0)))
        if args.side:
            print("Reason acc:{}".format(
                np.mean(np.array(ReasonAcc), axis=0)))

        TestingLog.write(
            "Random guess acc:{}".format(
                np.mean(np.array(RandomAcc), axis=0)) + '\n')
        TestingLog.write(
            "Overall acc:{}".format(
                np.mean(np.array(AccOverallArr), axis=0)) + '\n')
def test(cfg, args):
    """Evaluate the detection model and log action (and reason) F1 scores.

    Reads ``cfg.MODEL.DEVICE``, ``cfg.MODEL.SIDE`` and ``cfg.OUTPUT_DIR``;
    from ``args`` it reads ``model_root``, ``imageroot``, ``gtroot``,
    ``reasonroot``, ``imHeight``, ``imWidth``, ``batch_size`` and
    ``is_savemaps``. Results are printed and written to
    ``<cfg.OUTPUT_DIR>/inference/TestingLog.txt``.
    """
    # Initialize the network
    model = build_detection_model(cfg)
    print(model)
    model.eval()

    # model load weights
    model.load_state_dict(torch.load(args.model_root))

    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    outdir = os.path.join(cfg.OUTPUT_DIR, 'inference/')
    os.makedirs(outdir, exist_ok=True)

    # Created once instead of once per batch, so the hooks do not pile up.
    # NOTE(review): assumes SimpleHook attaches itself to the module on
    # construction and refreshes ``.output`` on every forward -- confirm.
    if args.is_savemaps:
        hook_conv5 = SimpleHook(model.predictor.relu_glob1)

    # Initialize DataLoader
    Dataset = BatchLoader(imageRoot=args.imageroot,
                          gtRoot=args.gtroot,
                          reasonRoot=args.reasonroot,
                          cropSize=(args.imHeight, args.imWidth))
    dataloader = DataLoader(Dataset, batch_size=int(args.batch_size),
                            num_workers=24, shuffle=False)

    AccOverallArr = []
    TargetArr = []
    PredArr = []
    RandomArr = []

    AccOverallReasonArr = []
    TargetReasonArr = []
    PredReasonArr = []
    RandomReasonArr = []

    SaveFilename = (outdir + 'TestingLog.txt')
    print('Save to ', SaveFilename)

    count = len(dataloader)
    # ``with`` closes the log; the original never closed it.
    with open(SaveFilename, 'w') as TestingLog:
        TestingLog.write(str(args) + '\n')

        for i, dataBatch in enumerate(dataloader):
            print('Finished: {} / {}'.format(i, count))
            print('Finished: %.2f%%' % (i / count * 100))

            # Read data and run the model without tracking gradients.
            with torch.no_grad():
                img_cpu = dataBatch['img']
                imBatch = img_cpu.to(device)
                ori_img_cpu = dataBatch['ori_img']
                target_cpu = dataBatch['target']
                targetBatch = target_cpu.to(device)

                # With is_savemaps the model also returns selected boxes.
                if cfg.MODEL.SIDE:
                    reason_cpu = dataBatch['reason']
                    if not args.is_savemaps:
                        pred, pred_reason = model(imBatch)
                    else:
                        pred, pred_reason, selected_boxes = model(imBatch)
                else:
                    if not args.is_savemaps:
                        pred = model(imBatch)
                    else:
                        pred, selected_boxes = model(imBatch)

            if args.is_savemaps:
                hooked_features = hook_conv5.output.data
                print("hooked_feature:", hooked_features.shape)
                hooked_features = torch.mean(
                    torch.mean(hooked_features, dim=0), dim=0)
                new_img = attention(
                    ori_img_cpu.squeeze(0).data.numpy(),
                    hooked_features.cpu().data.numpy(),
                    [hooked_features.shape[-1], hooked_features.shape[-1]])
                DrawBbox(new_img, selected_boxes[0], outdir, i)

            # Calculate accuracy
            predict = torch.sigmoid(pred) > 0.5

            TargetArr.append(target_cpu.data.numpy())
            PredArr.append(predict.cpu().data.numpy())

            f1_overall = f1_score(target_cpu.data.numpy(),
                                  predict.cpu().data.numpy(),
                                  average='samples')
            AccOverallArr.append(f1_overall)

            # random guess
            random = np.random.randint(
                0, 2, (predict.shape[0], predict.shape[1]))
            RandomArr.append(random)

            if cfg.MODEL.SIDE:
                predict_reason = torch.sigmoid(pred_reason) > 0.5
                TargetReasonArr.append(reason_cpu.data.numpy())
                PredReasonArr.append(predict_reason.cpu().data.numpy())
                f1_overall_reason = f1_score(
                    reason_cpu.data.numpy(),
                    predict_reason.cpu().data.numpy(),
                    average='samples')
                AccOverallReasonArr.append(f1_overall_reason)

                # random guess
                random = np.random.randint(
                    0, 2, (predict_reason.shape[0], predict_reason.shape[1]))
                RandomReasonArr.append(random)

            print('prediction logits:', pred)
            print('prediction action: \n {}'.format(predict))
            print('ground truth: \n', targetBatch.cpu().data.numpy())
            print('Accumulated Overall Action acc: ', np.mean(AccOverallArr))

            TestingLog.write('Iter ' + str(i) + '\n')
            TestingLog.write('prediction logits:' + str(pred) + '\n')
            TestingLog.write(
                'prediction action: \n {}'.format(predict) + '\n')
            TestingLog.write('ground truth: \n' +
                             str(targetBatch.cpu().data.numpy()) + '\n')

            if cfg.MODEL.SIDE:
                print('prediction reason: \n {}'.format(predict_reason))
                print('ground truth: \n', reason_cpu.data.numpy())
                print('Accumulated Overall Reason acc: ',
                      np.mean(AccOverallReasonArr))

                TestingLog.write(
                    'prediction reason: \n {}'.format(predict_reason) + '\n')
                TestingLog.write('ground truth: \n' +
                                 str(reason_cpu.data.numpy()) + '\n')
            TestingLog.write('\n')

        TargetArr = List2Arr(TargetArr)
        PredArr = List2Arr(PredArr)
        RandomArr = List2Arr(RandomArr)

        print(TargetArr)
        print(PredArr)

        f1_pred = f1_score(TargetArr, PredArr, average=None)
        f1_rand = f1_score(TargetArr, RandomArr, average=None)

        print("Action Random guess acc:{}".format(f1_rand))
        print("Action Category Acc:{}".format(f1_pred))
        print("Action Average Acc:{}".format(np.mean(f1_pred)))
        print("Action Overall acc:{}".format(
            np.mean(np.array(AccOverallArr), axis=0)))

        # Fixed: each summary entry now ends with a newline; previously
        # they were written back-to-back on a single line.
        TestingLog.write("Action Random guess acc:{}".format(f1_rand) + '\n')
        TestingLog.write("Action Category Acc:{}".format(f1_pred) + '\n')
        TestingLog.write(
            "Action Average Acc:{}".format(np.mean(f1_pred)) + '\n')
        TestingLog.write("Action Overall acc:{}".format(
            np.mean(np.array(AccOverallArr), axis=0)) + '\n')

        if cfg.MODEL.SIDE:
            TargetReasonArr = List2Arr(TargetReasonArr)
            PredReasonArr = List2Arr(PredReasonArr)
            RandomReasonArr = List2Arr(RandomReasonArr)

            f1_pred_reason = f1_score(TargetReasonArr, PredReasonArr,
                                      average=None)
            f1_pred_rand = f1_score(TargetReasonArr, RandomReasonArr,
                                    average=None)

            print("Reason Random guess acc:{}".format(f1_pred_rand))
            print("Reason Category Acc:{}".format(f1_pred_reason))
            print("Reason Average Acc:{}".format(np.mean(f1_pred_reason)))
            print("Reason Overall Acc:{}".format(
                np.mean(np.array(AccOverallReasonArr), axis=0)))

            TestingLog.write(
                "Reason Random guess acc:{}".format(f1_pred_rand) + '\n')
            TestingLog.write(
                "Reason Category Acc:{}".format(f1_pred_reason) + '\n')
            TestingLog.write("Reason Average Acc:{}".format(
                np.mean(f1_pred_reason)) + '\n')
            TestingLog.write("Reason Overall Acc:{}".format(
                np.mean(np.array(AccOverallReasonArr), axis=0)) + '\n')
def __init__(self, batch_size, num_unroll_steps, embeddings, embedding_size,
             attention_dim, rnn_size, num_rnn_layers, num_classes,
             max_grad_norm, dropout=1., l2_reg_lambda=0.0,
             adjust_weight=False, label_weight=None, is_training=True):
    """Build the bidirectional-LSTM + attention classifier graph.

    :param batch_size: stored on the instance (not used while building)
    :param num_unroll_steps: number of time steps of ``input_data``
    :param embeddings: initial embedding matrix, converted with
        ``tf.to_float`` and made trainable
    :param embedding_size: stored on the instance
    :param attention_dim: forwarded to ``attention``
    :param rnn_size: units per LSTM cell; the softmax layer expects
        ``2 * rnn_size`` input features
    :param num_rnn_layers: number of stacked cells per direction
    :param num_classes: number of output classes
    :param max_grad_norm: global-norm gradient clipping threshold
    :param dropout: keep probability, applied only when training and < 1
    :param l2_reg_lambda: forwarded to ``attention``; no weight decay is
        added for the softmax layer in this variant
    :param adjust_weight: stored on the instance
    :param label_weight: stored on the instance; ``None`` means an empty
        list (avoids sharing one mutable default between instances)
    :param is_training: when false, graph construction stops after the
        summaries and no training ops are created
    """
    # define input variable
    self.keep_prob = dropout
    self.batch_size = batch_size
    self.embeddings = embeddings
    self.embedding_size = embedding_size
    self.attention_dim = attention_dim
    self.num_classes = num_classes
    self.adjust_weight = adjust_weight
    self.label_weight = [] if label_weight is None else label_weight
    self.rnn_size = rnn_size
    self.num_rnn_layers = num_rnn_layers
    self.num_unroll_steps = num_unroll_steps
    self.l2_reg_lambda = l2_reg_lambda
    self.max_grad_norm = max_grad_norm
    self.is_training = is_training

    self.input_data = tf.placeholder(tf.int32, [None, self.num_unroll_steps])
    self.target = tf.placeholder(tf.int64, [None])
    self.mask_x = tf.placeholder(tf.float32, [self.num_unroll_steps, None])

    # build BiLSTM network
    # forward rnn
    fw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
    fw_cell = tf.nn.rnn_cell.MultiRNNCell(
        [fw_lstm_cell] * self.num_rnn_layers, state_is_tuple=True)
    # backward rnn
    bw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.rnn_size)
    bw_cell = tf.nn.rnn_cell.MultiRNNCell(
        [bw_lstm_cell] * self.num_rnn_layers, state_is_tuple=True)

    # embedding layer
    with tf.device("/cpu:0"), tf.name_scope("embedding_layer"):
        self.embeddings = tf.Variable(tf.to_float(self.embeddings),
                                      trainable=True, name="embeddings")
        inputs = tf.nn.embedding_lookup(self.embeddings, self.input_data)

    # dropout
    if self.is_training and self.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, self.keep_prob)

    # One tensor per time step, as expected by bidirectional_rnn.
    inputs = [
        tf.squeeze(step, [1])
        for step in tf.split(1, self.num_unroll_steps, inputs)
    ]
    out_put, _, _ = tf.nn.bidirectional_rnn(fw_cell, bw_cell, inputs,
                                            dtype=tf.float32)
    # (batch_size, steps, rnn_size*2)
    out_put = tf.transpose(out_put, perm=[1, 0, 2])

    output = attention(out_put, self.attention_dim, self.l2_reg_lambda)

    # dropout
    if self.is_training and self.keep_prob < 1:
        output = tf.nn.dropout(output, self.keep_prob)

    with tf.name_scope("Softmax_layer_and_output"):
        softmax_w = tf.get_variable(
            "softmax_w",
            initializer=tf.truncated_normal(
                [2 * self.rnn_size, self.num_classes], stddev=0.1))
        softmax_b = tf.get_variable(
            "softmax_b", initializer=tf.constant(0., shape=[1]))
        self.logits = tf.matmul(output, softmax_w) + softmax_b

    with tf.name_scope("loss"):
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            self.logits, self.target)
        self.cost = tf.reduce_mean(self.loss)

    with tf.name_scope("accuracy"):
        self.prediction = tf.argmax(self.logits, 1)
        correct_prediction = tf.equal(self.prediction, self.target)
        self.correct_num = tf.reduce_sum(
            tf.cast(correct_prediction, tf.float32))
        self.accuracy = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32), name="accuracy")

    # add summary
    loss_summary = tf.scalar_summary("loss", self.cost)
    accuracy_summary = tf.scalar_summary("accuracy_summary", self.accuracy)

    if not is_training:
        return

    self.globle_step = tf.Variable(0, name="globle_step", trainable=False)
    self.lr = tf.Variable(0.0, trainable=False)

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      self.max_grad_norm)

    # Keep track of gradient values and sparsity (optional)
    grad_summaries = []
    for g, v in zip(grads, tvars):
        if g is not None:
            grad_hist_summary = tf.histogram_summary(
                "{}/grad/hist".format(v.name), g)
            sparsity_summary = tf.scalar_summary(
                "{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
            grad_summaries.append(grad_hist_summary)
            grad_summaries.append(sparsity_summary)
    self.grad_summaries_merged = tf.merge_summary(grad_summaries)

    self.summary = tf.merge_summary(
        [loss_summary, accuracy_summary, self.grad_summaries_merged])

    optimizer = tf.train.AdamOptimizer(self.lr)
    # Fixed: apply_gradients used to be called twice, which created a
    # second, unused update op in the graph.
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # The learning rate is changed by running ``_lr_update`` with a value
    # fed through ``new_lr``.
    self.new_lr = tf.placeholder(tf.float32, shape=[],
                                 name="new_learning_rate")
    self._lr_update = tf.assign(self.lr, self.new_lr)