def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False):
    super(Model_DIN_V2_Gru_att_Gru, self).__init__(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                   HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling)

    # RNN layer(-s)
    with tf.name_scope('rnn_1'):
        rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                     sequence_length=self.seq_len_ph, dtype=tf.float32,
                                     scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask,
                                                softmax_stag=1, stag='1_1', mode='LIST',
                                                return_alphas=True)
        tf.summary.histogram('alpha_outputs', alphas)

    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=att_outputs,
                                                 sequence_length=self.seq_len_ph, dtype=tf.float32,
                                                 scope="gru2")
        tf.summary.histogram('GRU2_Final_State', final_state2)

    inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                     self.item_eb * self.item_his_eb_sum, final_state2], 1)
    # Fully connected layer
    self.build_fcn_net(inp, use_dice=True)
def add_sentence_summaries(self, wordvector_embed_size):
    if self.config.bidirectional_sentences:
        forwardcell = DropoutWrapper(GRUCell(wordvector_embed_size),
                                     self.dropout_placeholder, self.dropout_placeholder)
        backwardcell = DropoutWrapper(GRUCell(wordvector_embed_size),
                                      self.dropout_placeholder, self.dropout_placeholder)
        _, statefw, statebw = bidirectional_dynamic_rnn(forwardcell, backwardcell,
                                                        self.embedded_lines,
                                                        self.line_length_placeholder,
                                                        dtype=tf.float32, scope="LineRNN")
        # self.sentence_summaries = tf.concat(1, [statefw, statebw])  # pre-TF-1.0 argument order
        self.sentence_summaries = tf.concat([statefw, statebw], 1)
        return 2 * wordvector_embed_size
    else:
        rnncell = DropoutWrapper(GRUCell(wordvector_embed_size),
                                 self.dropout_placeholder, self.dropout_placeholder)
        _, self.sentence_summaries = tf.nn.dynamic_rnn(rnncell, self.embedded_lines,
                                                       self.line_length_placeholder,
                                                       dtype=tf.float32, scope="LineRNN")
        return wordvector_embed_size
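# A minimal, self-contained sketch (assuming the stock TF 1.x API rather than the
# helper bidirectional_dynamic_rnn used above): tf.nn.bidirectional_dynamic_rnn
# returns (outputs, (state_fw, state_bw)); concatenating the two final GRU states
# is what makes the summary width 2 * wordvector_embed_size.
import tensorflow as tf

cell_fw = tf.nn.rnn_cell.GRUCell(64)
cell_bw = tf.nn.rnn_cell.GRUCell(64)
inputs = tf.placeholder(tf.float32, [None, None, 32])   # [batch, time, features]
lengths = tf.placeholder(tf.int32, [None])              # true length of each sequence
_, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
    cell_fw, cell_bw, inputs, sequence_length=lengths, dtype=tf.float32)
summary = tf.concat([state_fw, state_bw], 1)            # [batch, 128]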
def build_model(self):
    with tf.variable_scope("inferring_module"):
        rdim = 768
        update_num = 2
        batch_size = tf.shape(self.sent1)[0]
        dim = self.sent1.get_shape().as_list()[-1]

        sr_cell = GRUCell(num_units=rdim, activation=tf.nn.relu)
        r_cell = sr_cell
        tri_cell = TriangularCell(num_units=rdim,
                                  r_cell=r_cell,
                                  sent1=self.sent1, sent2=self.sent2, sent3=self.sent3,
                                  sent1_length=39, sent2_length=110, sent3_length=152,
                                  dim=dim,
                                  use_bias=False, activation=tf.nn.relu,
                                  sent1_mask=self.sent1_mask, sent2_mask=self.sent2_mask,
                                  sent3_mask=self.sent3_mask,
                                  initializer=None, dtype=tf.float32)
        fake_input = tf.tile(tf.expand_dims(self.mark0, axis=1), [1, update_num, 1])
        self.init_state = tri_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        self.double_output, last_state = dynamic_rnn(cell=tri_cell,
                                                     inputs=fake_input,
                                                     initial_state=self.init_state)

        r1_output, r2_output, r3_output = last_state[3:]  # (B, dim)
        temp13 = tf.concat([r1_output, r3_output, r1_output * r3_output], axis=1)
        temp23 = tf.concat([r2_output, r3_output, r2_output * r3_output], axis=1)
        temp13 = dropout(temp13, self.dropout_rate)
        temp23 = dropout(temp23, self.dropout_rate)
        r13 = tf.layers.dense(temp13, 768, activation=tf.tanh,
                              kernel_initializer=create_initializer(0.02))
        r23 = tf.layers.dense(temp23, 768, activation=tf.tanh,
                              kernel_initializer=create_initializer(0.02))
        temp = tf.concat([self.mark0, r13, r23], axis=1)
        refer_output = tf.layers.dense(temp, 768, activation=None,
                                       kernel_initializer=create_initializer(0.02))
        return refer_output
def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=True):
    super(Model_DIN_V2_Gru_Vec_attGru_Neg, self).__init__(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                          HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling)

    with tf.name_scope('rnn_1'):
        rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                     sequence_length=self.seq_len_ph, dtype=tf.float32,
                                     scope='gru1')
        tf.summary.histogram("GRU_outputs", rnn_outputs)

    aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                     self.noclk_item_his_eb[:, 1:, :], self.mask[:, 1:],
                                     stag="gru")
    self.aux_loss = aux_loss_1

    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask,
                                                softmax_stag=1, stag='1_1', mode='LIST',
                                                return_alphas=True)
        tf.summary.histogram('alpha_outputs', alphas)

    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                                 att_scores=tf.expand_dims(alphas, -1),
                                                 sequence_length=self.seq_len_ph, dtype=tf.float32,
                                                 scope="gru2")
        tf.summary.histogram("GRU2_Final_State", final_state2)

    inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                     self.item_eb * self.item_his_eb_sum, final_state2], 1)
    self.build_fcn_net(inp, use_dice=True)
def RNN(_X, _weights, _biases, lens):
    if FLAGS.unit == "PLSTM":
        cell = PhasedLSTMCell(FLAGS.n_hidden, use_peepholes=True, state_is_tuple=True)
    elif FLAGS.unit == "GRU":
        cell = GRUCell(FLAGS.n_hidden)
    elif FLAGS.unit == "LSTM":
        cell = LSTMCell(FLAGS.n_hidden, use_peepholes=True, state_is_tuple=True)
    else:
        raise ValueError("Unit '{}' not implemented.".format(FLAGS.unit))

    outputs = multiPLSTM(_X, lens, FLAGS.n_layers, FLAGS.n_hidden, n_input)
    outputs = tf.slice(outputs, [0, 0, 0], [-1, -1, FLAGS.n_hidden])

    # TODO better (?) in lack of smart indexing
    batch_size = tf.shape(outputs)[0]
    max_len = tf.shape(outputs)[1]
    out_size = int(outputs.get_shape()[2])
    index = tf.range(0, batch_size) * max_len + (lens - 1)
    flat = tf.reshape(outputs, [-1, out_size])
    relevant = tf.gather(flat, index)

    return tf.nn.bias_add(tf.matmul(relevant, _weights['out']), _biases['out'])
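# The "last relevant output" trick above, isolated as a reusable sketch (TF 1.x
# assumed): flatten [batch, time, hidden] to [batch * time, hidden] and gather
# the row at index b * max_len + (len_b - 1) for each sequence b, which picks
# the output at each sequence's true final step regardless of padding.
import tensorflow as tf

def last_relevant(outputs, lengths):
    """outputs: [batch, max_time, hidden]; lengths: [batch] int32 true lengths."""
    batch_size = tf.shape(outputs)[0]
    max_len = tf.shape(outputs)[1]
    out_size = int(outputs.get_shape()[2])
    index = tf.range(0, batch_size) * max_len + (lengths - 1)
    flat = tf.reshape(outputs, [-1, out_size])
    return tf.gather(flat, index)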
def __init__(self, feature_size, eb_dim, hidden_size, max_time_len, user_fnum, item_fnum, emb_initializer):
    super(DIEN, self).__init__(feature_size, eb_dim, hidden_size, max_time_len,
                               user_fnum, item_fnum, emb_initializer)
    mask = tf.sequence_mask(self.user_seq_length_ph, max_time_len, dtype=tf.float32)

    # attention RNN layer
    with tf.name_scope('rnn_1'):
        user_seq_ht, _ = tf.nn.dynamic_rnn(GRUCell(hidden_size), inputs=self.user_seq,
                                           sequence_length=self.user_seq_length_ph,
                                           dtype=tf.float32, scope='gru1')
    with tf.name_scope('attention'):
        atten_score, _ = self.attention(user_seq_ht, user_seq_ht, self.target_item, mask)
    with tf.name_scope('rnn_2'):
        _, seq_rep = dynamic_rnn(VecAttGRUCell(hidden_size), inputs=user_seq_ht,
                                 att_scores=atten_score,
                                 sequence_length=self.user_seq_length_ph,
                                 dtype=tf.float32, scope="argru1")

    inp = tf.concat([seq_rep, self.target_user, self.target_item], axis=1)

    # fully connected layer
    self.build_fc_net(inp)
    self.build_logloss()
def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=True):
    super(Model_DIN_V2_Gru_Vec_attGru_Neg, self).__init__(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                          HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling)

    # RNN layer(-s): the first GRU layer feeds the item embeddings of the user's
    # behavior history into a dynamic RNN; the auxiliary loss is computed from its outputs.
    with tf.name_scope('rnn_1'):
        rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                     sequence_length=self.seq_len_ph, dtype=tf.float32,
                                     scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    # The auxiliary loss is effectively a binary classifier (clicked vs. non-clicked next item):
    aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                     self.noclk_item_his_eb[:, 1:, :], self.mask[:, 1:],
                                     stag="gru")
    self.aux_loss = aux_loss_1

    # Attention layer
    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask,
                                                softmax_stag=1, stag='1_1', mode='LIST',
                                                return_alphas=True)
        tf.summary.histogram('alpha_outputs', alphas)

    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                                 att_scores=tf.expand_dims(alphas, -1),  # models the evolution of interests
                                                 sequence_length=self.seq_len_ph, dtype=tf.float32,
                                                 scope="gru2")
        tf.summary.histogram('GRU2_Final_State', final_state2)

    inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                     self.item_eb * self.item_his_eb_sum, final_state2], 1)
    # Finally, a multi-layer network produces the CTR estimate.
    self.build_fcn_net(inp, use_dice=True)
def __init__(self, num_units, memory, pmemory, cell_type='lstm'):
    super(AttentionCell, self).__init__()
    self._cell = LSTMCell(num_units) if cell_type == 'lstm' else GRUCell(num_units)
    self.num_units = num_units
    self.memory = memory
    self.pmemory = pmemory
    self.mem_units = memory.get_shape().as_list()[-1]
def __init__(self, feature_size, eb_dim, hidden_size, max_time_len, user_fnum, item_fnum, emb_initializer):
    super(HPMN, self).__init__(feature_size, eb_dim, hidden_size, max_time_len,
                               user_fnum, item_fnum, emb_initializer)
    self.layer_num = 3
    self.split_by = 2
    self.memory = []

    with tf.name_scope('rnn'):
        inp = self.user_seq
        length = max_time_len
        for i in range(self.layer_num):
            user_seq_ht, user_seq_final_state = tf.nn.dynamic_rnn(GRUCell(hidden_size), inputs=inp,
                                                                  dtype=tf.float32, scope='GRU%s' % i)
            user_seq_final_state = tf.expand_dims(user_seq_final_state, 1)
            self.memory.append(user_seq_final_state)

            length = int(length / self.split_by)
            user_seq_ht = tf.reshape(user_seq_ht, [-1, length, self.split_by, hidden_size])
            inp = tf.reshape(tf.gather(user_seq_ht, [self.split_by - 1], axis=2),
                             [-1, length, hidden_size])

    self.memory = tf.concat(self.memory, axis=1)
    _, output = self.attention(self.memory, self.memory, self.target_item)
    self.repre = tf.concat([self.target_user, self.target_item, output], axis=1)
    self.build_fc_net(self.repre)
    self.build_loss()
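# Toy check of the pyramid subsampling above (hypothetical shapes, TF 1.x): with
# max_time_len = 8 and split_by = 2, the per-layer input lengths go 8 -> 4 -> 2,
# and each step keeps only the last hidden state of every pair.
import tensorflow as tf

ht = tf.reshape(tf.range(8 * 4, dtype=tf.float32), [1, 8, 4])   # [B=1, T=8, H=4]
pairs = tf.reshape(ht, [-1, 4, 2, 4])                           # group time steps in pairs
sub = tf.reshape(tf.gather(pairs, [1], axis=2), [-1, 4, 4])     # keep 2nd of each pair -> T=4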
def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
    super(Model_GRU4REC, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
                                        BATCH_SIZE, SEQ_LEN, Flag="GRU4REC")
    with tf.name_scope('rnn_1'):
        self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
        rnn_outputs, final_state1 = dynamic_rnn(GRUCell(2 * EMBEDDING_DIM), inputs=self.item_his_eb,
                                                sequence_length=self.sequence_length,
                                                dtype=tf.float32, scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    inp = tf.concat([self.item_eb, self.item_his_eb_sum, final_state1], 1)
    self.build_fcn_net(inp, use_dice=False)
def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=400, use_negsample=False):
    super(Model_DIEN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
                                     BATCH_SIZE, SEQ_LEN, use_negsample, Flag="DIEN")

    with tf.name_scope('rnn_1'):
        self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
        rnn_outputs, _ = dynamic_rnn(GRUCell(2 * EMBEDDING_DIM), inputs=self.item_his_eb,
                                     sequence_length=self.sequence_length,
                                     dtype=tf.float32, scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    if use_negsample:
        aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                         self.neg_his_eb[:, 1:, :], self.mask[:, 1:],
                                         stag="bigru_0")
        self.aux_loss = aux_loss_1

    # Attention layer
    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_attention(self.item_eb, rnn_outputs, HIDDEN_SIZE,
                                            mask=self.mask, mode="LIST", return_alphas=True)
        tf.summary.histogram('alpha_outputs', alphas)

    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                                 att_scores=tf.expand_dims(alphas, -1),
                                                 sequence_length=self.sequence_length,
                                                 dtype=tf.float32, scope="gru2")
        tf.summary.histogram('GRU2_Final_State', final_state2)

    inp = tf.concat([self.item_eb, final_state2, self.item_his_eb_sum,
                     self.item_eb * self.item_his_eb_sum], 1)
    self.build_fcn_net(inp, use_dice=False)
def __init__(self, n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE, BATCH_SIZE, SEQ_LEN=256):
    super(Model_ARNN, self).__init__(n_uid, n_mid, EMBEDDING_DIM, HIDDEN_SIZE,
                                     BATCH_SIZE, SEQ_LEN, Flag="ARNN")
    with tf.name_scope('rnn_1'):
        self.sequence_length = tf.Variable([SEQ_LEN] * BATCH_SIZE)
        rnn_outputs, final_state1 = dynamic_rnn(GRUCell(2 * EMBEDDING_DIM), inputs=self.item_his_eb,
                                                sequence_length=self.sequence_length,
                                                dtype=tf.float32, scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer_1'):
        att_gru = din_attention(self.item_eb, rnn_outputs, HIDDEN_SIZE, self.mask)
        att_gru = tf.reduce_sum(att_gru, 1)

    inp = tf.concat([self.item_eb, self.item_his_eb_sum, final_state1, att_gru], -1)
    self.build_fcn_net(inp, use_dice=False)
def build_tf_net(self, datas, is_train=True):
    super(Model_DIEN, self).build_tf_net(datas, is_train)

    # RNN layer(-s)
    # GRU of the interest extractor layer
    with tf.name_scope('rnn_1'):
        rnn_outputs, _ = dynamic_rnn(GRUCell(self.hidden_size, kernel_initializer=get_tf_initializer()),
                                     inputs=self.item_his_eb, sequence_length=self.tensors.seq_len,
                                     dtype=tf.float32, scope="gru1")
    aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                     self.noclk_item_his_eb[:, 1:, :],
                                     self.tensors.mask[:, 1:], stag="gru")
    self.aux_loss = aux_loss_1

    # Attention of the interest evolving layer
    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, self.attention_size,
                                                self.tensors.mask, softmax_stag=1, stag='1_1',
                                                mode='LIST', return_alphas=True)

    # AUGRU of the interest evolving layer
    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(self.hidden_size,
                                                               kernel_initializer=get_tf_initializer()),
                                                 inputs=rnn_outputs,
                                                 att_scores=tf.expand_dims(alphas, -1),
                                                 sequence_length=self.tensors.seq_len,
                                                 dtype=tf.float32, scope="gru2")

    inp = tf.concat([self.tensors.uid, self.item_eb, final_state2, self.item_his_eb_sum], 1)
    # After concatenation, feed the combined features into the model's fully connected net.
    self.build_fcn_net(inp, use_dice=True)
def __init__(self, batch, hidden, keep_prob=1.0, is_train=None, scope="ptr_net"):
    self.gru = GRUCell(hidden)
    self.batch = batch
    self.scope = scope
    self.keep_prob = keep_prob
    self.is_train = is_train
    self.dropout_mask = dropout(tf.ones([batch, hidden], dtype=tf.float32),
                                keep_prob=keep_prob, is_train=is_train)
def _create_rnn_cell(self):
    cell = GRUCell(self.cfg.num_units) if self.cfg.cell_type == "gru" else LSTMCell(self.cfg.num_units)
    if self.cfg.use_dropout:
        cell = DropoutWrapper(cell, output_keep_prob=self.keep_prob)
    if self.cfg.use_residual:
        cell = ResidualWrapper(cell)
    return cell
def _create_single_rnn_cell(self, num_units):
    cell = GRUCell(num_units) if self.cfg["cell_type"] == "gru" else LSTMCell(num_units)
    if self.cfg["use_dropout"]:
        cell = DropoutWrapper(cell, output_keep_prob=self.rnn_keep_prob)
    if self.cfg["use_residual"]:
        cell = ResidualWrapper(cell)
    return cell
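# Hypothetical companion to the factory above (TF 1.x assumed, method name is
# illustrative): build one fresh cell per layer before stacking. Reusing a
# single cell object across layers would tie their weights, and newer 1.x
# releases reject it outright.
import tensorflow as tf

def _build_stacked_rnn_cell(self, num_units, num_layers):
    # each call to the factory creates an independent cell with its own variables
    return tf.nn.rnn_cell.MultiRNNCell(
        [self._create_single_rnn_cell(num_units) for _ in range(num_layers)])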
def __init__(self, num_layers, num_units, batch_size, input_size, keep_prob=1.0,
             is_train=None, scope="native_gru", activation=tf.nn.tanh):
    self.num_layers = num_layers
    self.grus = []
    self.inits = []
    self.dropout_mask = []
    self.scope = scope
    for layer in range(num_layers):
        input_size_ = input_size if layer == 0 else 2 * num_units
        gru_fw = GRUCell(num_units, activation=activation)
        gru_bw = GRUCell(num_units, activation=activation)
        init_fw = tf.tile(tf.Variable(tf.zeros([1, num_units])), [batch_size, 1])
        init_bw = tf.tile(tf.Variable(tf.zeros([1, num_units])), [batch_size, 1])
        mask_fw = Dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode='')
        mask_bw = Dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
                          keep_prob=keep_prob, is_train=is_train, mode='')
        self.grus.append((gru_fw, gru_bw))
        self.inits.append((init_fw, init_bw))
        self.dropout_mask.append((mask_fw, mask_bw))
def __call__(self, features, labels):
    super(Model_DIN_V2_Gru_Vec_attGru_Neg, self).__call__(features, labels)

    def dtype_getter(getter, name, dtype=None, *args, **kwargs):
        var = getter(name, dtype=self.model_dtype, *args, **kwargs)
        return var

    with tf.variable_scope("dien", custom_getter=dtype_getter, dtype=self.model_dtype):
        # RNN layer(-s)
        with tf.name_scope('rnn_1'):
            res_1 = GRUCell(self.HIDDEN_SIZE)
            # res_2 = CudnnGRU(self.HIDDEN_SIZE)
            rnn_outputs, _ = dynamic_rnn(res_1, inputs=self.item_his_eb,
                                         sequence_length=self.seq_len_ph,
                                         dtype=self.model_dtype, scope="gru1")
            tf.summary.histogram('GRU_outputs', rnn_outputs)

        aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                         self.noclk_item_his_eb[:, 1:, :], self.mask[:, 1:],
                                         stag="gru")
        self.aux_loss = aux_loss_1

        # Attention layer
        with tf.name_scope('Attention_layer_1'):
            att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, self.ATTENTION_SIZE,
                                                    self.mask, softmax_stag=1, stag='1_1',
                                                    mode='LIST', return_alphas=True, forCnn=True)
            tf.summary.histogram('alpha_outputs', alphas)

        with tf.name_scope('rnn_2'):
            rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(self.HIDDEN_SIZE),
                                                     inputs=rnn_outputs,
                                                     att_scores=tf.expand_dims(alphas, -1),
                                                     sequence_length=self.seq_len_ph,
                                                     dtype=self.model_dtype, scope="gru2")
            tf.summary.histogram('GRU2_Final_State', final_state2)

        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                         self.item_eb * self.item_his_eb_sum, final_state2], 1)
        self.build_fcn_net(inp, use_dice=True)
def add_conversational_context(self, sentence_summary_size):
    line_vectors_as_timesteps = tf.expand_dims(self.sentence_summaries, 0)
    # The RNN needs a per-batch sequence length; with a batch of one conversation,
    # that length is the number of sentence summaries, sliced out of the shape tensor.
    if self.config.bidirectional_conversations:
        forwardcell = DropoutWrapper(GRUCell(sentence_summary_size),
                                     self.dropout_placeholder, self.dropout_placeholder)
        backwardcell = DropoutWrapper(GRUCell(sentence_summary_size),
                                      self.dropout_placeholder, self.dropout_placeholder)
        outputs, sf, sb = bidirectional_dynamic_rnn(forwardcell, backwardcell,
                                                    line_vectors_as_timesteps,
                                                    tf.slice(tf.shape(line_vectors_as_timesteps), [1], [1]),
                                                    dtype=tf.float32, scope="ChapterRNN")
        self.conversation_state = tf.squeeze(outputs)
        return 2 * sentence_summary_size
    else:
        rnncell = DropoutWrapper(GRUCell(sentence_summary_size),
                                 self.dropout_placeholder, self.dropout_placeholder)
        outputs, state = tf.nn.dynamic_rnn(rnncell, line_vectors_as_timesteps,
                                           tf.slice(tf.shape(line_vectors_as_timesteps), [1], [1]),
                                           dtype=tf.float32, scope="ChapterRNN")
        self.conversation_state = tf.squeeze(outputs)
        return sentence_summary_size
def encoder_impl(self, encoder_input, is_training):
    dropout_rate = self._config.dropout_rate if is_training else 0.0

    # Mask
    encoder_mask = tf.to_int32(tf.not_equal(encoder_input, 0))
    sequence_lengths = tf.reduce_sum(encoder_mask, axis=1)

    # Embedding
    encoder_output = embedding(encoder_input,
                               vocab_size=self._config.src_vocab_size,
                               dense_size=self._config.hidden_units,
                               kernel=self._src_embedding,
                               multiplier=self._config.hidden_units ** 0.5 if self._config.scale_embedding else 1.0,
                               name="src_embedding")

    # Dropout
    encoder_output = tf.layers.dropout(encoder_output, rate=dropout_rate, training=is_training)

    cell_fw = GRUCell(num_units=self._config.hidden_units, name='fw_cell')
    cell_bw = GRUCell(num_units=self._config.hidden_units, name='bw_cell')

    # RNN
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw,
                                                         inputs=encoder_output,
                                                         sequence_length=sequence_lengths,
                                                         dtype=tf.float32)
    encoder_output = tf.concat(encoder_outputs, axis=2)

    # Dropout
    encoder_output = tf.layers.dropout(encoder_output, rate=dropout_rate, training=is_training)

    # Mask out padded positions
    encoder_output *= tf.expand_dims(tf.to_float(encoder_mask), axis=-1)

    return encoder_output
def build_model(self):
    with tf.variable_scope("inferring_module"):
        rdim = 768
        update_num = self.update_num
        batch_size = tf.shape(self.sent1)[0]
        dim = self.sent1.get_shape().as_list()[-1]

        # gru_layer = BiGRU(num_layers=1, num_units=rdim, batch_size=batch_size,
        #                   input_size=dim, keep_prob=0.9, is_train=self.is_training,
        #                   activation=tf.nn.tanh)
        # sent1_len = tf.cast(tf.reduce_sum(self.sent1_mask, axis=1), tf.int32)
        # sent2_len = tf.cast(tf.reduce_sum(self.sent2_mask, axis=1), tf.int32)
        # self.sent1 = gru_layer(self.sent1, sent1_len)
        # self.sent2 = gru_layer(self.sent2, sent2_len)

        sr_cell = GRUCell(num_units=2 * rdim, activation=tf.nn.relu)
        r_cell = sr_cell
        tri_cell = DoubleJointCell(num_units=2 * rdim,
                                   r_cell=r_cell,
                                   sent1=self.sent1, sent2=self.sent2,
                                   dim=dim, update_num=update_num,
                                   use_bias=False, activation=tf.tanh,
                                   dropout_rate=self.dropout_rate,
                                   sent1_mask=self.sent1_mask, sent2_mask=self.sent2_mask,
                                   initializer=None, dtype=tf.float32)
        fake_input = tf.tile(tf.expand_dims(self.mark0, axis=1), [1, update_num, 1])
        self.init_state = tri_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        self.double_output, last_state = dynamic_rnn(cell=tri_cell,
                                                     inputs=fake_input,
                                                     initial_state=self.init_state)
        refer_output = tf.reduce_mean(self.double_output, axis=1)  # (B, dim)

        # temp = tf.concat([refer_output, self.mark0], axis=1)
        # temp = dropout(temp, self.dropout_rate)

        refer_output = tf.layers.dense(refer_output, 768, activation=tf.nn.tanh,
                                       kernel_initializer=create_initializer(0.02))

        # return refer_output * (1 - gate) + gate * self.mark0
        return refer_output + self.mark0
def __init__(self, num_layers, num_units, cell_type='lstm', scope='stack_bi_rnn'):
    if type(num_units) == list:
        assert len(num_units) == num_layers, \
            "if num_units is a list, then its size should equal num_layers"
        self.cells_fw = [LSTMCell(num_units[i]) for i in range(num_layers)] if cell_type == 'lstm' else \
                        [GRUCell(num_units[i]) for i in range(num_layers)]
        self.cells_bw = [LSTMCell(num_units[i]) for i in range(num_layers)] if cell_type == 'lstm' else \
                        [GRUCell(num_units[i]) for i in range(num_layers)]
    else:
        self.cells_fw = [LSTMCell(num_units) for _ in range(num_layers)] if cell_type == 'lstm' else \
                        [GRUCell(num_units) for _ in range(num_layers)]
        self.cells_bw = [LSTMCell(num_units) for _ in range(num_layers)] if cell_type == 'lstm' else \
                        [GRUCell(num_units) for _ in range(num_layers)]
    self.num_layers = num_layers
    self.scope = scope
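# A hedged usage sketch for the per-layer cell lists above: TF 1.x provides
# tf.contrib.rnn.stack_bidirectional_dynamic_rnn, which consumes exactly such
# forward/backward lists. The method name and the `word_embeddings` /
# `seq_lengths` inputs are assumptions, not part of the original class.
import tensorflow as tf

def stacked_birnn(self, word_embeddings, seq_lengths):
    outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        cells_fw=self.cells_fw, cells_bw=self.cells_bw, inputs=word_embeddings,
        sequence_length=seq_lengths, dtype=tf.float32, scope=self.scope)
    return outputs  # [batch, time, 2 * num_units]: last layer, both directions concatenated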
def build_graph(self):
    # RNN layer(-s)
    with tf.name_scope('rnn_1'):
        rnn_outputs, _ = dynamic_rnn(GRUCell(self.HIDDEN_SIZE), inputs=self.item_his_eb,
                                     max_iteration=self.options.max_rnn_while_loops,
                                     sequence_length=self.seq_len_ph,
                                     dtype=tf.float32, scope="gru1")
        tf.summary.histogram('GRU_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer_1'):
        att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, self.ATTENTION_SIZE,
                                                self.mask, softmax_stag=1, stag='1_1',
                                                mode='LIST', return_alphas=True)
        tf.summary.histogram('alpha_outputs', alphas)

    with tf.name_scope('rnn_2'):
        rnn_outputs2, final_state2 = dynamic_rnn(GRUCell(self.HIDDEN_SIZE), inputs=att_outputs,
                                                 max_iteration=self.options.max_rnn_while_loops,
                                                 sequence_length=self.seq_len_ph,
                                                 dtype=tf.float32, scope="gru2")
        tf.summary.histogram('GRU2_Final_State', final_state2)

    inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                     self.item_eb * self.item_his_eb_sum, final_state2], 1)
    # Fully connected layer
    self.build_fcn_net(inp, use_dice=True)
def __init__(self, config):
    # NB: pre-1.0 TensorFlow APIs throughout (tf.split(dim, n), tf.nn.rnn, *_summary).
    self.config = config
    self.input = tf.placeholder('int32', [self.config.batch_size, config.max_seq_len], name='input')
    self.labels = tf.placeholder('int64', [self.config.batch_size], name='labels')
    self.labels_one_hot = tf.one_hot(indices=self.labels,
                                     depth=config.output_dim,
                                     on_value=1.0, off_value=0.0, axis=-1)

    self.gru = GRUCell(config.hidden_state_dim)

    embeddings_we = tf.get_variable('word_embeddings',
                                    initializer=tf.random_uniform([config.vocab_size, config.embedding_dim],
                                                                  -1.0, 1.0))
    self.emb = embed_input = tf.nn.embedding_lookup(embeddings_we, self.input)
    inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1, config.max_seq_len, embed_input)]

    outputs, last_slu_state = tf.nn.rnn(cell=self.gru, inputs=inputs, dtype=tf.float32)

    w_project = tf.get_variable('project2labels',
                                initializer=tf.random_uniform([config.hidden_state_dim, config.output_dim],
                                                              -1.0, 1.0))
    self.logits = logits_bo = tf.matmul(last_slu_state, w_project)
    tf.histogram_summary('logits', logits_bo)
    self.probabilities = tf.nn.softmax(logits_bo)
    self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits_bo, self.labels_one_hot))
    self.predict = tf.nn.softmax(logits_bo)

    # TensorBoard
    self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(self.predict, 1), self.labels), 'float32'),
                                   name='accuracy')
    tf.scalar_summary('CCE loss', self.loss)
    tf.scalar_summary('Accuracy', self.accuracy)
    self.tb_info = tf.merge_all_summaries()
def createGraph(self):
    self.input = tf.placeholder(tf.int32, [self.batch_size, self.seq_len], name='inputs')
    self.targs = tf.placeholder(tf.int32, [self.batch_size, self.seq_len], name='targets')

    onehot = tf.one_hot(self.input, self.vocab_size, name='input_oh')
    inputs = tf.split(onehot, self.seq_len, 1)
    inputs = [tf.squeeze(i, [1]) for i in inputs]
    targets = tf.split(self.targs, self.seq_len, 1)

    with tf.variable_scope("posRNN"):
        cells = [GRUCell(self.num_hidden) for _ in range(self.num_layers)]
        stacked = MultiRNNCell(cells, state_is_tuple=True)
        self.zero_state = stacked.zero_state(self.batch_size, tf.float32)
        outputs, self.last_state = seq2seq.rnn_decoder(inputs, self.zero_state, stacked)

        w = tf.get_variable("w", [self.num_hidden, self.vocab_size], tf.float32,
                            initializer=tf.random_normal_initializer(stddev=0.02))
        b = tf.get_variable("b", [self.vocab_size], initializer=tf.constant_initializer(0.0))
        logits = [tf.matmul(o, w) + b for o in outputs]

        const_weights = [tf.ones([self.batch_size]) for _ in xrange(self.seq_len)]
        self.loss = seq2seq.sequence_loss(logits, targets, const_weights)
        self.opt = tf.train.AdamOptimizer(0.001, beta1=0.5).minimize(self.loss)

    with tf.variable_scope("posRNN", reuse=True):
        batch_size = 1
        self.s_inputs = tf.placeholder(tf.int32, [batch_size], name='s_inputs')
        s_onehot = tf.one_hot(self.s_inputs, self.vocab_size, name='s_input_oh')

        self.s_zero_state = stacked.zero_state(batch_size, tf.float32)
        s_outputs, self.s_last_state = seq2seq.rnn_decoder([s_onehot], self.s_zero_state, stacked)
        s_outputs = tf.reshape(s_outputs, [1, self.num_hidden])
        self.s_probs = tf.nn.softmax(tf.matmul(s_outputs, w) + b)
def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
             use_negsampling=False, use_others=False):
    super(Model_DIN_V2_Gru_Vec_attGru, self).__init__(n_uid, n_mid, n_cat, EMBEDDING_DIM,
                                                      HIDDEN_SIZE, ATTENTION_SIZE,
                                                      use_negsampling, use_others)
    with self.graph.as_default():
        # RNN layer(-s)
        with tf.name_scope('rnn_1'):
            rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                         sequence_length=self.seq_len_ph, dtype=tf.float32,
                                         scope="gru1")
            tf.summary.histogram('GRU_outputs', rnn_outputs)

        # Attention layer
        with tf.name_scope('Attention_layer_1'):
            att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE,
                                                    self.mask, softmax_stag=1, stag='1_1',
                                                    mode='LIST', return_alphas=True)
            tf.summary.histogram('alpha_outputs', alphas)

        with tf.name_scope('rnn_2'):
            rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE),
                                                     inputs=rnn_outputs,
                                                     att_scores=tf.expand_dims(alphas, -1),
                                                     sequence_length=self.seq_len_ph,
                                                     dtype=tf.float32, scope="gru2")
            tf.summary.histogram('GRU2_Final_State', final_state2)

        # inp = tf.concat([self.uid_batch_embedded, self.item_eb, final_state2, self.item_his_eb_sum], 1)
        inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum,
                         self.item_eb * self.item_his_eb_sum, final_state2], 1)
        if self.use_others:
            inp = tf.concat([inp] + list(self.other_inputs()), 1)
        self.build_fcn_net(inp, use_dice=True)
def __init__(self, hidden_num, cell=None, reverse=True, decode_without_input=False):
    if cell is None:
        self._enc_cell = GRUCell(hidden_num, name='encoder_cell')
        self._dec_cell = GRUCell(hidden_num, name='decoder_cell')
    else:
        self._enc_cell = cell
        self._dec_cell = cell
    self.reverse = reverse
    self.decode_without_input = decode_without_input
    self.hidden_num = hidden_num

    if FLAGS.datasource in ['sinusoid', 'mixture']:
        self.elem_num_init = 2
        self.elem_num = 20
    elif FLAGS.datasource in ['miniimagenet', 'omniglot', 'multidataset']:
        self.elem_num = FLAGS.num_classes + 64

    self.dec_weight = tf.Variable(tf.truncated_normal([self.hidden_num, self.elem_num], dtype=tf.float32),
                                  name='dec_weight')
    self.dec_bias = tf.Variable(tf.constant(0.1, shape=[self.elem_num], dtype=tf.float32),
                                name='dec_bias')
def prediction(self):
    # Recurrent network.
    network = GRUCell(self._num_hidden)
    network = DropoutWrapper(network, output_keep_prob=self.dropout)
    # NB: newer TF 1.x releases require a fresh cell per layer instead of
    # reusing one wrapped cell object in MultiRNNCell.
    network = MultiRNNCell([network] * self._num_layers)
    output, _ = tf.nn.dynamic_rnn(network, self.data, dtype=tf.float32)

    # Select last output.
    output = tf.transpose(output, [1, 0, 2])
    last = tf.gather(output, int(output.get_shape()[0]) - 1)

    # Softmax layer.
    weight, bias = self._weight_and_bias(self._num_hidden, int(self.target.get_shape()[1]))
    prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
    return prediction
def __init__(self, hidden_num, input_num, cell=None, reverse=True,
             decode_without_input=False, name=None):
    self.name = name
    if cell is None:
        self._enc_cell = GRUCell(hidden_num, name='encoder_cell_{}'.format(self.name))
        self._dec_cell = GRUCell(hidden_num, name='decoder_cell_{}'.format(self.name))
    else:
        self._enc_cell = cell
        self._dec_cell = cell
    self.reverse = reverse
    self.decode_without_input = decode_without_input
    self.hidden_num = hidden_num

    if FLAGS.datasource in ['2D']:
        self.elem_num_init = 2
        self.elem_num = FLAGS.sync_filters
    elif FLAGS.datasource in ['plainmulti', 'artmulti']:
        self.elem_num = input_num

    self.dec_weight = tf.Variable(tf.truncated_normal([self.hidden_num, self.elem_num], dtype=tf.float32),
                                  name='dec_weight_{}'.format(self.name))
    self.dec_bias = tf.Variable(tf.constant(0.1, shape=[self.elem_num], dtype=tf.float32),
                                name='dec_bias_{}'.format(self.name))
def __init__(self, feature_size, eb_dim, hidden_size, max_len_item, max_len_user,
             item_part_fnum, user_part_fnum, use_hist_u, use_hist_i, emb_initializer):
    super(GRU4Rec, self).__init__(feature_size, eb_dim, hidden_size, max_len_item, max_len_user,
                                  item_part_fnum, user_part_fnum, use_hist_u, use_hist_i,
                                  emb_initializer)

    # RNN layer
    with tf.name_scope('item_rnn'):
        _, item_part_final_state = tf.nn.dynamic_rnn(GRUCell(hidden_size), inputs=self.item_part_emb,
                                                     sequence_length=self.item_len_ph,
                                                     dtype=tf.float32, scope='gru1')
        item_part = item_part_final_state
    with tf.name_scope('user_rnn'):
        _, user_part_final_state = tf.nn.dynamic_rnn(GRUCell(hidden_size), inputs=self.user_part_emb,
                                                     sequence_length=self.user_len_ph,
                                                     dtype=tf.float32, scope='gru2')
        user_part = user_part_final_state

    if use_hist_i and use_hist_u:
        inp = tf.concat([item_part, user_part], axis=1)
    elif use_hist_i and not use_hist_u:
        inp = item_part
    elif not use_hist_i and use_hist_u:
        inp = user_part
    else:
        # guard against `inp` being unbound when neither history is used
        raise ValueError('at least one of use_hist_i and use_hist_u must be set')

    # fully connected layer
    self.build_fc_net(inp)
    self.build_loss()
def main(c):
    '''
    params:
        c: config dictionary
    '''
    # Data ---------------------------------------------------------------------------------------------------
    data_portion = None  # 2 * batch_size
    train_set = Dstc2('data/dstc2/data.dstc2.train.json', sample_unk=0.01, first_n=data_portion)
    valid_set = Dstc2('data/dstc2/data.dstc2.dev.json', first_n=data_portion, sample_unk=0,
                      max_dial_len=train_set.max_dial_len, words_vocab=train_set.words_vocab,
                      labels_vocab=train_set.labels_vocab,
                      labels_vocab_separate=train_set.labels_vocab_separate)
    test_set = Dstc2('data/dstc2/data.dstc2.test.json', first_n=data_portion, sample_unk=0,
                     max_dial_len=train_set.max_dial_len, words_vocab=train_set.words_vocab,
                     labels_vocab=train_set.labels_vocab,
                     labels_vocab_separate=train_set.labels_vocab_separate)
    stats(train_set, valid_set, test_set)

    vocab_size = len(train_set.words_vocab)
    output_dim = max(np.unique(train_set.labels)) + 1
    n_train_batches = len(train_set.dialogs) // c.batch_size

    # output dimensions for each separate label
    output_dims = []
    for i in range(3):
        o_d = max(np.unique(train_set.labels_separate[:, :, i])) + 1
        output_dims.append(o_d)

    # Model -----------------------------------------------------------------------------------------------------
    logging.info('Creating model')
    input_bt = tf.placeholder('int32', [c.batch_size, train_set.max_turn_len], name='input')
    turn_lens_b = tf.placeholder('int32', [c.batch_size], name='turn_lens')
    mask_b = tf.placeholder('int32', [c.batch_size], name='dial_mask')

    # labels_b = tf.placeholder('int64', [c.batch_size], name='labels')
    # onehot_labels_bo = tf.one_hot(indices=labels_b, depth=output_dim,
    #                               on_value=1.0, off_value=0.0, axis=-1)

    # separate labels and their one-hot encodings
    labels0_b, onehot_labels0_bo0 = get_labels_with_onehot(c.batch_size, output_dims[0], 'labels0')
    labels1_b, onehot_labels1_bo1 = get_labels_with_onehot(c.batch_size, output_dims[1], 'labels1')
    labels2_b, onehot_labels2_bo2 = get_labels_with_onehot(c.batch_size, output_dims[2], 'labels2')

    is_first_turn = tf.placeholder(tf.bool)
    gru = GRUCell(c.hidden_state_dim)

    embeddings_we = tf.get_variable('word_embeddings',
                                    initializer=tf.random_uniform([vocab_size, c.embedding_dim], -1.0, 1.0))
    embedded_input_bte = tf.nn.embedding_lookup(embeddings_we, input_bt)
    dialog_state_before_turn = tf.get_variable('dialog_state_before_turn',
                                               initializer=tf.zeros([c.batch_size, c.hidden_state_dim],
                                                                    dtype='float32'),
                                               trainable=False)

    before_state_bh = cond(is_first_turn,
                           lambda: gru.zero_state(c.batch_size, dtype='float32'),
                           lambda: dialog_state_before_turn)
    inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1, train_set.max_turn_len, embedded_input_bte)]
    outputs, state_bh = tf.nn.rnn(cell=gru, inputs=inputs, initial_state=before_state_bh,
                                  sequence_length=turn_lens_b, dtype=tf.float32)
    dialog_state_before_turn.assign(state_bh)

    # projection_ho = tf.get_variable('project2labels',
    #                                 initializer=tf.random_uniform([c.hidden_state_dim, output_dim], -1.0, 1.0))
    # logits_bo = tf.matmul(state_bh, projection_ho)
    # tf.histogram_summary('logits', logits_bo)
    # probabilities_bo = tf.nn.softmax(logits_bo)
    # tf.histogram_summary('probabilities', probabilities_bo)

    # logits, probabilities and predictions from the hidden state
    logits_bo0, probabilities_bo0, predict_b0 = get_logits_and_probabilities(state_bh, c.hidden_state_dim,
                                                                             output_dims[0], 'labels0')
    logits_bo1, probabilities_bo1, predict_b1 = get_logits_and_probabilities(state_bh, c.hidden_state_dim,
                                                                             output_dims[1], 'labels1')
    logits_bo2, probabilities_bo2, predict_b2 = get_logits_and_probabilities(state_bh, c.hidden_state_dim,
                                                                             output_dims[2], 'labels2')

    float_mask_b = tf.cast(mask_b, 'float32')

    # loss = tf.reduce_sum(tf.mul(float_mask_b, x_entropy(logits_bo, onehot_labels_bo))) / tf.reduce_sum(float_mask_b)
    # tf.scalar_summary('CCE loss', loss)

    # losses
    loss_0 = tf.reduce_sum(tf.mul(float_mask_b, x_entropy(logits_bo0, onehot_labels0_bo0))) / tf.reduce_sum(float_mask_b)
    loss_1 = tf.reduce_sum(tf.mul(float_mask_b, x_entropy(logits_bo1, onehot_labels1_bo1))) / tf.reduce_sum(float_mask_b)
    loss_2 = tf.reduce_sum(tf.mul(float_mask_b, x_entropy(logits_bo2, onehot_labels2_bo2))) / tf.reduce_sum(float_mask_b)
    loss = loss_0 + loss_1 + loss_2
    tf.scalar_summary('CCE loss', loss)

    # predict_b = tf.argmax(logits_bo, 1)
    # correct = tf.cast(tf.equal(predict_b, labels_b), 'float32')
    # accuracy = tf.reduce_sum(tf.mul(correct, float_mask_b)) / tf.reduce_sum(float_mask_b)
    # tf.scalar_summary('Accuracy', accuracy)

    # correctness per label
    correct_0 = tf.cast(tf.equal(predict_b0, labels0_b), 'float32')
    correct_1 = tf.cast(tf.equal(predict_b1, labels1_b), 'float32')
    correct_2 = tf.cast(tf.equal(predict_b2, labels2_b), 'float32')
    correct_all = tf.mul(tf.mul(correct_0, correct_1), correct_2)

    # accuracies
    accuracy_0 = get_accuracy(correct_0, float_mask_b)
    accuracy_1 = get_accuracy(correct_1, float_mask_b)
    accuracy_2 = get_accuracy(correct_2, float_mask_b)
    accuracy_all = get_accuracy(correct_all, float_mask_b)

    tf.scalar_summary('Accuracy all', accuracy_all)
    tf.scalar_summary('Accuracy label 0', accuracy_0)
    tf.scalar_summary('Accuracy label 1', accuracy_1)
    tf.scalar_summary('Accuracy label 2', accuracy_2)
    tb_info = tf.merge_all_summaries()

    # Optimizer -----------------------------------------------------------------------------------------------------
    logging.info('Creating optimizer')
    optimizer = tf.train.AdamOptimizer(c.learning_rate)
    logging.info('Creating train_op')
    train_op = optimizer.minimize(loss)

    # Session -----------------------------------------------------------------------------------------------------
    logging.info('Creating session')
    sess = tf.Session()
    logging.info('Initing variables')
    init = tf.initialize_all_variables()
    logging.info('Running session')
    sess.run(init)

    # TB ---------------------------------------------------------------------------------------------------------
    logging.info('See stats via tensorboard: $ tensorboard --logdir %s', c.log_dir)
    train_writer = tf.train.SummaryWriter(c.log_dir, sess.graph)

    # Train ---------------------------------------------------------------------------------------------------------
    train_summary = None
    step, stopper = 0, EarlyStopper(c.nbest_models, c.not_change_limit, c.name)
    try:
        for e in range(c.epochs):
            logging.info('------------------------------')
            logging.info('Epoch %d', e)

            total_loss = 0
            total_acc = 0
            batch_count = 0
            for bid, (dialogs_bTt, lengths_bT, labels0_bT, labels1_bT, labels2_bT, masks_bT) \
                    in enumerate(next_batch(train_set, c.batch_size)):
                turn_loss = 0
                turn_acc = 0
                n_turns = 0
                first_run = True
                for (turn_bt, label0_b, label1_b, label2_b, lengths_b, masks_b) in \
                        zip(dialogs_bTt.transpose([1, 0, 2]),
                            labels0_bT.transpose([1, 0]),
                            labels1_bT.transpose([1, 0]),
                            labels2_bT.transpose([1, 0]),
                            lengths_bT.transpose([1, 0]),
                            masks_bT.transpose([1, 0])):
                    if sum(masks_b) == 0:
                        break
                    _, batch_loss, batch_accuracy, train_summary = sess.run(
                        [train_op, loss, accuracy_all, tb_info],
                        feed_dict={input_bt: turn_bt,
                                   turn_lens_b: lengths_b,
                                   mask_b: masks_b,
                                   labels0_b: label0_b,
                                   labels1_b: label1_b,
                                   labels2_b: label2_b,
                                   is_first_turn: first_run})
                    first_run = False
                    turn_loss += batch_loss
                    turn_acc += batch_accuracy
                    n_turns += 1
                    step += 1
                total_loss += turn_loss / n_turns
                total_acc += turn_acc / n_turns
                batch_count += 1
                logging.info('Batch %d/%d\r', bid, n_train_batches)

            train_writer.add_summary(train_summary, e)
            logging.info('Train cost %f', total_loss / batch_count)
            logging.info('Train accuracy: %f', total_acc / batch_count)

            def monitor_stream(work_set, name):
                total_loss = 0
                total_acc = 0
                n_valid_batches = 0
                for bid, (dialogs_bTt, lengths_bT, labels0_bT, labels1_bT, labels2_bT, masks_bT) \
                        in enumerate(next_batch(work_set, c.batch_size)):
                    turn_loss = 0
                    turn_acc = 0
                    n_turns = 0
                    first_run = True
                    for (turn_bt, label0_b, label1_b, label2_b, lengths_b, masks_b) in \
                            zip(dialogs_bTt.transpose([1, 0, 2]),
                                labels0_bT.transpose([1, 0]),
                                labels1_bT.transpose([1, 0]),
                                labels2_bT.transpose([1, 0]),
                                lengths_bT.transpose([1, 0]),
                                masks_bT.transpose([1, 0])):
                        if sum(masks_b) == 0:
                            break
                        input = np.pad(turn_bt, ((0, 0), (0, train_set.max_turn_len - turn_bt.shape[1])),
                                       'constant', constant_values=0) \
                            if train_set.max_turn_len > turn_bt.shape[1] else turn_bt
                        batch_loss, batch_acc, valid_summary = sess.run(
                            [loss, accuracy_all, tb_info],
                            feed_dict={input_bt: input,
                                       turn_lens_b: lengths_b,
                                       labels0_b: label0_b,
                                       labels1_b: label1_b,
                                       labels2_b: label2_b,
                                       mask_b: masks_b,
                                       is_first_turn: first_run})
                        turn_loss += batch_loss
                        turn_acc += batch_acc
                        first_run = False
                        n_turns += 1
                    total_loss += turn_loss / n_turns
                    total_acc += turn_acc / n_turns
                    n_valid_batches += 1
                logging.info('%s cost: %f', name, total_loss / n_valid_batches)
                logging.info('%s accuracy: %f', name, total_acc / n_valid_batches)
                return total_loss / n_valid_batches

            stopper_reward = monitor_stream(valid_set, 'Valid')
            monitor_stream(test_set, 'Test')
            if not stopper.save_and_check(stopper_reward, step, sess):
                raise RuntimeError('Training not improving on dev set')
    finally:
        logging.info('Training stopped after %7d steps and %7.2f epochs. See logs for %s',
                     step, step / len(train_set), c.log_name)
        logging.info('Saving current state. Please wait!\nBest model has reward %7.2f from step %7d is %s'
                     % stopper.highest_reward())
        stopper.saver.save(sess=sess,
                           save_path='%s-FINAL-%.4f-step-%07d' % (stopper.saver_prefix, stopper_reward, step))
def main():
    # Config -----------------------------------------------------------------------------------------------------
    learning_rate = 0.005
    batch_size = 16
    epochs = 50
    hidden_state_dim = 200
    embedding_dim = 300
    log_dir = 'log'

    # Data ---------------------------------------------------------------------------------------------------
    data_portion = 2 * batch_size
    train_set = Dstc2('../data/dstc2/data.dstc2.train.json', sample_unk=0.01, first_n=data_portion)
    valid_set = Dstc2('../data/dstc2/data.dstc2.dev.json', first_n=data_portion, sample_unk=0,
                      max_dial_len=train_set.max_dial_len, words_vocab=train_set.words_vocab,
                      labels_vocab=train_set.labels_vocab)
    test_set = Dstc2('../data/dstc2/data.dstc2.test.json', first_n=data_portion, sample_unk=0,
                     max_dial_len=train_set.max_dial_len, words_vocab=train_set.words_vocab,
                     labels_vocab=train_set.labels_vocab)

    vocab_size = len(train_set.words_vocab)
    output_dim = max(np.unique(train_set.labels)) + 1
    n_train_batches = len(train_set.dialogs) // batch_size

    # Model -----------------------------------------------------------------------------------------------------
    logging.info('Creating model')
    input_bt = tf.placeholder('int32', [batch_size, train_set.max_turn_len], name='input')
    turn_lens_b = tf.placeholder('int32', [batch_size], name='turn_lens')
    mask_b = tf.placeholder('int32', [batch_size], name='dial_mask')
    # mask_bT = lengths2mask2d(dial_lens_b, train_set.max_dial_len)
    labels_b = tf.placeholder('int64', [batch_size], name='labels')
    onehot_labels_bo = tf.one_hot(indices=labels_b, depth=output_dim,
                                  on_value=1.0, off_value=0.0, axis=-1)
    is_first_turn = tf.placeholder(tf.bool)

    gru = GRUCell(hidden_state_dim)

    mlp_hidden_layer_dim = 50
    mlp_input2hidden_W = tf.get_variable('in2hid',
                                         initializer=tf.random_normal([hidden_state_dim, mlp_hidden_layer_dim]))
    mlp_input2hidden_B = tf.Variable(tf.random_normal([mlp_hidden_layer_dim]))
    mlp_hidden2output_W = tf.get_variable('hid2out',
                                          initializer=tf.random_normal([mlp_hidden_layer_dim, output_dim]))
    mlp_hidden2output_B = tf.Variable(tf.random_normal([output_dim]))

    embeddings_we = tf.get_variable('word_embeddings',
                                    initializer=tf.random_uniform([vocab_size, embedding_dim], -1.0, 1.0))
    embedded_input_bte = tf.nn.embedding_lookup(embeddings_we, input_bt)
    dialog_state_before_turn = tf.get_variable('dialog_state_before_turn',
                                               initializer=tf.zeros([batch_size, hidden_state_dim],
                                                                    dtype='float32'),
                                               trainable=False)

    before_state_bh = cond(is_first_turn,
                           lambda: gru.zero_state(batch_size, dtype='float32'),
                           lambda: dialog_state_before_turn)
    inputs = [tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(1, train_set.max_turn_len, embedded_input_bte)]
    outputs, state_bh = tf.nn.rnn(cell=gru, inputs=inputs, initial_state=before_state_bh,
                                  sequence_length=turn_lens_b, dtype=tf.float32)
    # state_tbh = scan(fn=lambda last_state_bh, curr_input_bte: gru(curr_input_bte, last_state_bh)[1],
    #                  elems=tf.transpose(embedded_input_bte, perm=[1, 0, 2]),
    #                  initializer=before_state_bh)
    # state_bh = state_tbh[state_tbh.get_shape()[0] - 1, :, :]
    dialog_state_before_turn.assign(state_bh)

    projection_ho = tf.get_variable('project2labels',
                                    initializer=tf.random_uniform([hidden_state_dim, output_dim], -1.0, 1.0))
    logits_bo = tf.matmul(state_bh, projection_ho)
    # hidden = tf.add(tf.matmul(state_bh, mlp_input2hidden_W), mlp_input2hidden_B)
    # logits_bo = tf.add(tf.matmul(hidden, mlp_hidden2output_W), mlp_hidden2output_B)
    tf.histogram_summary('logits', logits_bo)
    probabilities_bo = tf.nn.softmax(logits_bo)
    tf.histogram_summary('probabilities', probabilities_bo)

    float_mask_b = tf.cast(mask_b, 'float32')
    # loss = tf.matmul(tf.expand_dims(tf.cast(mask_b, 'float32'), 0),
    #                  tf.nn.softmax_cross_entropy_with_logits(logits_bo, onehot_labels_bo)) / tf.reduce_sum(mask_b)
    loss = tf.reduce_sum(tf.mul(float_mask_b,
                                tf.nn.softmax_cross_entropy_with_logits(logits_bo, onehot_labels_bo))) \
        / tf.reduce_sum(float_mask_b)
    tf.scalar_summary('CCE loss', loss)

    predict_b = tf.argmax(logits_bo, 1)
    correct = tf.cast(tf.equal(predict_b, labels_b), 'float32')
    accuracy = tf.reduce_sum(tf.mul(correct, float_mask_b)) / tf.reduce_sum(float_mask_b)
    tf.scalar_summary('Accuracy', accuracy)
    tb_info = tf.merge_all_summaries()

    # Optimizer -----------------------------------------------------------------------------------------------------
    logging.info('Creating optimizer')
    optimizer = tf.train.AdamOptimizer(learning_rate)
    logging.info('Creating train_op')
    train_op = optimizer.minimize(loss)

    # Session -----------------------------------------------------------------------------------------------------
    logging.info('Creating session')
    sess = tf.Session()
    logging.info('Initing variables')
    init = tf.initialize_all_variables()
    logging.info('Running session')
    sess.run(init)

    # TB ---------------------------------------------------------------------------------------------------------
    logging.info('See stats via tensorboard: $ tensorboard --logdir %s', log_dir)
    train_writer = tf.train.SummaryWriter(log_dir, sess.graph)

    # Train ---------------------------------------------------------------------------------------------------------
    train_summary = None
    for e in range(epochs):
        logging.info('------------------------------')
        logging.info('Epoch %d', e)

        total_loss = 0
        total_acc = 0
        batch_count = 0
        for bid, (dialogs_bTt, lengths_bT, labels_bT, masks_bT) in enumerate(next_batch(train_set, batch_size)):
            turn_loss = 0
            turn_acc = 0
            n_turns = 0
            first_run = True
            for (turn_bt, label_b, lengths_b, masks_b) in zip(dialogs_bTt.transpose([1, 0, 2]),
                                                              labels_bT.transpose([1, 0]),
                                                              lengths_bT.transpose([1, 0]),
                                                              masks_bT.transpose([1, 0])):
                if sum(masks_b) == 0:
                    break
                _, batch_loss, batch_accuracy, train_summary = sess.run(
                    [train_op, loss, accuracy, tb_info],
                    feed_dict={input_bt: turn_bt,
                               turn_lens_b: lengths_b,
                               mask_b: masks_b,
                               labels_b: label_b,
                               is_first_turn: first_run})
                first_run = False
                turn_loss += batch_loss
                turn_acc += batch_accuracy
                n_turns += 1
            total_loss += turn_loss / n_turns
            total_acc += turn_acc / n_turns
            batch_count += 1
            logging.info('Batch %d/%d\r', bid, n_train_batches)

        train_writer.add_summary(train_summary, e)
        logging.info('Average train cost %f', total_loss / batch_count)
        logging.info('Average train accuracy: %f', total_acc / batch_count)

    def monitor_stream(work_set, name):
        total_loss = 0
        total_acc = 0
        n_valid_batches = 0
        for bid, (dialogs_bTt, lengths_bT, labels_bT, masks_bT) in enumerate(next_batch(work_set, batch_size)):
            turn_loss = 0
            turn_acc = 0
            n_turns = 0
            first_run = True
            for (turn_bt, label_b, lengths_b, masks_b) in zip(dialogs_bTt.transpose([1, 0, 2]),
                                                              labels_bT.transpose([1, 0]),
                                                              lengths_bT.transpose([1, 0]),
                                                              masks_bT.transpose([1, 0])):
                if sum(masks_b) == 0:
                    break
                input = np.pad(turn_bt, ((0, 0), (0, train_set.max_turn_len - turn_bt.shape[1])),
                               'constant', constant_values=0) \
                    if train_set.max_turn_len > turn_bt.shape[1] else turn_bt
                predictions, batch_loss, batch_acc, valid_summary = sess.run(
                    [predict_b, loss, accuracy, tb_info],
                    feed_dict={input_bt: input,
                               turn_lens_b: lengths_b,
                               labels_b: label_b,
                               mask_b: masks_b,
                               is_first_turn: first_run})
                turn_loss += batch_loss
                turn_acc += batch_acc
                first_run = False
                n_turns += 1
            total_loss += turn_loss / n_turns
            total_acc += turn_acc / n_turns
            n_valid_batches += 1
        logging.info('%s cost: %f', name, total_loss / n_valid_batches)
        logging.info('%s accuracy: %f', name, total_acc / n_valid_batches)

    monitor_stream(valid_set, 'Valid')
    monitor_stream(test_set, 'Test')