def rnn(self):
    """RNN model."""
    # Word embedding lookup
    with tf.device('/cpu:0'):
        embedding = tf.get_variable('embedding', initializer=self.word_embedding)
        embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
        embedding_inputs = tf.cast(embedding_inputs, tf.float32)

    with tf.name_scope("rnn"):
        # Multi-layer RNN network
        # cells = [dropout() for _ in range(self.config.num_layers)]
        # rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        # _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
        # last = _outputs[:, -1, :]  # take the output of the last time step as the result

        # (Bi-)RNN layer(-s)
        rnn_outputs, _ = bi_rnn(GRUCell(self.config.hidden_dim),
                                GRUCell(self.config.hidden_dim),
                                inputs=embedding_inputs, dtype=tf.float32)
        # tf.summary.histogram('RNN_outputs', rnn_outputs)

    # Attention layer
    with tf.name_scope('Attention_layer'):
        attention_output, alphas = attention(rnn_outputs, self.config.attention_size,
                                             return_alphas=True)
        tf.summary.histogram('alphas', alphas)

    # Dropout
    drop = tf.nn.dropout(attention_output, self.keep_prob)

    with tf.name_scope("score"):
        # Fully connected layer, followed by dropout and ReLU activation
        W = tf.Variable(
            tf.truncated_normal(
                [self.config.hidden_dim * 2, self.config.num_classes],
                stddev=0.1))  # Hidden size is multiplied by 2 for Bi-RNN
        b = tf.Variable(tf.constant(0., shape=[self.config.num_classes]))
        self.logits = tf.nn.xw_plus_b(drop, W, b)
        # fc = tf.contrib.layers.dropout(fc, self.keep_prob)
        # fc = tf.nn.relu(fc)
        # Classifier
        # self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
        self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

    with tf.name_scope("optimize"):
        # Loss function: cross-entropy
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.input_y)
        self.loss = tf.reduce_mean(cross_entropy)
        # Optimizer
        self.optim = tf.train.AdamOptimizer(
            learning_rate=self.config.learning_rate).minimize(self.loss)

    with tf.name_scope("accuracy"):
        # Accuracy
        correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
        self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def cal_loss_logit(self, batch_embedded, keep_prob, W, W_fc, b_fc, batch_y,
                   reuse=True, scope="loss"):
    with tf.variable_scope(scope, reuse=reuse) as scope:
        rnn_outputs, _ = bi_rnn(BasicLSTMCell(self.config.hidden_dim),
                                BasicLSTMCell(self.config.hidden_dim),
                                inputs=batch_embedded, dtype=tf.float32)

        # Attention
        ATTENTION_SIZE = 50
        attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE,
                                             return_alphas=True)
        drop = tf.nn.dropout(attention_output, keep_prob)

        # Fully connected layer
        y_hat = tf.nn.xw_plus_b(drop, W_fc, b_fc)
        y_hat = tf.squeeze(y_hat)

    return y_hat, tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
def cal_loss_logit(embedded, keep_prob, reuse=True, scope="loss"):
    # Note: `self`, `W`, `W_fc` and `b_fc` are not parameters here; this helper is
    # evidently meant to be defined where those names are already in scope.
    with tf.variable_scope(scope, reuse=reuse) as scope:
        rnn_outputs, _ = bi_rnn(BasicLSTMCell(self.hidden_size),
                                BasicLSTMCell(self.hidden_size),
                                inputs=embedded, dtype=tf.float32)

        # Attention
        H = tf.add(rnn_outputs[0], rnn_outputs[1])  # fw + bw
        M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)
        # alpha (bs * sl, 1)
        alpha = tf.nn.softmax(
            tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
                      tf.reshape(W, [-1, 1])))
        r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                      tf.reshape(alpha, [-1, self.max_len, 1]))  # supposed to be (batch_size * HIDDEN_SIZE, 1)
        r = tf.squeeze(r)
        h_star = tf.tanh(r)
        drop = tf.nn.dropout(h_star, keep_prob)

        # Fully connected layer (dense layer)
        y_hat = tf.nn.xw_plus_b(drop, W_fc, b_fc)

    return y_hat, tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat,
                                                       labels=self.label))
def bi_gru_att(self):
    sen_inputs_glove = self.embedding_layer()
    fw_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(self.lstm_units),
                                            output_keep_prob=0.75)
    bw_cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(self.lstm_units),
                                            output_keep_prob=0.75)
    rnn_outputs, _ = bi_rnn(fw_cell, bw_cell, inputs=sen_inputs_glove,
                            dtype=tf.float32)
    fw_outputs, bw_outputs = rnn_outputs
    value = fw_outputs + bw_outputs
    self.comput_att(value)
    r = tf.matmul(tf.transpose(value, [0, 2, 1]),
                  tf.reshape(self.alpha, [-1, self.max_len, 1]))
    r = tf.squeeze(r)
    h_star = tf.tanh(r)
    h_drop = tf.nn.dropout(h_star, self.dropout_keep_prob)
    return h_drop
def build_graph(self):
    print("building graph")
    # Word embedding
    embeddings_var = tf.Variable(tf.random_uniform(
        [self.vocab_size, self.embedding_size], -1.0, 1.0),
        trainable=True)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, self.x)

    rnn_outputs, _ = bi_rnn(BasicLSTMCell(self.hidden_size),
                            BasicLSTMCell(self.hidden_size),
                            inputs=batch_embedded, dtype=tf.float32)
    fw_outputs, bw_outputs = rnn_outputs

    W = tf.Variable(tf.random_normal([self.hidden_size], stddev=0.1))
    H = fw_outputs + bw_outputs  # (batch_size, seq_len, HIDDEN_SIZE)
    M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)

    self.alpha = tf.nn.softmax(
        tf.reshape(
            tf.matmul(tf.reshape(M, [-1, self.hidden_size]),
                      tf.reshape(W, [-1, 1])),
            (-1, self.max_len)))  # batch_size x seq_len
    r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                  tf.reshape(self.alpha, [-1, self.max_len, 1]))
    r = tf.squeeze(r)
    h_star = tf.tanh(r)  # (batch, HIDDEN_SIZE)

    h_drop = tf.nn.dropout(h_star, self.keep_prob)

    # Fully connected layer (dense layer)
    FC_W = tf.Variable(
        tf.truncated_normal([self.hidden_size, self.n_class], stddev=0.1))
    FC_b = tf.Variable(tf.constant(0., shape=[self.n_class]))
    y_hat = tf.nn.xw_plus_b(h_drop, FC_W, FC_b)

    self.loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y_hat,
                                                       labels=self.label))

    # prediction
    self.prediction = tf.argmax(tf.nn.softmax(y_hat), 1)

    # optimization
    loss_to_minimize = self.loss
    tvars = tf.trainable_variables()
    gradients = tf.gradients(
        loss_to_minimize, tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
    grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)

    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.optimizer = tf.train.MomentumOptimizer(self.learning_rate, 0.9,
                                                use_nesterov=True)
    self.train_op = self.optimizer.apply_gradients(
        zip(grads, tvars), global_step=self.global_step, name='train_step')

    print("graph built successfully!")
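# A minimal NumPy sketch of the attention arithmetic used in build_graph above (and,
# under different variable names, in several later snippets): M = tanh(H),
# alpha = softmax(M @ w) over the time steps, r = H^T @ alpha, h* = tanh(r).
# Shapes and values here are illustrative only, not part of any snippet.
import numpy as np

batch_size, seq_len, hidden = 2, 4, 3
H = np.random.randn(batch_size, seq_len, hidden)   # stands in for fw_outputs + bw_outputs
w = np.random.randn(hidden)                        # the learned score vector W

M = np.tanh(H)                                     # (batch, seq_len, hidden)
scores = M.reshape(-1, hidden) @ w.reshape(-1, 1)  # (batch*seq_len, 1)
scores = scores.reshape(batch_size, seq_len)
alpha = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # softmax over time
r = np.einsum('bth,bt->bh', H, alpha)              # weighted sum of the hidden states
h_star = np.tanh(r)                                # sentence vector, (batch, hidden)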
def build_model(self):
    with tf.name_scope("ner_layer"):
        lstm_cell_fw = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(self.hidden_size),
            output_keep_prob=self.keep_prob)
        lstm_cell_bw = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(self.hidden_size),
            output_keep_prob=self.keep_prob)
        with tf.variable_scope("ner_layer", reuse=tf.AUTO_REUSE):
            # Build the stacked network directly by running it several times, then output the data
            for i in range(self.num_layers):
                (output_fw, output_bw), _ = bi_rnn(
                    lstm_cell_fw,
                    lstm_cell_bw,
                    self.embedded_layer,
                    sequence_length=self.sequence_lengths,
                    dtype=tf.float32)
        # [batch_size, sequence_length, hidden_size * 2]
        # Keep the hidden outputs of every time step, which is the usual choice for
        # sequence labeling; for classification, the last hidden state alone would do.
        self.outputs = tf.concat((output_fw, output_bw), 2)
        self.outputs = tf.nn.dropout(self.outputs, self.keep_prob)
        self.outputs = tf.reshape(self.outputs, [-1, 2 * self.hidden_size])
        self.logits = tf.matmul(self.outputs, self.weight_variable) + self.bias_variable
        self.logits = tf.reshape(
            self.logits, [-1, self.io_sequence_size, self.output_class_size])
def cal_loss_logit(batch_embedded, keep_prob, reuse=True, scope="loss"):
    with tf.variable_scope(scope, reuse=reuse) as scope:
        rnn_outputs, _ = bi_rnn(BasicLSTMCell(HIDDEN_SIZE),
                                BasicLSTMCell(HIDDEN_SIZE),
                                inputs=batch_embedded, dtype=tf.float32)

        # Attention
        H = tf.add(rnn_outputs[0], rnn_outputs[1])  # fw + bw
        M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)
        print(M.shape)
        # alpha (bs * sl, 1)
        alpha = tf.nn.softmax(
            tf.matmul(tf.reshape(M, [-1, HIDDEN_SIZE]),
                      tf.reshape(W, [-1, 1])))
        r = tf.matmul(
            tf.transpose(H, [0, 2, 1]),
            tf.reshape(alpha, [-1, MAX_DOCUMENT_LENGTH, 1]))  # supposed to be (batch_size * HIDDEN_SIZE, 1)
        print(r.shape)
        r = tf.squeeze(r)
        h_star = tf.tanh(r)  # (batch, HIDDEN_SIZE)
        # attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)
        drop = tf.nn.dropout(h_star, keep_prob)

        # Fully connected layer (dense layer)
        y_hat = tf.nn.xw_plus_b(drop, W_fc, b_fc)

    return y_hat, tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=y_hat, labels=batch_y))
def model(self):
    voc_size = len(self.word2idx) + 1
    pretrained_embed = load_pretrained_embed("CBOW_iter15_2017-2018.bin",
                                             self.embed_size, self.word2idx)
    embed_matrix = tf.get_variable(
        name='embedding_matrix',
        shape=[voc_size, self.embed_size],
        initializer=tf.constant_initializer(pretrained_embed),
        dtype=tf.float32)
    embed = tf.nn.embedding_lookup(embed_matrix, self.text)

    # RNN layer
    (fw_outputs, bw_outputs), _ = bi_rnn(GRUCell(self.hidden_size),
                                         GRUCell(self.hidden_size),
                                         inputs=embed, dtype=tf.float32)
    rnn_outputs = tf.concat((fw_outputs, bw_outputs), axis=2)

    # Attention layer
    attention_output, self.alpha = attention(rnn_outputs, self.attention_size)
    sentence_vector = tf.layers.dropout(attention_output, self.dropout_rate)

    self.logits = tf.layers.dense(inputs=sentence_vector,
                                  units=self.num_classes,
                                  name='logits')
def call(self, data, keep_prob=0.8):
    # data : [batch_size, data_length]
    max_sentence_length = data.shape[1]
    print('data.shape[1]:', data.shape[1])

    with tf.variable_scope('embedding_layer'), tf.device("/cpu:0"):
        embedding = tf.get_variable(
            'embedding',
            shape=[self.vocab_size, self.embed_size],
            initializer=tf.initializers.random_uniform(-1.0, 1.0))
        tf.summary.histogram('embeddings_var', embedding)
        # w2v : [batch_size, max_sentence_length, embed_size]
        data = tf.cast(data, dtype=tf.int32)
        w2v = tf.nn.embedding_lookup(embedding, data)

    with tf.variable_scope('bilstm_layer'):
        # final_outputs is a tuple
        final_outputs, final_state = bi_rnn(GRUCell(self.hidden_size),
                                            GRUCell(self.hidden_size),
                                            inputs=w2v, dtype=tf.float32)
        tf.summary.histogram('RNN_outputs', final_outputs)
        if self.sentence_mode == SentenceMode.ATTENTION:
            attention_ = Attention(final_outputs, self.attention_size,
                                   time_major=False, return_alphas=True)
            outputs, alphas = attention_.attentionModel()
            # outputs : [batch_size, vocab_size]
            tf.summary.histogram('alphas', alphas)
        elif self.sentence_mode == SentenceMode.FINAL_STATE:
            final_state_fw, final_state_bw = final_state
            # GRU states are [batch_size, hidden_size], so concatenate on the last axis
            outputs = tf.concat([final_state_fw, final_state_bw], axis=-1)
        else:
            raise ValueError("sentence mode `{0}` is not "
                             "supported on gru model.".format(self.sentence_mode))

    with tf.variable_scope('fully_connected_layer'):
        # rnn_output : [batch_size, sentence_length]
        rnn_output = tf.nn.dropout(outputs, keep_prob=keep_prob)
        # h : [batch_size, sentence_length]
        print('rnn_output.shape:', rnn_output.shape)
        h = tf.layers.Dense(rnn_output.shape.as_list()[-1],
                            activation=tf.nn.relu)(rnn_output)
        # h = tf.layers.Dense(64, activation=tf.nn.relu)(rnn_output)
        print('h.shape:', h.shape)
        # logits : [batch_size, num_targets]
        logits = tf.layers.Dense(self.num_targets)(h)
        print('logits.shape:', logits.shape)
    return logits
def bi_gru_embedding(self, batch_embedding):
    rnn_outputs, _ = bi_rnn(tf.contrib.rnn.GRUCell(self.lstm_units),
                            tf.contrib.rnn.GRUCell(self.lstm_units),
                            inputs=batch_embedding, dtype=tf.float32)
    fw_outputs, bw_outputs = rnn_outputs
    return tf.multiply(fw_outputs, bw_outputs)
def build_lstm(self, input_tensor):
    rnn_outputs, _ = bi_rnn(
        BasicLSTMCell(self.config["lstm_para"]["hidden_size"]),
        BasicLSTMCell(self.config["lstm_para"]["hidden_size"]),
        inputs=input_tensor,
        dtype=tf.float32)
    fw_outputs, bw_outputs = rnn_outputs

    W = tf.Variable(
        tf.random_normal([self.config["lstm_para"]["hidden_size"]], stddev=0.1))
    H = fw_outputs + bw_outputs  # (batch_size, seq_len, HIDDEN_SIZE)
    M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)

    alpha = tf.nn.softmax(
        tf.reshape(
            tf.matmul(
                tf.reshape(M, [-1, self.config["lstm_para"]["hidden_size"]]),
                tf.reshape(W, [-1, 1])),
            (-1, self.config["lstm_para"]["max_len"])))  # batch_size x seq_len
    r = tf.matmul(
        tf.transpose(H, [0, 2, 1]),
        tf.reshape(alpha, [-1, self.config["lstm_para"]["max_len"], 1]))
    r = tf.squeeze(r)
    h_star = tf.tanh(r)  # (batch, HIDDEN_SIZE)
    h_drop = tf.nn.dropout(h_star, self.config["lstm_para"]["keep_prob"])

    # Fully connected layer (dense layer)
    FC_W = tf.Variable(
        tf.truncated_normal(
            [self.config["lstm_para"]["hidden_size"], self.config['label_size']],
            stddev=0.1))
    FC_b = tf.Variable(tf.constant(0., shape=[self.config['label_size']]))
    logits = tf.nn.xw_plus_b(h_drop, FC_W, FC_b)

    # prediction
    probabilities = tf.argmax(tf.nn.softmax(logits), 1)
    predict_label_ids = tf.argmax(logits, axis=1, name="predict_label_id")  # predicted labels
    pooled_outputs = []
    l2_loss = tf.constant(0.0)  # not used yet, so start from 0
    l2_loss += tf.nn.l2_loss(FC_W) + tf.nn.l2_loss(FC_b)
    # with tf.variable_scope("output"):
    #     output_w = tf.get_variable("output_w", shape=[hidden_size, self.config['label_size']])
    #     output_b = self.initialize_bias("output_b", shape=self.config['label_size'])
    #     logits = tf.nn.xw_plus_b(output_layer, output_w, output_b)
    #
    #     probabilities = tf.nn.softmax(logits, axis=-1)
    #     predict_label_ids = tf.argmax(logits, axis=1, name="predict_label_id")  # predicted labels
    return logits, predict_label_ids, l2_loss, probabilities
def build_attention_model(): # Different placeholders with tf.name_scope('Inputs'): batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH], name='batch_ph') target_ph = tf.placeholder(tf.float32, [None], name='target_ph') seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph') keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph') # Embedding layer with tf.name_scope('Embedding_layer'): embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True) tf.summary.histogram('embeddings_var', embeddings_var) batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph) # (Bi-)RNN layer(-s) rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_UNITS), GRUCell(HIDDEN_UNITS), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32) tf.summary.histogram('RNN_outputs', rnn_outputs) # Attention layer with tf.name_scope('Attention_layer'): attention_output, alphas = attention(rnn_outputs, ATTENTION_UNITS, return_alphas=True) tf.summary.histogram('alphas', alphas) # Dropout drop = tf.nn.dropout(attention_output, keep_prob_ph) # Fully connected layer with tf.name_scope('Fully_connected_layer'): W = tf.Variable( tf.truncated_normal([HIDDEN_UNITS * 2, 1], stddev=0.1)) # Hidden size is multiplied by 2 for Bi-RNN b = tf.Variable(tf.constant(0., shape=[1])) y_hat = tf.nn.xw_plus_b(drop, W, b) y_hat = tf.squeeze(y_hat) tf.summary.histogram('W', W) with tf.name_scope('Metrics'): # Cross-entropy loss and optimizer initialization loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=target_ph)) tf.summary.scalar('loss', loss) optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss) # Accuracy metric accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), target_ph), tf.float32)) tf.summary.scalar('accuracy', accuracy) merged = tf.summary.merge_all() # Batch generators train_batch_generator = batch_generator(X_train, y_train, BATCH_SIZE) test_batch_generator = batch_generator(X_test, y_test, BATCH_SIZE) session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)) saver = tf.train.Saver() return batch_ph, target_ph, seq_len_ph, keep_prob_ph, alphas, loss, accuracy, optimizer, merged, \ train_batch_generator, test_batch_generator, session_conf, saver
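# A minimal sketch of how the tensors returned by build_attention_model() above could
# be wired into a TF 1.x training loop. NUM_EPOCHS and KEEP_PROB are assumptions, and
# the batch generators are assumed to be Python generators yielding (x, y) pairs;
# the placeholders, metrics and session config all come from the function itself,
# with sequence lengths recovered from the zero-padded batches.
import numpy as np
import tensorflow as tf

(batch_ph, target_ph, seq_len_ph, keep_prob_ph, alphas, loss, accuracy, optimizer,
 merged, train_batch_generator, test_batch_generator, session_conf,
 saver) = build_attention_model()

NUM_EPOCHS = 3   # assumption
KEEP_PROB = 0.8  # assumption

with tf.Session(config=session_conf) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(NUM_EPOCHS):
        x_batch, y_batch = next(train_batch_generator)
        seq_len = np.count_nonzero(x_batch, axis=1)  # lengths of zero-padded sequences
        train_loss, train_acc, _ = sess.run(
            [loss, accuracy, optimizer],
            feed_dict={batch_ph: x_batch, target_ph: y_batch,
                       seq_len_ph: seq_len, keep_prob_ph: KEEP_PROB})
        print("epoch %d: loss=%.4f  acc=%.4f" % (epoch, train_loss, train_acc))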
def __init__(self, num_classes, embedding_size, init_embed, hidden_size, \ attention_size, max_sent_len, keep_prob): # word index self.input_x = tf.placeholder(tf.int32, [None, max_sent_len], name="input_x") # output probability self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") self.sequence_length = tf.placeholder(tf.int32, [None], name="input_len") self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") with tf.variable_scope('style_discriminator'): # embedding layer with initialization with tf.name_scope("embedding"): # trainable embedding W = tf.Variable(init_embed, name="W", dtype=tf.float32) self.embedded_chars = tf.nn.embedding_lookup(W, self.input_x) # RNN layer + attention with tf.name_scope("bi-rnn"): rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size),\ inputs=self.embedded_chars, sequence_length=self.sequence_length, \ dtype=tf.float32) attention_outputs, self.alphas = attention(rnn_outputs, attention_size, return_alphas=True) drop_outputs = tf.nn.dropout(attention_outputs, keep_prob) # Fully connected layer with tf.name_scope("fc-layer"): W = tf.Variable(tf.truncated_normal( [drop_outputs.get_shape()[1].value, num_classes], stddev=0.1), name="W") b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b") self.scores = tf.sigmoid(tf.nn.xw_plus_b(drop_outputs, W, b), name="scores") # mean square error with tf.name_scope("mse"): self.loss = tf.reduce_mean( tf.square(tf.subtract(self.scores, self.input_y))) self.params = [ param for param in tf.trainable_variables() if 'style_discriminator' in param.name ] sd_optimizer = tf.train.AdamOptimizer(1e-4) grads_and_vars = sd_optimizer.compute_gradients(self.loss, self.params, aggregation_method=2) self.train_op = sd_optimizer.apply_gradients(grads_and_vars)
def __init__(self, sequence_length, num_classes, channel_num, rnn_hidden_size, attention_size): self.input_x = tf.placeholder(tf.float32, [None, sequence_length, channel_num], name="input_x") self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y") self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob") # Bidirectional RNN self.rnn_outputs, _ = bi_rnn(GRUCell(rnn_hidden_size), GRUCell(rnn_hidden_size), inputs=self.input_x, dtype=tf.float32) # Attention layer and a dropout layer with tf.name_scope('Attention_layer'): self.att_output, alphas = self.attention( inputs=self.rnn_outputs, attention_size=attention_size) tf.summary.histogram('alphas', alphas) with tf.name_scope("dropout"): self.att_drop = tf.nn.dropout(self.att_output, self.dropout_keep_prob, name="dropout") # FC layer with tf.name_scope("output"): # FC_W = tf.get_variable("FC_W", shape=[rnn_hidden_size * 2, num_classes], # initializer=tf.contrib.layers.xavier_initializer()) FC_W = tf.get_variable( "FC_W", shape=[sequence_length * rnn_hidden_size * 2, num_classes], initializer=tf.contrib.layers.xavier_initializer()) FC_b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="FC_b") self.fc_out = tf.nn.xw_plus_b(self.att_drop, FC_W, FC_b, name="FC_out") self.scores = tf.nn.softmax(self.fc_out, name='scores') self.predictions = tf.argmax(self.scores, 1, name="predictions") with tf.name_scope("loss"): losses = tf.nn.softmax_cross_entropy_with_logits( logits=self.fc_out, labels=self.input_y) self.loss = tf.reduce_mean(losses, name='loss') with tf.name_scope("accuracy"): correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
def __call__(self, batch_embedding):
    if self.basic_cell in self.cell_dic:
        rnn_outputs, _ = bi_rnn(
            self.cell_dic[self.basic_cell](self.hidden_size),
            self.cell_dic[self.basic_cell](self.hidden_size),
            inputs=batch_embedding,
            dtype=tf.float32)
        print("Rnn encoder with " + self.basic_cell)
    else:
        rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                                GRUCell(self.hidden_size),
                                inputs=batch_embedding,
                                dtype=tf.float32)
        print("Rnn encoder with default GRU cell")

    if self.keep_ori:
        return tf.concat([rnn_outputs[0], rnn_outputs[1]], axis=-1)
    if self.with_attention_layer:
        print("Build a Self-Attention Layer")
        return attention(rnn_outputs, self.keep_prob), self.hidden_size
    return tf.reduce_mean(rnn_outputs[0] + rnn_outputs[1], 1), self.hidden_size
def _build_graph(self):
    config = self.config

    # Define the bidirectional RNN
    rnn_outputs, _ = bi_rnn(
        self.rnn_cell(config),
        self.rnn_cell(config),
        inputs=self.batch_embedded,
        dtype=tf.float32,
    )
    fw_outputs, bw_outputs = rnn_outputs

    W = tf.Variable(tf.random_normal([config.hidden_size], stddev=0.1))
    H = fw_outputs + bw_outputs  # (batch_size, seq_len, HIDDEN_SIZE)
    M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, HIDDEN_SIZE)

    alpha = tf.nn.softmax(
        tf.matmul(tf.reshape(M, [-1, config.hidden_size]),
                  tf.reshape(W, [-1, 1])))
    r = tf.matmul(tf.transpose(H, [0, 2, 1]),
                  tf.reshape(alpha, [-1, config.max_len, 1]))
    r = tf.squeeze(r)
    h_star = tf.tanh(r)  # (batch, HIDDEN_SIZE)

    h_drop = tf.nn.dropout(h_star, self.keep_prob)

    # Fully connected layer (dense layer)
    FC_W = tf.Variable(
        tf.truncated_normal([config.hidden_size, config.n_class], stddev=0.1))
    FC_b = tf.Variable(tf.constant(0., shape=[config.n_class]))
    self.y_hat = tf.nn.xw_plus_b(h_drop, FC_W, FC_b)

    # Define loss and train_op
    self.loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.y_hat,
                                                       labels=self.label))

    # optimization
    loss_to_minimize = self.loss
    tvars = tf.trainable_variables()
    gradients = tf.gradients(
        loss_to_minimize,
        tvars,
        aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE,
    )
    grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)

    optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars),
                                              global_step=self.global_step,
                                              name="train_step")
def model(self):
    # (Bi-GRU) layers
    rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                            GRUCell(self.hidden_size),
                            inputs=self.batch_embedded, dtype=tf.float32)
    tf.summary.histogram('RNN_outputs', rnn_outputs)
    if isinstance(rnn_outputs, tuple):
        rnn_outputs = tf.concat(rnn_outputs, 2)
    print('rnn_outputs.shape:', rnn_outputs.shape)
    rnn_outputs = tf.reduce_mean(rnn_outputs, axis=2)
    print('rnn_outputs.shape:', rnn_outputs.shape)
    self.output = tf.reduce_sum(rnn_outputs, axis=1)
def RNN_layer(HIDDEN_SIZE, batch_embedded, seq_len_ph):
    rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                            inputs=batch_embedded, sequence_length=seq_len_ph,
                            dtype=tf.float32)
    tf.summary.histogram('RNN_outputs', rnn_outputs)
    if isinstance(rnn_outputs, tuple):
        rnn_outputs = tf.concat(rnn_outputs, 2)
    # rnn_outputs = tf.layers.batch_normalization(rnn_outputs)
    return rnn_outputs
def create_model(self):
    with tf.name_scope("classification_rnn"):
        outputs, _ = bi_rnn(
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(self.hidden_size), self.keep_prob),
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(self.hidden_size), self.keep_prob),
            inputs=self.embedded_layer,
            dtype=tf.float32)
        outputs = tf.concat(outputs, axis=-1)
        outputs = tf.reduce_mean(outputs, axis=1)
        self.drop = tf.nn.dropout(outputs, self.keep_prob)
        self.logits = tf.matmul(self.drop, self.weight_variable) + self.bias_variable  # [batch_size, num_classes]
        self.prediction = tf.nn.softmax(self.logits)
        self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)
def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None):
    """
    :param lstm_inputs: [batch_size, num_steps, emb_size]
    :return: [batch_size, num_steps, 2*lstm_dim]
    """
    with tf.variable_scope("char_BiLSTM" if not name else name):
        outputs, _ = bi_rnn(
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(lstm_dim), self.dropout_keep),
            tf.nn.rnn_cell.DropoutWrapper(GRUCell(lstm_dim), self.dropout_keep),
            inputs=lstm_inputs,
            dtype=tf.float32,
            sequence_length=lengths)
    return tf.concat(outputs, axis=2)
def __init__(self):
    learning_rate = 0.01
    num_hidden = 5
    num_classes = 1
    num_input = 6
    keep_rate_DROPOUT = 1

    self.X = tf.placeholder("float", [1, None, num_input])
    self.Y = tf.placeholder("float", [None, num_classes])

    # Define weights
    self.weights = {
        'out': tf.Variable(tf.random_normal([num_hidden, num_classes]))
    }
    self.biases = {
        'out': tf.Variable(tf.random_normal([num_classes]))
    }

    self.lstm_fw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0, reuse=False)
    self.lstm_fw_cell = rnn.DropoutWrapper(self.lstm_fw_cell,
                                           input_keep_prob=keep_rate_DROPOUT,
                                           output_keep_prob=keep_rate_DROPOUT,
                                           state_keep_prob=keep_rate_DROPOUT)
    # Backward direction cell
    self.lstm_bw_cell = rnn.BasicLSTMCell(num_hidden, forget_bias=1.0, reuse=False)
    self.lstm_bw_cell = rnn.DropoutWrapper(self.lstm_bw_cell,
                                           input_keep_prob=keep_rate_DROPOUT,
                                           output_keep_prob=keep_rate_DROPOUT,
                                           state_keep_prob=keep_rate_DROPOUT)

    # Get lstm cell output
    self.LSTM_outputs, _ = bi_rnn(self.lstm_fw_cell, self.lstm_bw_cell, self.X,
                                  dtype=tf.float32, scope="bidirectional_rnn")

    # Attention
    fw_outputs, bw_outputs = self.LSTM_outputs  # fw_o and bw_o: (batch_size, windowSize, num_hidden) (76x412x5)
    Hidden_fw_bw = fw_outputs + bw_outputs  # (batch_size, windowSize, num_hidden)
    Hidden_fw_bw_t = tf.transpose(Hidden_fw_bw, [0, 2, 1])  # swap axis 2 with axis 1: (batch_size, num_hidden, windowSize)
    Hidden_fw_bw_2D = tf.reshape(Hidden_fw_bw, [-1, num_hidden])  # (batch_size*windowSize, num_hidden)
    M = tf.tanh(Hidden_fw_bw_2D)  # (batch_size*windowSize, num_hidden)
    W = tf.Variable(tf.random_normal([num_hidden]))  # (1, num_hidden)
    W_t = tf.reshape(W, [-1, 1])  # (num_hidden, 1)
    MxW = tf.matmul(M, W_t)  # (batch_size*windowSize, num_hidden) * (num_hidden, 1) = (batch_size*windowSize, 1)
    MxW_3D = tf.reshape(MxW, [1, -1, 1])  # (batch_size, windowSize, 1)
    self.alpha_3D = tf.nn.softmax(MxW_3D, axis=1)  # (batch_size, windowSize, 1)
    r = tf.matmul(Hidden_fw_bw_t, self.alpha_3D)  # (batch_size, num_hidden, windowSize) * (batch_size, windowSize, 1) = (batch_size, num_hidden, 1)
    r = tf.reshape(r, [-1, num_hidden])  # (batch_size, num_hidden)
    self.h_star = tf.tanh(r)  # (batch_size, num_hidden)

    prediction = tf.matmul(self.h_star, self.weights['out']) + self.biases['out']
    self.Prediction_MOS = tf.add(tf.multiply(prediction, 4), 1)
    self.Label_MOS = tf.add(tf.multiply(self.Y, 4), 1)
    LOSS = tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.Prediction_MOS, self.Label_MOS))))
    PCC = tf.contrib.metrics.streaming_pearson_correlation(labels=self.Prediction_MOS,
                                                           predictions=self.Label_MOS,
                                                           name='pearson_r')

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(LOSS)

    init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    self.saverModel = tf.train.Saver()
def model(self):
    # (Bi-GRU) layers
    rnn_outputs, _ = bi_rnn(GRUCell(self.hidden_size),
                            GRUCell(self.hidden_size),
                            inputs=self.batch_embedded, dtype=tf.float32)
    tf.summary.histogram('RNN_outputs', rnn_outputs)

    # Attention layers
    with tf.name_scope('Attention_layer'):
        attention_ = Attention(rnn_outputs, self.attention_size,
                               time_major=False, return_alphas=True)
        self.attention_output, alphas = attention_.attentionModel()
        tf.summary.histogram('alphas', alphas)
        print('attention_output.shape:', self.attention_output.shape)
def bi_lstm_att(self):
    sen_inputs_glove = self.embedding_layer()
    rnn_outputs, _ = bi_rnn(
        tf.contrib.rnn.BasicLSTMCell(self.lstm_units),
        tf.contrib.rnn.BasicLSTMCell(self.lstm_units),
        inputs=sen_inputs_glove,
        dtype=tf.float32
    )
    fw_outputs, bw_outputs = rnn_outputs
    W = tf.Variable(tf.random_normal([self.lstm_units], stddev=0.1))
    H = fw_outputs + bw_outputs  # [b_s, max_len, lstm_units]
    M = tf.tanh(H)  # M = tanh(H)  (batch_size, seq_len, lstm_units)
    self.comput_att(H)
    r = tf.matmul(
        tf.transpose(H, [0, 2, 1]),
        tf.reshape(self.alpha, [-1, self.max_len, 1])
    )
    r = tf.squeeze(r)
    h_star = tf.tanh(r)
    h_drop = tf.nn.dropout(h_star, self.dropout_keep_prob)
    return h_drop
def __init__(self, sequence_length, num_classes, text_vocab_size, text_embedding_size, hidden_size=800, attention_size=100, l2_reg_lambda=0.0): # Placeholders for input, output and dropout self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text') self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y') self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob') self.dropout_keep_prob_lstm = tf.placeholder(tf.float32, name='dropout_keep_prob') # Keeping track of l2 regularization loss (optional) l2_loss = tf.constant(0.0) # Embedding layer with tf.device('/cpu:0'), tf.name_scope("text-embedding"): self.W_text = tf.Variable(tf.random_uniform( [text_vocab_size, text_embedding_size], -1.0, 1.0), name="W_text") self.text_embedded_chars = tf.nn.embedding_lookup( self.W_text, self.input_text) # embedding_size = text_embedding_size + 2 * dist_embedding_size # (Bi-)RNN layer(-s) self.rnn_outputs, _ = bi_rnn( tf.nn.rnn_cell.DropoutWrapper(GRUCell(hidden_size), self.dropout_keep_prob_lstm), tf.nn.rnn_cell.DropoutWrapper(GRUCell(hidden_size), self.dropout_keep_prob_lstm), inputs=self.text_embedded_chars, dtype=tf.float32) print(self.rnn_outputs) tf.summary.histogram('RNN_outputs', self.rnn_outputs) # Attention layer with tf.name_scope('Attention_layer'): attention_output, alphas, self.vu = attention(self.rnn_outputs, attention_size, return_alphas=True) tf.summary.histogram('alphas', alphas) print(attention_output) # Dropout self.drop = tf.nn.dropout(attention_output, self.dropout_keep_prob) # Fully connected layer with tf.name_scope('Fully_connected_layer'): W = tf.Variable( tf.truncated_normal( [hidden_size * 2, num_classes], stddev=0.1)) # Hidden size is multiplied by 2 for Bi-RNN b = tf.Variable(tf.constant(0., shape=[num_classes])) l2_loss += tf.nn.l2_loss(W) l2_loss += tf.nn.l2_loss(b) self.scores = tf.nn.xw_plus_b(self.drop, W, b, name="scores") self.predictions = tf.argmax(self.scores, 1, name="predictions") # Calculate mean cross-entropy loss with tf.name_scope("loss"): losses = tf.nn.softmax_cross_entropy_with_logits( logits=self.scores, labels=self.input_y) self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss # Accuracy with tf.name_scope("accuracy"): correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1)) self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
Train_Set, Valid_Set, Test_Set = PMATM.getSplitSets()
Pos_Txt_Index_List = list(np.load(Pos_Txt_Index_List_Path))
Neg_Txt_Index_List = list(np.load(Neg_Txt_Index_List_Path))

tf.reset_default_graph()
labels = tf.placeholder(tf.float32, [batchSize, numClasses])
input_text = tf.placeholder(tf.float32, [batchSize, maxSeqLength, wordDim])
input_emoji = tf.placeholder(tf.float32, [batchSize, wordDim])

# (Bi-)RNN layer(-s)
seq_len_ph = []
for i in range(batchSize):
    seq_len_ph.append(maxSeqLength)
rnn_outputs, _ = bi_rnn(GRUCell(hiddenSize), GRUCell(hiddenSize),
                        inputs=input_text, sequence_length=seq_len_ph,
                        dtype=tf.float32)
memory = tf.concat(rnn_outputs, 2)
attention_input_1 = tf.reduce_mean(input_text, axis=1)


def attention(memory, input):
    input = tf.reshape(input, [batchSize, 1, wordDim])
    inputs = input
    for i in range(memory.shape[1] - 1):
        inputs = tf.concat((inputs, input), 1)
def _build_graph(self): now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print(now) print("Build Graph...") print() self.xavier_init = tf.contrib.layers.xavier_initializer() self.embed_dim = 100 self.state_dim = 100 self.bi_state_dim = self.state_dim * 2 self.feat_dim = self.bi_state_dim self.attend_dim = self.feat_dim self.context_dim = self.bi_state_dim * 4 self.fc_dim = 250 print("embed_dim : %d" % self.embed_dim) print("state_dim : %d" % self.state_dim) print("bi_state_dim : %d" % self.bi_state_dim) print("feat_dim : %d" % self.feat_dim) print("attend_dim : %d" % self.attend_dim) print("context_dim : %d" % self.context_dim) print("fc_dim : %d" % self.fc_dim) print() with tf.device(self.dev): with tf.variable_scope("input_placeholders"): self.enc_input1 = tf.placeholder(tf.int32, shape=[None, None], name="enc_input1") self.enc_seq_len1 = tf.placeholder(tf.int32, shape=[ None, ], name="enc_seq_len1") self.enc_input2 = tf.placeholder(tf.int32, shape=[None, None], name="enc_input2") self.enc_seq_len2 = tf.placeholder(tf.int32, shape=[ None, ], name="enc_seq_len2") self.targets = tf.placeholder(tf.int32, shape=[ None, ], name="targets") self.batch_size = tf.placeholder(tf.int32, shape=[], name="batch_size") self.keep_prob = tf.placeholder(tf.float32, name="keep_prob") with tf.variable_scope("words_embedding"): self.embeddings = tf.get_variable( "embeddings", [self.voc_size, self.embed_dim], initializer=self.xavier_init) self.embed_in1 = tf.nn.embedding_lookup(self.embeddings, self.enc_input1, name="embed_in1") self.embed_in2 = tf.nn.embedding_lookup(self.embeddings, self.enc_input2, name="embed_in2") self.pad_mask1 = tf.sequence_mask(self.enc_seq_len1, self.input_len_max, dtype=tf.float32, name="pad_mask1") self.pad_mask2 = tf.sequence_mask(self.enc_seq_len2, self.input_len_max, dtype=tf.float32, name="pad_mask2") with tf.variable_scope("rnn_encoder_layer") as scope_rnn: self.output_enc1, self.state_enc1 = bi_rnn( GRUCell(self.state_dim), GRUCell(self.state_dim), inputs=self.embed_in1, sequence_length=self.enc_seq_len1, dtype=tf.float32) self.state_enc1 = tf.concat( [self.state_enc1[0], self.state_enc1[1]], axis=1, name="state_enc1") assert self.state_enc1.get_shape()[1] == self.bi_state_dim self.output_enc1 = tf.concat( self.output_enc1, axis=2) # [batch, max_eng, state*2] self.output_enc1 = tf.nn.dropout(self.output_enc1, keep_prob=self.keep_prob, name="output_enc1") print("output_enc1.get_shape() : %s" % (self.output_enc1.get_shape())) assert self.output_enc1.get_shape()[2] == self.bi_state_dim scope_rnn.reuse_variables() self.output_enc2, self.state_enc2 = bi_rnn( GRUCell(self.state_dim), GRUCell(self.state_dim), inputs=self.embed_in2, sequence_length=self.enc_seq_len2, dtype=tf.float32) self.state_enc2 = tf.concat( [self.state_enc2[0], self.state_enc2[1]], axis=1, name="state_enc2") assert self.state_enc2.get_shape()[1] == self.bi_state_dim self.output_enc2 = tf.concat( self.output_enc2, axis=2) # [batch, max_eng, state*2] self.output_enc2 = tf.nn.dropout(self.output_enc2, keep_prob=self.keep_prob, name="output_enc2") print("output_enc2.get_shape() : %s" % (self.output_enc2.get_shape())) assert self.output_enc2.get_shape()[2] == self.bi_state_dim with tf.variable_scope("attention_layer") as scope_attention: self.W_y = tf.get_variable( "W_y", [1, 1, self.feat_dim, self.attend_dim], initializer=self.xavier_init) self.W_h = tf.get_variable("W_h", [self.feat_dim, self.attend_dim], initializer=self.xavier_init) self.W_a = tf.get_variable("W_a", [self.attend_dim, 1], 
initializer=self.xavier_init) # question 1.. # average vector self.R_ave_1 = tf.reduce_mean(self.output_enc1, axis=1, name="R_ave_1") print("R_ave_1.get_shape() : %s" % (self.R_ave_1.get_shape())) # Wy * Y self.output_enc1_ex = tf.reshape( self.output_enc1, [-1, self.input_len_max, 1, self.feat_dim]) self.M_1_left = tf.nn.conv2d(self.output_enc1_ex, self.W_y, strides=[1, 1, 1, 1], padding="SAME") self.M_1_left = tf.reshape( self.M_1_left, [-1, self.input_len_max, self.attend_dim]) print("M_1_left.get_shape() : %s" % (self.M_1_left.get_shape())) # Wh * Rave self.M_1_right = tf.matmul(self.R_ave_1, self.W_h) self.M_1_right = tf.ones([self.input_len_max, 1, 1 ]) * self.M_1_right self.M_1_right = tf.transpose(self.M_1_right, [1, 0, 2]) print("M_1_right.get_shape() : %s" % (self.M_1_right.get_shape())) # attention self.M_1 = tf.tanh(self.M_1_left + self.M_1_right) print("M_1.get_shape() : %s" % (self.M_1.get_shape())) self.w_M_1 = tf.matmul( tf.reshape(self.M_1, [-1, self.attend_dim]), self.W_a) self.w_M_1 = tf.reshape(self.w_M_1, [-1, self.input_len_max]) print("w_M_1.get_shape() : %s" % (self.w_M_1.get_shape())) self.attention1 = tf.nn.softmax(self.w_M_1) * self.pad_mask1 self.attention1 = self.attention1 / tf.reshape( tf.reduce_sum(self.attention1, axis=1), [-1, 1]) print("attention1.get_shape() : %s" % (self.attention1.get_shape())) self.context1 = tf.reduce_sum( self.output_enc1 * tf.reshape(self.attention1, [-1, self.input_len_max, 1]), axis=1, name="context1") print("context1.get_shape() : %s" % (self.context1.get_shape())) # question 2.. # average vector self.R_ave_2 = tf.reduce_mean(self.output_enc2, axis=1, name="R_ave_2") print("R_ave_2.get_shape() : %s" % (self.R_ave_2.get_shape())) # Wy * Y self.output_enc2_ex = tf.reshape( self.output_enc2, [-1, self.input_len_max, 1, self.feat_dim]) self.M_2_left = tf.nn.conv2d(self.output_enc2_ex, self.W_y, strides=[1, 1, 1, 1], padding="SAME") self.M_2_left = tf.reshape( self.M_2_left, [-1, self.input_len_max, self.attend_dim]) print("M_2_left.get_shape() : %s" % (self.M_2_left.get_shape())) # Wh * Rave self.M_2_right = tf.matmul(self.R_ave_2, self.W_h) self.M_2_right = tf.ones([self.input_len_max, 1, 1 ]) * self.M_2_right self.M_2_right = tf.transpose(self.M_2_right, [1, 0, 2]) print("M_2_right.get_shape() : %s" % (self.M_2_right.get_shape())) # attention self.M_2 = tf.tanh(self.M_2_left + self.M_2_right) print("M_2.get_shape() : %s" % (self.M_2.get_shape())) self.w_M_2 = tf.matmul( tf.reshape(self.M_2, [-1, self.attend_dim]), self.W_a) self.w_M_2 = tf.reshape(self.w_M_2, [-1, self.input_len_max]) print("w_M_2.get_shape() : %s" % (self.w_M_2.get_shape())) self.attention2 = tf.nn.softmax(self.w_M_2) * self.pad_mask2 self.attention2 = self.attention2 / tf.reshape( tf.reduce_sum(self.attention2, axis=1), [-1, 1]) print("attention2.get_shape() : %s" % (self.attention2.get_shape())) self.context2 = tf.reduce_sum( self.output_enc2 * tf.reshape(self.attention2, [-1, self.input_len_max, 1]), axis=1, name="context2") print("context2.get_shape() : %s" % (self.context2.get_shape())) assert self.context1.get_shape()[1] == self.feat_dim assert self.context2.get_shape()[1] == self.feat_dim with tf.variable_scope("final_context_layer"): self.features = [ self.context1, self.context2, tf.abs(self.context1 - self.context2), (self.context1 * self.context2) ] self.merged_feature = tf.concat(self.features, axis=1, name="merged_feature") print("merged_feature.get_shape() : %s" % (self.merged_feature.get_shape())) assert self.merged_feature.get_shape()[1] == 
self.context_dim with tf.variable_scope("dense_layer"): self.W_out1 = tf.get_variable("W_out1", [self.context_dim, self.fc_dim], initializer=self.xavier_init) self.bias_out1 = tf.get_variable("bias_out1", [self.fc_dim]) self.W_out2 = tf.get_variable("W_out2", [self.fc_dim, self.target_size], initializer=self.xavier_init) self.bias_out2 = tf.get_variable("bias_out2", [self.target_size]) self.fc = tf.nn.xw_plus_b(self.merged_feature, self.W_out1, self.bias_out1) self.fc = tf.tanh(self.fc) print("fc.get_shape() : %s" % (self.fc.get_shape())) self.y_hat = tf.nn.xw_plus_b(self.fc, self.W_out2, self.bias_out2, name="y_hat") print("y_hat.get_shape() : %s" % (self.y_hat.get_shape())) with tf.variable_scope("train_optimization"): self.train_vars = tf.trainable_variables() print() print("trainable_variables") for varvar in self.train_vars: print(varvar) print() self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.y_hat, labels=self.targets) self.loss = tf.reduce_mean(self.loss, name="loss") self.loss_l2 = tf.add_n([ tf.nn.l2_loss(v) for v in self.train_vars if "bias" not in v.name ]) * 0.0001 self.loss = self.loss + self.loss_l2 self.predict = tf.argmax(tf.nn.softmax(self.y_hat), 1) self.predict = tf.cast(tf.reshape(self.predict, [self.batch_size, 1]), tf.int32, name="predict") self.target_label = tf.cast( tf.reshape(self.targets, [self.batch_size, 1]), tf.int32) self.correct = tf.equal(self.predict, self.target_label) self.accuracy = tf.reduce_mean( tf.cast(self.correct, tf.float32)) self.global_step = tf.Variable(0, name="global_step", trainable=False) self.decay_rate = tf.maximum(0.00007, tf.train.exponential_decay( self.lr, self.global_step, 1500, 0.95, staircase=True), name="decay_rate") self.opt = tf.train.AdamOptimizer( learning_rate=self.decay_rate) self.grads_and_vars = self.opt.compute_gradients( self.loss, self.train_vars) self.grads_and_vars = [(tf.clip_by_norm(g, 30.0), v) for g, v in self.grads_and_vars] self.grads_and_vars = [ (tf.add(g, tf.random_normal(tf.shape(g), stddev=0.001)), v) for g, v in self.grads_and_vars ] self.train_op = self.opt.apply_gradients( self.grads_and_vars, global_step=self.global_step, name="train_op") if self.makedir == True: # Summaries for loss and lr self.loss_summary = tf.summary.scalar("loss", self.loss) self.accuracy_summary = tf.summary.scalar( "accuracy", self.accuracy) self.lr_summary = tf.summary.scalar("lr", self.decay_rate) # Output directory for models and summaries timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") self.out_dir = os.path.abspath( os.path.join("./model", timestamp)) print("LOGDIR = %s" % self.out_dir) print() # Train Summaries self.train_summary_op = tf.summary.merge([ self.loss_summary, self.accuracy_summary, self.lr_summary ]) self.train_summary_dir = os.path.join(self.out_dir, "summary", "train") self.train_summary_writer = tf.summary.FileWriter( self.train_summary_dir, self.sess.graph) # Test summaries self.test_summary_op = tf.summary.merge([ self.loss_summary, self.accuracy_summary, self.lr_summary ]) print(self.test_summary_op) self.test_summary_dir = os.path.join(self.out_dir, "summary", "test") self.test_summary_writer = tf.summary.FileWriter( self.test_summary_dir, self.sess.graph) # Checkpoint directory. 
Tensorflow assumes this directory already exists so we need to create it self.checkpoint_dir = os.path.abspath( os.path.join(self.out_dir, "checkpoints")) self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model-step") if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)
def __graph__(): with tf.name_scope('input'): x_input = tf.placeholder( dtype=tf.float32, shape=[None, sequence_width, sequence_height], name='x_input') y_input = tf.placeholder(dtype=tf.float32, shape=[None, num_classes], name='y_input') # state = tf.placeholder(dtype=tf.float32, shape=[None, self.cell_size * self.num_layers], # name='initial_state') p_keep = tf.placeholder(dtype=tf.float32, name='p_keep') learning_rate = tf.placeholder(dtype=tf.float32, name='learning_rate') hidden_size = int(sequence_width) # seq_len = tf.Variable(tf.constant(hidden_size),name='seq_len') rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size), inputs=x_input, sequence_length=None, dtype=tf.float32) tf.summary.histogram('RNN_outputs', rnn_outputs) # Attention layer with tf.name_scope('Attention_layer'): attention_output, alphas = attention( input=rnn_outputs, hidden_size=self.sequence_width, attention_size=ATTENTION_SIZE, return_alpha=True) tf.summary.histogram('alphas', alphas) # dropout drop = tf.nn.dropout(attention_output, keep_prob=p_keep) # fully connected layer with tf.name_scope('Fully_connected_layer'): W = tf.Variable(tf.truncated_normal( [hidden_size * 2, self.num_classes], stddev=0.1), name='W') b = tf.Variable(tf.constant(0.0, shape=[self.num_classes]), name='b') y_hat = tf.nn.xw_plus_b(drop, W, b) # y_hat=tf.squeeze(y_hat) tf.summary.histogram('W', W) with tf.name_scope('loss'): loss = svm_loss(labels=y_input, logits=y_hat, num_classes=self.num_classes, penalty_parameter=self.svm_c, weight=W) tf.summary.scalar('loss', loss) optimizer = tf.train.AdamOptimizer( learning_rate=learning_rate).minimize(loss=loss) with tf.name_scope('accuracy'): predicted_class = tf.sign(y_hat) predicted_class = tf.identity(predicted_class, name='predicted_class') with tf.name_scope('correct_prediction'): correct = tf.equal(tf.argmax(predicted_class, 1), tf.argmax(y_input, 1)) with tf.name_scope('accuracy'): accuracy = tf.reduce_mean(tf.cast(correct, 'float')) tf.summary.scalar('accuracy', accuracy) merged = tf.summary.merge_all() # set class properties self.x_input = x_input self.y_input = y_input self.p_keep = p_keep self.loss = loss self.optimizer = optimizer # self.state=state # self.states=states self.learning_rate = learning_rate self.predicted_class = predicted_class self.accuracy = accuracy self.merged = merged
with tf.name_scope('Input_layer'):
    input_x = tf.placeholder(tf.int32, [None, maxlen], name='input_x')
    output_y = tf.placeholder(tf.float32, [None], name='output_y')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [len(word_index) + 1, embedding_dim], -1.0, 1.0),
        trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, input_x)

# BiDirectional RNN Layer
rnn_outputs, _ = bi_rnn(GRUCell(hidden_size), GRUCell(hidden_size),
                        inputs=batch_embedded, dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, attention_size,
                                         return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout for attention layer
drop = tf.nn.dropout(attention_output, keep_prob)

# Fully connected layer
with tf.name_scope('Fully_connected_layer'):
def _build_graph(self): now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") print(now) print("Build Graph...") print() self.xavier_init = tf.contrib.layers.xavier_initializer() self.embed_dim = 100 self.state_dim = 100 self.bi_state_dim = self.state_dim * 2 self.attend_dim = 250 self.feat_dim = self.bi_state_dim self.fc_dim = 150 print("embed_dim : %d" % self.embed_dim) print("state_dim : %d" % self.state_dim) print("bi_state_dim : %d" % self.bi_state_dim) print("attend_dim : %d" % self.attend_dim) print("feat_dim : %d" % self.feat_dim) print("fc_dim : %d" % self.fc_dim) print() with tf.device(self.dev): with tf.variable_scope("input_placeholders"): self.enc_input = tf.placeholder(tf.int32, shape=[None, None], name="enc_input") self.enc_seq_len = tf.placeholder(tf.int32, shape=[ None, ], name="enc_seq_len") self.targets = tf.placeholder(tf.int32, shape=[ None, ], name="targets") self.batch_size = tf.placeholder(tf.int32, shape=[], name="batch_size") self.keep_prob = tf.placeholder(tf.float32, name="keep_prob") with tf.variable_scope("words_embedding"): self.embeddings = tf.get_variable( "embeddings", [self.voc_size, self.embed_dim], initializer=self.xavier_init) self.embed_in = tf.nn.embedding_lookup(self.embeddings, self.enc_input, name="embed_in") self.pad_mask = tf.sequence_mask(self.enc_seq_len, self.input_len_max, dtype=tf.float32, name="pad_mask1") with tf.variable_scope("rnn_encoder_layer"): self.output_enc, self.state_enc = bi_rnn( GRUCell(self.state_dim), GRUCell(self.state_dim), inputs=self.embed_in, sequence_length=self.enc_seq_len, dtype=tf.float32) self.state_enc = tf.concat( [self.state_enc[0], self.state_enc[1]], axis=1, name="state_enc1") assert self.state_enc.get_shape()[1] == self.bi_state_dim self.output_enc = tf.concat( self.output_enc, axis=2) # [batch, max_eng, state*2] self.output_enc = tf.nn.dropout(self.output_enc, keep_prob=self.keep_prob, name="output_enc1") print("output_enc.get_shape() : %s" % (self.output_enc.get_shape())) assert self.output_enc.get_shape()[2] == self.bi_state_dim with tf.variable_scope("attention_layer"): self.rows = 30 self.W_s1 = tf.get_variable( "W_s1", [1, 1, self.feat_dim, self.attend_dim], initializer=self.xavier_init) self.bias_s1 = tf.get_variable("bias_s1", [self.attend_dim]) self.W_s2 = tf.get_variable("W_s2", [self.attend_dim, self.rows], initializer=self.xavier_init) self.identity = tf.reshape( tf.tile(tf.diag(tf.ones(self.rows)), [self.batch_size, 1]), [self.batch_size, self.rows, self.rows], name="identity") self.output_enc_ex = tf.reshape( self.output_enc, [-1, self.input_len_max, 1, self.feat_dim]) self.context_att = tf.nn.conv2d(self.output_enc_ex, self.W_s1, strides=[1, 1, 1, 1], padding="SAME") self.context_att = tf.tanh(tf.nn.bias_add( self.context_att, self.bias_s1), name="context_att") print("context_att.get_shape() : %s" % (self.context_att.get_shape())) # attention self.attention_tot = tf.matmul( tf.reshape(self.context_att, [-1, self.attend_dim]), self.W_s2) self.attention_tot = tf.reshape( self.attention_tot, [-1, self.input_len_max, self.rows]) self.attention_tot = tf.nn.softmax( self.attention_tot, dim=1) * tf.reshape( self.pad_mask, [-1, self.input_len_max, 1]) self.attention_tot = tf.nn.softmax(self.attention_tot, dim=1) print("attention_tot.get_shape() : %s" % (self.attention_tot.get_shape())) self.attention = tf.reduce_sum(self.attention_tot, axis=2) self.attention = tf.reshape( self.attention, [self.batch_size, self.input_len_max]) * self.pad_mask self.attention = tf.nn.softmax(self.attention) 
print("attention.get_shape() : %s" % (self.attention.get_shape())) self.attention_tot_T = tf.transpose(self.attention_tot, [0, 2, 1], name="attention_tot_T") self.AA_t = tf.matmul(self.attention_tot_T, self.attention_tot) - self.identity print("AA_t.get_shape() : %s" % (self.AA_t.get_shape())) # penalty self.P = tf.square(tf.norm(self.AA_t, axis=[-2, -1], ord="fro")) self.P = tf.reduce_mean(self.P, name="P") # context.. self.context = tf.reduce_sum( self.output_enc * tf.reshape(self.attention, [-1, self.input_len_max, 1]), axis=1, name="context") print("context.get_shape() : %s" % (self.context.get_shape())) assert self.context.get_shape()[1] == self.feat_dim with tf.variable_scope("dense_layer"): self.W_out1 = tf.get_variable("W_out1", [self.feat_dim, self.fc_dim], initializer=self.xavier_init) self.bias_out1 = tf.get_variable("bias_out1", [self.fc_dim]) self.W_out2 = tf.get_variable("W_out2", [self.fc_dim, self.target_size], initializer=self.xavier_init) self.bias_out2 = tf.get_variable("bias_out2", [self.target_size]) self.fc = tf.nn.xw_plus_b(self.context, self.W_out1, self.bias_out1) self.fc = tf.tanh(self.fc) print("fc.get_shape() : %s" % (self.fc.get_shape())) self.y_hat = tf.nn.xw_plus_b(self.fc, self.W_out2, self.bias_out2, name="y_hat") print("y_hat.get_shape() : %s" % (self.y_hat.get_shape())) with tf.variable_scope("train_optimization"): self.train_vars = tf.trainable_variables() print() print("trainable_variables") for varvar in self.train_vars: print(varvar) print() self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.y_hat, labels=self.targets) self.loss = tf.reduce_mean(self.loss, name="loss") self.loss_l2 = tf.add_n([ tf.nn.l2_loss(v) for v in self.train_vars if "bias" not in v.name ]) * 0.0001 self.loss = self.loss + self.loss_l2 + self.P self.predict = tf.argmax(tf.nn.softmax(self.y_hat), 1) self.predict = tf.cast(tf.reshape(self.predict, [self.batch_size, 1]), tf.int32, name="predict") self.target_label = tf.cast( tf.reshape(self.targets, [self.batch_size, 1]), tf.int32) self.correct = tf.equal(self.predict, self.target_label) self.accuracy = tf.reduce_mean(tf.cast(self.correct, tf.float32), name="accuracy") self.global_step = tf.Variable(0, name="global_step", trainable=False) self.decay_rate = tf.maximum(0.00007, tf.train.exponential_decay( self.lr, self.global_step, 1000, 0.9, staircase=True), name="decay_rate") self.opt = tf.train.AdamOptimizer( learning_rate=self.decay_rate) self.grads_and_vars = self.opt.compute_gradients( self.loss, self.train_vars) self.grads_and_vars = [(tf.clip_by_norm(g, 0.5), v) for g, v in self.grads_and_vars] self.grads_and_vars = [ (tf.add(g, tf.random_normal(tf.shape(g), stddev=0.001)), v) for g, v in self.grads_and_vars ] self.train_op = self.opt.apply_gradients( self.grads_and_vars, global_step=self.global_step, name="train_op") # Summaries for loss and lr self.loss_summary = tf.summary.scalar("loss", self.loss) self.accuracy_summary = tf.summary.scalar("accuracy", self.accuracy) self.lr_summary = tf.summary.scalar("lr", self.decay_rate) # Output directory for models and summaries timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M") self.out_dir = os.path.abspath( os.path.join("./model/rnn_self_att", timestamp)) print("LOGDIR = %s" % self.out_dir) print() # Train Summaries self.train_summary_op = tf.summary.merge( [self.loss_summary, self.accuracy_summary, self.lr_summary]) self.train_summary_dir = os.path.join(self.out_dir, "summary", "train") self.train_summary_writer = tf.summary.FileWriter( 
self.train_summary_dir, self.sess.graph) # Test summaries self.test_summary_op = tf.summary.merge( [self.loss_summary, self.accuracy_summary, self.lr_summary]) self.test_summary_dir = os.path.join(self.out_dir, "summary", "test") self.test_summary_writer = tf.summary.FileWriter( self.test_summary_dir, self.sess.graph) # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it self.checkpoint_dir = os.path.abspath( os.path.join(self.out_dir, "checkpoints")) self.checkpoint_prefix = os.path.join(self.checkpoint_dir, "model-step") if self.makedir: if not os.path.exists(self.checkpoint_dir): os.makedirs(self.checkpoint_dir) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)
target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

# Embedding layer
with tf.name_scope('Embedding_layer'):
    embeddings_var = tf.Variable(tf.random_uniform(
        [vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
        trainable=True)
    tf.summary.histogram('embeddings_var', embeddings_var)
    batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph,
                        dtype=tf.float32)
tf.summary.histogram('RNN_outputs', rnn_outputs)

# Attention layer
with tf.name_scope('Attention_layer'):
    attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE,
                                         return_alphas=True)
    tf.summary.histogram('alphas', alphas)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

# Fully connected layer
X_test = fit_in_vocabulary(X_test, vocabulary_size)
X_train = zero_pad(X_train, SEQUENCE_LENGTH)
X_test = zero_pad(X_test, SEQUENCE_LENGTH)

# Different placeholders
batch_ph = tf.placeholder(tf.int32, [None, SEQUENCE_LENGTH])
target_ph = tf.placeholder(tf.float32, [None])
seq_len_ph = tf.placeholder(tf.int32, [None])
keep_prob_ph = tf.placeholder(tf.float32)

# Embedding layer
embeddings_var = tf.Variable(tf.random_uniform([vocabulary_size, EMBEDDING_DIM], -1.0, 1.0),
                             trainable=True)
batch_embedded = tf.nn.embedding_lookup(embeddings_var, batch_ph)

# (Bi-)RNN layer(-s)
rnn_outputs, _ = bi_rnn(GRUCell(HIDDEN_SIZE), GRUCell(HIDDEN_SIZE),
                        inputs=batch_embedded, sequence_length=seq_len_ph,
                        dtype=tf.float32)
# rnn_outputs, _ = rnn(GRUCell(hidden_size), inputs=batch_embedded, sequence_length=seq_len_ph, dtype=tf.float32)

# Attention layer
attention_output, alphas = attention(rnn_outputs, ATTENTION_SIZE, return_alphas=True)

# Dropout
drop = tf.nn.dropout(attention_output, keep_prob_ph)

# Fully connected layer
W = tf.Variable(tf.truncated_normal([drop.get_shape()[1].value, 1], stddev=0.1))
b = tf.Variable(tf.constant(0., shape=[1]))
y_hat = tf.nn.xw_plus_b(drop, W, b)
y_hat = tf.squeeze(y_hat)

# Cross-entropy loss and optimizer initialization