def _encoder_decoder_attention(self, q, k, v, bias):
    with tf.variable_scope("encoder-decoder-attention"):
        attention = Attention(num_heads=self.num_heads,
                              mode="encoder-decoder-attention",
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v, bias)

def _self_attention(self, q, k, v, seq_len):
    with tf.variable_scope("self-attention"):
        attention = Attention(num_heads=self.num_heads,
                              mode="encoder",
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v, seq_len)

def _masked_self_attention(self, q, k, v, bias):
    with tf.variable_scope("masked-self-attention"):
        attention = Attention(num_heads=self.num_heads,
                              mode="masked-self-attention",
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout)
        return attention.multi_head(q, k, v, bias)

def _self_attention(self, q, k, v, future, sos, seq_len):
    with tf.variable_scope("self-attention"):
        attention = Attention(num_heads=self.num_heads,
                              masked=True,
                              linear_key_dim=self.linear_key_dim,
                              linear_value_dim=self.linear_value_dim,
                              model_dim=self.model_dim,
                              dropout=self.dropout,
                              batch_size=self.batch_size)
        return attention.multi_head(q, k, v, future, sos, seq_len)
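# The Attention class instantiated by the wrappers above is not shown in this
# section. As a rough sketch (an assumption, not the project's actual
# implementation), its multi_head() call is ordinary multi-head scaled
# dot-product attention: project q/k/v, split into heads, score with
# softmax(QK^T / sqrt(d_k)), optionally apply a causal mask, then concatenate
# the heads and project back to model_dim. The project's multi_head() also
# takes extra mask/bias arguments (bias, seq_len, future, sos) that this
# hypothetical stand-in does not model.

import tensorflow as tf  # assumes TensorFlow 1.x


def sketch_multi_head(q, k, v, num_heads=8, linear_key_dim=64, linear_value_dim=64,
                      model_dim=512, masked=False, dropout=0.1):
    """Hypothetical stand-in for Attention.multi_head(); dims mirror the ctor args above."""
    def split_heads(x, dim):
        # [batch, len, dim] -> [batch, heads, len, dim // heads]
        shape = tf.concat([tf.shape(x)[:2], [num_heads, dim // num_heads]], axis=0)
        return tf.transpose(tf.reshape(x, shape), [0, 2, 1, 3])

    qs = split_heads(tf.layers.dense(q, linear_key_dim, use_bias=False), linear_key_dim)
    ks = split_heads(tf.layers.dense(k, linear_key_dim, use_bias=False), linear_key_dim)
    vs = split_heads(tf.layers.dense(v, linear_value_dim, use_bias=False), linear_value_dim)

    # scaled dot-product scores: [batch, heads, len_q, len_k]
    scores = tf.matmul(qs, ks, transpose_b=True) / (float(linear_key_dim // num_heads) ** 0.5)
    if masked:
        # causal mask: block attention to future positions
        q_len, k_len = tf.shape(scores)[2], tf.shape(scores)[3]
        causal = tf.linalg.band_part(tf.ones([q_len, k_len]), -1, 0)
        scores += (1.0 - causal) * -1e9
    weights = tf.nn.dropout(tf.nn.softmax(scores), keep_prob=1.0 - dropout)

    heads = tf.matmul(weights, vs)                    # [batch, heads, len_q, d_v // heads]
    heads = tf.transpose(heads, [0, 2, 1, 3])         # [batch, len_q, heads, d_v // heads]
    concat = tf.reshape(heads, tf.concat([tf.shape(q)[:2], [linear_value_dim]], axis=0))
    return tf.layers.dense(concat, model_dim)         # project back to model_dim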
def define_layer(self):
    # Different placeholders
    with tf.name_scope('Input'):
        self.batch_ph = tf.placeholder(tf.int32, [None, self.sequence_length], name='batch_ph')
        self.target_ph = tf.placeholder(tf.float32, [None], name='target_ph')
        self.keep_prob_ph = tf.placeholder(tf.float32, name='keep_prob_ph')

    # Embedding layer
    with tf.name_scope('Embedding_layer'):
        self.embeddings_var = tf.Variable(
            tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0), trainable=True)
        tf.summary.histogram('embeddings_var', self.embeddings_var)
        self.batch_embedded = tf.nn.embedding_lookup(self.embeddings_var, self.batch_ph)
        print('self.batch_embedded:', self.batch_embedded.shape)  # [?, self.sequence_length, self.embedding_dim]

    # Positional encoding
    with tf.name_scope('positional_encoding'):
        positional_encoded = self.positional_encoding(self.model_dim, self.sequence_length, dtype=tf.float32)
        # look up by position index (0 .. sequence_length - 1), not by token id
        position_inputs = tf.tile(tf.expand_dims(tf.range(self.sequence_length), 0),
                                  [tf.shape(self.batch_ph)[0], 1])
        position_embedded = tf.nn.embedding_lookup(positional_encoded, position_inputs)
        print('position_embedded.shape:', position_embedded.shape)
        encoded_inputs = tf.add(self.batch_embedded, position_embedded)
        encoder_emb_inp = tf.nn.dropout(encoded_inputs, 1.0 - self.dropout)

    # Self-attention
    with tf.name_scope('self-attention'):
        o1 = tf.identity(encoder_emb_inp)
        attention_ = Attention(num_heads=self.num_head,
                               masked=False,
                               linear_key_dim=self.linear_key_dim,
                               linear_value_dim=self.vocab_size,
                               model_dim=self.model_dim,
                               dropout=self.dropout)
        multi_head_output = attention_.multi_head(o1, o1, o1)
        o2 = tf.contrib.layers.layer_norm(tf.add(o1, multi_head_output))
        # ffn = FFN(w1_dim=self.model_dim, w2_dim=self.model_dim, dropout=self.dropout)
        o1 = tf.identity(o2)

    # Multi-layers
    # with tf.name_scope('multi-layers'):
    #     for i in range(1, self.num_layers + 1):
    #         with tf.variable_scope(f'layer-{i}'):
    #             o2 = tf.contrib.layers.layer_norm(tf.add(o1, multi_head_output))
    #             ffn = FFN(w1_dim=self.model_dim, w2_dim=self.model_dim, dropout=self.dropout)
    #             o21 = ffn.dense_relu_dense(o2)
    #             o3 = tf.contrib.layers.layer_norm(tf.add(o2, o21))
    #             o1 = tf.identity(o3)

    o4 = tf.reduce_sum(o1, axis=2)
    print('o4.shape:', o4.shape)

    # Fully connected output layer
    with tf.name_scope('fully_layer'):
        inputsize = int(o4.shape[-1])
        w = tf.Variable(tf.random_normal([inputsize, 1], -0.05, 0.05))
        f = tf.matmul(o4, w)
        y_hat = tf.squeeze(f)

    with tf.name_scope('Metrics'):
        # Cross-entropy loss and optimizer initialization
        loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_hat, labels=self.target_ph))
        tf.summary.scalar('loss', loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=1e-3, beta1=0.9, beta2=0.999,
                                           epsilon=1e-08, use_locking=True).minimize(loss)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(y_hat)), self.target_ph), tf.float32))
        tf.summary.scalar('accuracy', accuracy)

    y_hat_ = tf.round(tf.sigmoid(y_hat))
    merged = tf.summary.merge_all()  # merge all summary ops; variables are initialized once the session starts

    # Batch generators
    train_batch_generator = self.batch_generator(self.X_train, self.y_train, self.batch_size)
    test_batch_generator = self.batch_generator(self.X_test, self.y_test, self.batch_size)

    session_conf = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    saver = tf.train.Saver()
    train_writer = tf.summary.FileWriter('./logdir/train', accuracy.graph)
    test_writer = tf.summary.FileWriter('./logdir/test', accuracy.graph)

    with tf.Session(config=session_conf) as sess:
        sess.run(tf.global_variables_initializer())
        print('Start learning...')
        fp = open('./result.txt', 'w', encoding='utf8')
        for epoch in range(self.numb_epoch):
            fp.write(str(epoch) + '\n')
            print('epoch:%d' % epoch)
            loss_train = 0
            loss_test = 0
            accuracy_train = 0
            accuracy_test = 0
            print('epoch:{}\t'.format(epoch), end="")

            # Training
            num_batches = self.X_train.shape[0] // self.batch_size
            train_yT, train_pred, alphas_qq, alphas_qq1 = [], [], [], []
            for b in tqdm(range(num_batches)):
                x_batch, y_batch, indice = next(train_batch_generator)
                loss_tr, acc, _, summary, y_pred = sess.run(
                    [loss, accuracy, optimizer, merged, y_hat_],
                    feed_dict={self.batch_ph: x_batch,
                               self.target_ph: y_batch,
                               self.keep_prob_ph: self.keep_prob})
                train_yT.extend(y_batch.tolist())
                train_pred.extend(y_pred.tolist())
                accuracy_train += acc
                loss_train = loss_tr * self.delta + loss_train * (1 - self.delta)
                train_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_train /= num_batches
            precision_train = metrics.precision_score(train_yT, train_pred, average='macro')
            recall_train = metrics.recall_score(train_yT, train_pred)
            f1_train = metrics.f1_score(train_yT, train_pred)
            print('loss:{:.4f},acc:{:.4f},precision:{:.4f},recall:{:.4f},f1_score:{:.4f}'.format(
                loss_train, accuracy_train, precision_train, recall_train, f1_train))
            fp.write('train_loss:' + str(loss_train) + ' ' +
                     'train_acc:' + str(accuracy_train) + ' ' +
                     'train_precision:' + str(precision_train) + ' ' +
                     'train_recall:' + str(recall_train) + ' ' +
                     'train_f1_score:' + str(f1_train) + '\n')

            # Testing
            test_yT, test_pred = [], []
            num_batches = self.X_test.shape[0] // self.batch_size
            for b in tqdm(range(num_batches)):
                x_batch, y_batch, indice = next(test_batch_generator)
                loss_test_batch, acc, summary, y_pred = sess.run(
                    [loss, accuracy, merged, y_hat_],
                    feed_dict={self.batch_ph: x_batch,
                               self.target_ph: y_batch,
                               self.keep_prob_ph: 1.0})
                test_yT.extend(y_batch.tolist())
                test_pred.extend(y_pred.tolist())
                accuracy_test += acc
                loss_test += loss_test_batch
                test_writer.add_summary(summary, b + num_batches * epoch)
            accuracy_test /= num_batches
            loss_test /= num_batches
            precision_test = metrics.precision_score(test_yT, test_pred)
            recall_test = metrics.recall_score(test_yT, test_pred)
            f1_test = metrics.f1_score(test_yT, test_pred)
            print('loss_test:{:.4f},accuracy_test:{:.4f},precision_test:{:.4f},recall_test:{:.4f},f1_score_test:{:.4f}'.format(
                loss_test, accuracy_test, precision_test, recall_test, f1_test))
            fp.write('test_loss:' + str(loss_test) + ' ' +
                     'test_acc:' + str(accuracy_test) + ' ' +
                     'test_precision:' + str(precision_test) + ' ' +
                     'test_recall:' + str(recall_test) + ' ' +
                     'test_f1_score:' + str(f1_test) + '\n')
            saver.save(sess, self.model_path + str(epoch))

        train_writer.close()
        test_writer.close()
        fp.close()
        print("Run 'tensorboard --logdir=./logdir' to check out TensorBoard logs.")
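# define_layer() above relies on a self.positional_encoding(dim, sentence_length)
# helper that is not shown in this section. A plausible sketch follows; it is an
# assumption based on the sinusoidal encoding from "Attention Is All You Need"
# (the same formula appears inline in ocr_test() below), not necessarily the
# author's exact implementation. It returns a [sentence_length, dim] table that
# can be indexed with tf.nn.embedding_lookup.

import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x


def sinusoidal_position_table(dim, sentence_length, dtype=tf.float32):
    """Hypothetical positional-encoding table: rows are positions, columns are dimensions."""
    encoded_vec = np.array([pos / np.power(10000, 2 * i / dim)
                            for pos in range(sentence_length)
                            for i in range(dim)])
    encoded_vec[::2] = np.sin(encoded_vec[::2])    # even indices -> sin
    encoded_vec[1::2] = np.cos(encoded_vec[1::2])  # odd indices  -> cos
    return tf.convert_to_tensor(encoded_vec.reshape([sentence_length, dim]), dtype=dtype)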
def ocr_test(self, vocab_size, max_seq=25, batch_size=8):
    """
    Builds the graph, returns the "important" ops
    """
    inputs = tf.placeholder(tf.float32, [None, 128, 400, 3])
    output = tf.placeholder(tf.int32, [None, max_seq])
    length = tf.placeholder(tf.int32, [None])

    resnet_34 = ResNet(34, 10)

    def resnet_34_backbone(x):
        out = resnet_34.network(x, is_training=False)
        print(out)
        return out

    feature_map_resnet = resnet_34_backbone(inputs)  # feature map of ResNet-34
    feature_map = transform_dimension(feature_map_resnet, 1024)
    # print("feature map: ", feature_map)

    for i in range(6):
        global_representation = bottle_resblock(
            feature_map_resnet if i == 0 else global_representation,
            512,
            is_training=False,
            scope='bottle_resblock_' + str(i))
    global_representation = global_avg_pooling(global_representation)
    global_representation = fully_conneted(global_representation, 512)

    ############################################ DECODER ############################################
    def decoder_embedding(y, vocab_size, embed_size=512, shifted=True):
        embeddings = tf.random_normal(shape=(vocab_size, embed_size))
        embedded = tf.nn.embedding_lookup(embeddings, y)
        return embedded

    def positional_encoding(x):
        """
        Not as described in the paper, since it lacked a proper description of this step.
        This function is based on the "Attention Is All You Need" paper.
        """
        seq_len, dim = x.get_shape().as_list()[-2:]
        encoded_vec = np.array([
            pos / np.power(10000, 2 * i / dim)
            for pos in range(seq_len) for i in range(dim)
        ])
        encoded_vec[::2] = np.sin(encoded_vec[::2])
        encoded_vec[1::2] = np.cos(encoded_vec[1::2])
        encoded_vec_tensor = tf.convert_to_tensor(encoded_vec.reshape([seq_len, dim]), dtype=tf.float32)
        return tf.add(x, encoded_vec_tensor)

    def layer_norm(x):
        """
        Layer normalization as described in the paper (p. 4)
        """
        return tf.contrib.layers.layer_norm(x)

    y = decoder_embedding(output, vocab_size)
    y = tf.pad(y, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # shift right, as in the official Transformer
    # print("embedding: ", y)
    y = positional_encoding(y)
    # print("PE: ", y)  # (bs, seq_len, 512)

    # concatenate with the global representation
    decoder_input = []
    for i in range(y.get_shape().as_list()[1]):
        decoder_input.append(tf.concat([global_representation, y[:, i, :]], 1))  # (bs, 1, 512)
    decoder_input = tf.stack(decoder_input, 1)

    #### MASKED SELF-ATTENTION ####
    masked_self_attention = Attention(dropout=0)
    decoder_output = masked_self_attention.multi_head(decoder_input, decoder_input, decoder_input)
    norm_1 = layer_norm(decoder_output)
    decoder_output = decoder_input + norm_1

    #### 2D SELF-ATTENTION ####
    two_D_attention = Attention(masked=False, dropout=0)
    rrr = feature_map.get_shape().as_list()[1] * feature_map.get_shape().as_list()[2]
    enc_reshape = tf.reshape(feature_map, [-1, rrr, decoder_output.get_shape().as_list()[-1]])
    decoder_output_2 = two_D_attention.multi_head(decoder_output, enc_reshape, enc_reshape)
    norm_2 = layer_norm(decoder_output_2)
    decoder_output = decoder_output + norm_2

    def position_wise_feed_forward_network(x):
        """
        Position-wise feed-forward network as described in the paper (p. 4)
        """
        # First linear
        # linear_1 = tf.layers.dense(x, x.get_shape().as_list()[-1])
        linear_1 = tf.layers.conv1d(x, 2048, 1)
        # ReLU
        relu_1 = tf.nn.relu(linear_1)
        # Second linear
        linear_2 = tf.layers.conv1d(relu_1, x.get_shape().as_list()[-1], 1)
        return tf.nn.dropout(linear_2, 1)  # keep_prob=1, i.e. dropout disabled

    pwff = position_wise_feed_forward_network(decoder_output)
    norm_3 = layer_norm(pwff)
    decoder_output = decoder_output + norm_3

    # output_probabilities = tf.layers.dense(decoder_output, vocab_size, activation=tf.contrib.layers.softmax)
    output_probabilities = tf.layers.dense(decoder_output, vocab_size)

    ids, log_probs, scores = self.char_predictions(output_probabilities, vocab_size, max_seq)
    probs = tf.nn.softmax(output_probabilities)

    init = tf.global_variables_initializer()

    return inputs, output, length, scores, ids, probs, init
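# A hedged usage sketch for the inference graph built by ocr_test() above.
# Shapes follow the placeholders it creates (128x400x3 images, target sequences
# of length max_seq); the `model` instance, vocab_size=80, and the dummy batch
# are assumptions for illustration only.

import numpy as np
import tensorflow as tf  # assumes TensorFlow 1.x

# model = ...  # instance of the class that defines ocr_test()
inputs, output, length, scores, ids, probs, init = model.ocr_test(vocab_size=80, max_seq=25)

dummy_images = np.zeros([8, 128, 400, 3], dtype=np.float32)  # batch of 8 images
dummy_targets = np.zeros([8, 25], dtype=np.int32)            # decoder input tokens
dummy_lengths = np.full([8], 25, dtype=np.int32)

with tf.Session() as sess:
    sess.run(init)
    pred_ids, pred_probs = sess.run([ids, probs],
                                    feed_dict={inputs: dummy_images,
                                               output: dummy_targets,
                                               length: dummy_lengths})
    print(pred_ids.shape, pred_probs.shape)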
def convolutional_attention_network(self, vocab_size, max_seq=25, batch_size=8):
    """
    Builds the graph
    """
    inputs = tf.placeholder(tf.float32, [batch_size, 128, 400, 3])
    output = tf.placeholder(tf.int32, [batch_size, max_seq])
    length = tf.placeholder(tf.int32, [batch_size])

    resnet_34 = ResNet(34, 10)

    def resnet_34_backbone(x):
        out = resnet_34.network(x)
        print(out)
        return out

    feature_map_resnet = resnet_34_backbone(inputs)  # feature map of ResNet-34
    feature_map = transform_dimension(feature_map_resnet, 1024)

    for i in range(6):
        global_representation = bottle_resblock(
            feature_map_resnet if i == 0 else global_representation,
            512,
            scope='bottle_resblock_' + str(i))
    global_representation = global_avg_pooling(global_representation)
    global_representation = fully_conneted(global_representation, 512)

    ############################################ DECODER ############################################
    def decoder_embedding(y, vocab_size, embed_size=512, shifted=True):
        embeddings = tf.random_normal(shape=(vocab_size, embed_size))
        embedded = tf.nn.embedding_lookup(embeddings, y)
        return embedded

    def positional_encoding(x):
        seq_len, dim = x.get_shape().as_list()[-2:]
        encoded_vec = np.array([
            pos / np.power(10000, 2 * i / dim)
            for pos in range(seq_len) for i in range(dim)
        ])
        encoded_vec[::2] = np.sin(encoded_vec[::2])
        encoded_vec[1::2] = np.cos(encoded_vec[1::2])
        encoded_vec_tensor = tf.convert_to_tensor(encoded_vec.reshape([seq_len, dim]), dtype=tf.float32)
        return tf.add(x, encoded_vec_tensor)

    def layer_norm(x):
        return tf.contrib.layers.layer_norm(x)

    y = decoder_embedding(output, vocab_size)
    y = tf.pad(y, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # shift right, as in the official Transformer
    y = positional_encoding(y)  # (bs, seq_len, 512)

    # concatenate with the global representation
    decoder_input = []
    for i in range(y.get_shape().as_list()[1]):
        decoder_input.append(tf.concat([global_representation, y[:, i, :]], 1))  # (bs, 1, 512)
    decoder_input = tf.stack(decoder_input, 1)  # (bs, seq_len, 1024)

    #### MASKED SELF-ATTENTION ####
    masked_self_attention = Attention(dropout=0)
    decoder_output = masked_self_attention.multi_head(decoder_input, decoder_input, decoder_input)
    norm_1 = layer_norm(decoder_output)
    decoder_output = decoder_input + norm_1

    #### 2D SELF-ATTENTION ####
    two_D_attention = Attention(masked=False, dropout=0)
    enc_reshape = tf.reshape(feature_map, [
        decoder_output.get_shape().as_list()[0], -1,
        decoder_output.get_shape().as_list()[-1]
    ])
    decoder_output_2 = two_D_attention.multi_head(decoder_output, enc_reshape, enc_reshape)
    norm_2 = layer_norm(decoder_output_2)
    decoder_output = decoder_output + norm_2

    def position_wise_feed_forward_network(x):
        # using conv1d
        # First linear
        linear_1 = tf.layers.conv1d(x, 2048, 1)
        # ReLU
        relu_1 = tf.nn.relu(linear_1)
        # Second linear
        linear_2 = tf.layers.conv1d(relu_1, x.get_shape().as_list()[-1], 1)
        return tf.nn.dropout(linear_2, 1)  # keep_prob=1, i.e. dropout disabled

    pwff = position_wise_feed_forward_network(decoder_output)
    norm_3 = layer_norm(pwff)
    decoder_output = decoder_output + norm_3

    output_probabilities = tf.layers.dense(decoder_output, vocab_size)

    loss = self._compute_loss(output_probabilities, output, length, batch_size)
    ids, log_probs, scores = self.char_predictions(output_probabilities, vocab_size, max_seq)
    char_acc = char_accuracy(ids, output, 0)
    word_acc = sequence_accuracy(ids, output, 0)

    with tf.name_scope('summaries'):
        tf.summary.scalar("loss", loss, collections=["train_summary"])
        tf.summary.scalar("character accuracy", char_acc, collections=["train_summary"])
        tf.summary.scalar("word accuracy", word_acc, collections=["train_summary"])
    summary_op = tf.summary.merge_all(key='train_summary')

    optimizer = tf.train.AdadeltaOptimizer(learning_rate=1).minimize(loss)
    init = tf.global_variables_initializer()

    return inputs, output, length, loss, optimizer, output_probabilities, summary_op, init, word_acc
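# A hedged training-loop sketch for the graph built by convolutional_attention_network()
# above. The `model` instance, the get_batch() data loader, and vocab_size=80 are
# assumptions for illustration; the feed shapes match the placeholders in the graph
# (batch_size=8, 128x400x3 images, max_seq=25 token labels).

import tensorflow as tf  # assumes TensorFlow 1.x

# model = ...  # instance of the class that defines convolutional_attention_network()
(inputs, output, length, loss, optimizer,
 output_probabilities, summary_op, init, word_acc) = model.convolutional_attention_network(
    vocab_size=80, max_seq=25, batch_size=8)

writer = tf.summary.FileWriter('./logdir/ocr_train')
with tf.Session() as sess:
    sess.run(init)
    for step in range(1000):
        # images: [8, 128, 400, 3] float32; labels: [8, 25] int32; lengths: [8] int32
        images, labels, label_lengths = get_batch(8)  # hypothetical data loader
        _, loss_val, acc_val, summary = sess.run(
            [optimizer, loss, word_acc, summary_op],
            feed_dict={inputs: images, output: labels, length: label_lengths})
        writer.add_summary(summary, step)
        if step % 100 == 0:
            print('step {}: loss={:.4f}, word_acc={:.4f}'.format(step, loss_val, acc_val))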