def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]
    print(x_train.shape)
    print(x_test.shape)

    # expand (batch_size, MAX_SENTENCE_LENGTH, EMBEDDING_SIZE)
    # to (batch_size, MAX_SENTENCE_LENGTH, EMBEDDING_SIZE, 1)
    x_train = numpy.expand_dims(x_train, -1)
    x_test = numpy.expand_dims(x_test, -1)

    filter_sizes = [2, 3, 4, 5, 6]
    filter_numbers = [300, 200, 150, 100, 100]

    # input is a batch of sentences
    train_data_node = tf.placeholder(
        tf.float32,
        shape=(None, max_document_length, EMBEDDING_SIZE, NUM_CHANNELS))
    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # fully connected softmax layer
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    fc1_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(data):
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            conv = conv2d(data,
                          filter_numbers[idx],
                          filter_size,
                          EMBEDDING_SIZE,
                          name="kernel%d" % idx)
            # 1-max pooling leaves a tensor of shape [batch_size, 1, 1, num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))
        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]
        # add dropout
        reshape = tf.nn.dropout(cnn_output, dropout_keep_prob)
        # fc1 layer
        fc1_output = tf.matmul(reshape, fc1_weights) + fc1_biases
        return fc1_output

    # training computation
    logits = model(train_data_node)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases))
    loss += 0.1 * regularizers
    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate = tf.train.exponential_decay(start_learning_rate, global_step * BATCH_SIZE, train_size, 0.9, staircase=True)
    tf.scalar_summary('lr', learning_rate)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))
        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0
        }
        # Run the graph and fetch some of the nodes.
        # Evaluation does not run train_op (train_op applies the gradient update).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g}, acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, losses, acc))
        # compute metrics
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether the learning rate needs to decay
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print('validation loss did not improve enough, decay learning rate')
                current_learning_rate = min_learning_rate if lr * learning_rate_decay < min_learning_rate else lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # training loop: for each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes,
                    # recording full trace metadata for TensorBoard.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
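# The train() above relies on a conv2d() helper that is defined elsewhere in the
# project.  Below is a minimal sketch of what such a helper could look like; the
# kernel shape and VALID padding are assumptions inferred from the
# max_document_length - filter_size + 1 pooling window used above, and the ReLU
# nonlinearity is an assumption, not necessarily the author's implementation.
def conv2d_sketch(data, num_filters, filter_size, embedding_size, name):
    """Convolve [batch, length, embedding_size, 1] input with full-width filters."""
    with tf.variable_scope(name):
        kernel = tf.Variable(
            tf.truncated_normal([filter_size, embedding_size, 1, num_filters],
                                stddev=0.1, dtype=tf.float32),
            name="weights")
        biases = tf.Variable(tf.constant(0.01, shape=[num_filters]), name="biases")
        conv = tf.nn.conv2d(data, kernel, strides=[1, 1, 1, 1], padding='VALID')
        # output shape: [batch, length - filter_size + 1, 1, num_filters],
        # which matches the 1-max pooling window used in model() above
        return tf.nn.relu(tf.nn.bias_add(conv, biases))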
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    x_train = x_shuffled[1000:]
    y_train = y_shuffled[1000:]
    x_test = x_shuffled[:1000]
    y_test = y_shuffled[:1000]
    print(x_train.shape)
    print(x_test.shape)

    # input is a single sentence
    # [n, embed]
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(max_document_length,
                                            EMBEDDING_SIZE))
    # [num_class]
    train_labels_node = tf.placeholder(tf.float32, shape=(NUM_CLASSES, ))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

    # convolution weight
    wf_weights = tf.Variable(
        tf.truncated_normal([d_c, EMBEDDING_SIZE],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    wf_biases = tf.Variable(
        tf.constant(0.01, shape=[max_document_length], dtype=tf.float32))
    # attention matrix
    u_weights = tf.Variable(
        tf.truncated_normal([d_c, d_c],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    # class embeddings matrix
    classes_matrix = tf.Variable(
        tf.truncated_normal([d_c, NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))

    # model
    # data = [max_document_length, EMBEDDING_SIZE]
    def model(data):
        # R = [d_c, n]
        R = tf.matmul(wf_weights, data, transpose_b=True)
        # convolution_output = [d_c, n]
        convolution_output = tf.tanh(tf.nn.bias_add(R, wf_biases))
        # attention
        G_part = tf.matmul(tf.transpose(convolution_output), u_weights)
        # correlation_matrix = [n, num_class]
        correlation_matrix = tf.matmul(G_part, classes_matrix)
        # apply softmax to get the attention pooling matrix
        # attention_pool = [n, num_class]
        attention_pool = tf.nn.softmax(correlation_matrix, dim=0)
        # compute output
        # W = [d_c, num_class]
        W = tf.matmul(convolution_output, attention_pool)
        # output = [d_c]
        output = tf.reduce_max(W, reduction_indices=-1)
        return output

    # score all classes
    # w_o = [d_c]
    # classes_embeddings = [num_class, d_c]
    def score_classes(w_o, classes_embeddings):
        # normalize the class embeddings
        normalized_classes_embeddings = tf.nn.l2_normalize(
            classes_embeddings, dim=-1)
        all_class_embeddings = [
            tf.squeeze(one)
            for one in tf.split(0, NUM_CLASSES, normalized_classes_embeddings)
        ]
        scores = []
        normalized_w_o = tf.nn.l2_normalize(w_o, dim=-1)
        for class_embedding in all_class_embeddings:
            scores.append(tf.nn.l2_loss(normalized_w_o - class_embedding))
        # transform to a tensor
        scores = tf.pack(scores)
        return scores

    # label = [num_class], int
    # scores = [num_class], float
    # the negative score is the lowest score excluding the true class score
    def get_predict_neg_score(scores, label):
        # dot product with the one-hot label picks out the true class score
        ground_index = tf.argmax(label, axis=0)
        ground_score = tf.reduce_sum(tf.mul(scores, tf.cast(label, tf.float32)))
        # the negative class is the remaining class with the lowest score
        # (largest value after negation)
        reversed_scores = tf.negative(scores)
        top_values, top_indices = tf.nn.top_k(reversed_scores, k=2)
        true_flag = tf.nn.in_top_k(tf.expand_dims(reversed_scores, 0),
                                   tf.expand_dims(ground_index, 0), 1)
        top_1_index = tf.cast(true_flag, tf.int32)
        chosen_score = tf.negative(
            tf.squeeze(tf.gather(top_values, top_1_index)))
        return ground_score, chosen_score

    def get_true_predict_indice(scores, label):
        true_indices = tf.argmax(label, axis=0)
        top_value, top_indices = tf.nn.top_k(tf.negative(scores), k=1)
        predict_indices = tf.squeeze(tf.pack(top_indices))
        return true_indices, predict_indices

    # training computation
    w_o = model(train_data_node)
    scores = score_classes(w_o, tf.transpose(classes_matrix))
    true_score, neg_score = get_predict_neg_score(scores, train_labels_node)
    true_index, predict_index = get_true_predict_indice(
        scores, train_labels_node)
    # margin-based ranking loss
    loss = true_score + 1 - neg_score
    # L2 regularization for the model parameters.
    regularizers = tf.nn.l2_loss(wf_weights) + tf.nn.l2_loss(
        wf_biases) + tf.nn.l2_loss(u_weights) + tf.nn.l2_loss(classes_matrix)
    loss += 0.01 * regularizers
    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    # learning_rate = tf.train.exponential_decay(start_learning_rate, global_step * BATCH_SIZE, train_size, 0.9, staircase=True)
    tf.scalar_summary('lr', learning_rate)
    # optimizer = tf.train.AdamOptimizer(learning_rate)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # evaluate model: 0 is wrong, 1 is right
    # train_is_correct = tf.cast(tf.equal(true_index, predict_index), tf.float32)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))
        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        test_size = len(x_batch)
        true_label = []
        predict_label = []
        test_loss = []
        current_step = 0
        current_lr = 0
        for i in range(test_size):
            one_feed_dict = {
                train_data_node: x_batch[i],
                train_labels_node: y_batch[i],
                dropout_keep_prob: 1.0
            }
            # Run the graph and fetch some of the nodes.
            # Evaluation does not run train_op (train_op applies the gradient update).
            test_step, lr, result_loss, result_true, result_predict = sess.run(
                [global_step, learning_rate, loss, true_index, predict_index],
                feed_dict=one_feed_dict)
            true_label.append(result_true)
            predict_label.append(result_predict)
            test_loss.append(result_loss)
            # test_writer.add_summary(test_summary, test_step)
            current_step = test_step
            current_lr = lr
        # compute average loss
        average_loss = numpy.mean(test_loss)
        test_time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g} ".format(
            test_time_str, current_step, average_loss, current_lr))
        # compute metrics
        compute_index(true_label, predict_label)

        new_best_test_loss = best_test_loss
        # decide whether the learning rate needs to decay
        if (current_step % steps_each_check < 100) and (current_step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - average_loss
            if best_test_loss is not None and loss_delta < decay_delta:
                print('validation loss did not improve enough, decay learning rate')
                current_learning_rate = min_learning_rate if current_lr * learning_rate_decay < min_learning_rate else current_lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = average_loss

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        # train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph)
        # test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # training loop: for each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    train_size = len(x_batch)
                    true_label = []
                    predict_label = []
                    train_loss = []
                    # Run the graph and fetch some of the nodes,
                    # recording full trace metadata.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    current_step = 0
                    for i in range(train_size):
                        feed_dict = {
                            train_data_node: x_batch[i],
                            train_labels_node: y_batch[i],
                            dropout_keep_prob: 0.5
                        }
                        _, step, result_loss, result_true, result_predict = sess.run(
                            [
                                train_op, global_step, loss, true_index,
                                predict_index
                            ],
                            feed_dict=feed_dict,
                            options=run_options,
                            run_metadata=run_metadata)
                        true_label.append(result_true)
                        predict_label.append(result_predict)
                        train_loss.append(result_loss)
                        current_step = step
                        # train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
                        # train_writer.add_summary(summary, step)
                    # compute average loss
                    average_loss = numpy.mean(train_loss)
                    acc = accuracy_score(true_label, predict_label)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, current_step, average_loss, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    train_size = len(x_batch)
                    true_label = []
                    predict_label = []
                    train_loss = []
                    current_step = 0
                    for i in range(train_size):
                        feed_dict = {
                            train_data_node: x_batch[i],
                            train_labels_node: y_batch[i],
                            dropout_keep_prob: 0.5
                        }
                        _, step, result_loss, result_true, result_predict = sess.run(
                            [
                                train_op, global_step, loss, true_index,
                                predict_index
                            ],
                            feed_dict=feed_dict)
                        true_label.append(result_true)
                        predict_label.append(result_predict)
                        train_loss.append(result_loss)
                        # train_writer.add_summary(summary, step)
                        current_step = step
                    average_loss = numpy.mean(train_loss)
                    acc = accuracy_score(true_label, predict_label)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, current_step, average_loss, acc))
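# A small NumPy sketch (not part of the model above) of how the distance-based
# class scores and the margin loss `true_score + 1 - neg_score` behave.  The
# helper name and the numbers are made up purely for illustration; it mirrors
# score_classes/get_predict_neg_score under the assumption that the score of a
# class is 0.5 * ||normalize(w_o) - normalize(class_k)||^2, as tf.nn.l2_loss computes.
import numpy


def toy_margin_loss(w_o, class_embeddings, true_idx, margin=1.0):
    """Return (loss, predicted class index) for one sentence vector."""
    w = w_o / numpy.linalg.norm(w_o)
    classes = class_embeddings / numpy.linalg.norm(
        class_embeddings, axis=1, keepdims=True)
    scores = 0.5 * numpy.sum((w - classes) ** 2, axis=1)
    predict_idx = int(numpy.argmin(scores))             # smallest distance wins
    true_score = scores[true_idx]
    neg_score = numpy.min(numpy.delete(scores, true_idx))  # best wrong-class score
    # loss drops below the margin once the true class is the closest embedding
    loss = true_score + margin - neg_score
    return loss, predict_idx


toy_loss, toy_pred = toy_margin_loss(
    numpy.array([0.2, 0.9]),
    numpy.array([[0.1, 1.0], [1.0, 0.0], [0.5, 0.5]]),
    true_idx=0)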
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    # x = [N_Samples, max_document_length, EMBEDDING_SIZE]
    # y = [N_Samples, NUM_CLASSES]
    x_train = x_shuffled[Test_Size:]
    y_train = y_shuffled[Test_Size:]
    x_test = x_shuffled[:Test_Size]
    y_test = y_shuffled[:Test_Size]
    print(x_train.shape)
    print(x_test.shape)
    print("exception words : " +
          str(dependency_load_data.get_exception_number()))

    # check the validation loss every 500 steps
    steps_each_check = 500

    # input is a batch of sentences
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, NUM_STEPS, EMBEDDING_SIZE))
    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    is_training = tf.placeholder(tf.bool, name="is_training")

    # CNN
    filter_sizes = [2, 3, 4, 5, 6]
    filter_numbers = [300, 200, 150, 100, 100]

    # fully connected and softmax layers
    fc1_weights = tf.Variable(
        tf.truncated_normal([sum(filter_numbers), 100],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    fc1_biases = tf.Variable(tf.constant(0.01, shape=[100], dtype=tf.float32))
    fc2_weights = tf.Variable(
        tf.truncated_normal([100, NUM_CLASSES],
                            stddev=0.1,
                            seed=SEED,
                            dtype=tf.float32))
    fc2_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(x):
        # current data input shape: (batch_size, n_steps, n_input)
        x = tf.transpose(x, [1, 0, 2])
        # (n_steps * batch_size, n_input)
        x = tf.reshape(x, [-1, EMBEDDING_SIZE])
        # get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, NUM_STEPS, x)

        # bidirectional LSTM
        with tf.variable_scope("fw_cell"):
            fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
            # fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=dropout_keep_prob)
            if rnn_layer > 1:
                fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)
        with tf.variable_scope("bw_cell"):
            bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                              forget_bias=1.0,
                                              state_is_tuple=True)
            # bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=dropout_keep_prob)
            if rnn_layer > 1:
                bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        # each output = [batch_size, num_hidden*2]
        # the Bi-LSTM outputs feed the highway layer
        with tf.variable_scope("rnn_def"):
            outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
                fw_cell, bw_cell, x, dtype=tf.float32)

        # highway
        # convert to [batch_size, num_steps, num_hidden*2]
        hw_input = tf.transpose(tf.pack(outputs, axis=0), [1, 0, 2])
        # convert to [batch_size x num_steps, num_hidden*2]
        hw_input = tf.reshape(hw_input, [-1, num_hidden * 2])
        size = hw_input.get_shape()[1]
        # size = num_hidden*2
        # hw_output = [batch_size x num_steps, num_hidden*2]
        hw_output = highways(hw_input, size)
        # convert to [batch_size, num_steps, num_hidden*2]
        hw_output = tf.reshape(hw_output, [-1, NUM_STEPS, num_hidden * 2])
        # expand dims: cnn_input = [batch_size, num_steps, num_hidden*2, 1]
        cnn_input = tf.expand_dims(hw_output, -1)

        # CNN
        pooled_outputs = []
        for idx, filter_size in enumerate(filter_sizes):
            conv = conv2d(cnn_input,
                          filter_numbers[idx],
                          filter_size,
                          num_hidden * 2,
                          name="kernel%d" % idx)
            # conv = batch_norm_conv2d(cnn_input, filter_numbers[idx], filter_size, idx, num_hidden*2, is_training, stddev=0.1, name="kernel%d" % idx)
            # 1-max pooling leaves a tensor of shape [batch_size, 1, 1, num_filters]
            pool = tf.nn.max_pool(
                conv,
                ksize=[1, max_document_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID')
            pooled_outputs.append(tf.squeeze(pool))
        if len(filter_sizes) > 1:
            cnn_output = tf.concat(1, pooled_outputs)
        else:
            cnn_output = pooled_outputs[0]
        # add dropout
        cnn_output = tf.nn.dropout(cnn_output, dropout_keep_prob)

        # fc1 layer
        hidden = tf.matmul(cnn_output, fc1_weights)
        # add batch normalization
        # hidden = official_batch_norm_layer(tf.nn.bias_add(hidden, fc1_biases), 100, is_training, False, scope="fc1_batch_norm")
        fc1_output = tf.sigmoid(tf.nn.bias_add(hidden, fc1_biases))
        # softmax linear layer: no activation function applied here
        hidden = tf.matmul(fc1_output, fc2_weights)
        fc2_output = tf.nn.bias_add(hidden, fc2_biases)
        return fc2_output

    # training computation
    # logits = [batch_size, num_classes]
    logits = model(train_data_node)
    # clip the logits before the cross entropy
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    regularization = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                      tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    loss += 0.01 * regularization
    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    # learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 5000, 0.5, staircase=True)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    tf.scalar_summary('lr', learning_rate)
    # Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    # all variables
    # for v in tf.all_variables():
    #     print(v.name)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))
        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0,
            is_training: False
        }
        # Run the graph and fetch some of the nodes.
        # Evaluation does not run train_op (train_op applies the gradient update).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g}, acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, losses, acc))
        # compute metrics
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether the learning rate needs to decay
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print('validation loss did not improve enough, decay learning rate')
                current_learning_rate = min_learning_rate if lr * learning_rate_decay < min_learning_rate else lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # training loop: for each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes,
                    # recording full trace metadata for TensorBoard.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.5,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
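# The model above calls a highways() helper that is not defined in this file.  A
# minimal single-layer highway sketch is shown below for reference; the function
# name, the gate bias of -2.0, and the ReLU transform are common defaults and
# assumptions here, not necessarily what the project's own implementation uses.
def highways_sketch(x, size, carry_bias=-2.0):
    """y = t * relu(W_H x + b_H) + (1 - t) * x, with transform gate t = sigmoid(W_T x + b_T)."""
    size = int(size)
    W_H = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="hw_W_H")
    b_H = tf.Variable(tf.constant(0.01, shape=[size]), name="hw_b_H")
    W_T = tf.Variable(tf.truncated_normal([size, size], stddev=0.1), name="hw_W_T")
    b_T = tf.Variable(tf.constant(carry_bias, shape=[size]), name="hw_b_T")
    transform = tf.nn.relu(tf.matmul(x, W_H) + b_H)
    gate = tf.sigmoid(tf.matmul(x, W_T) + b_T)
    # the carry gate (1 - gate) lets the Bi-LSTM features pass through unchanged
    return gate * transform + (1.0 - gate) * x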
def train(argv=None):
    # load data
    print("Loading data ... ")
    x_train, y_train = dependency_load_data.load_train_data()
    x_test, y_test = dependency_load_data.load_test_data()

    # concatenate and shuffle
    x_sum = numpy.concatenate((x_train, x_test))
    y_sum = numpy.concatenate((y_train, y_test))
    numpy.random.seed(10)
    shuffle_indices = numpy.random.permutation(numpy.arange(len(y_sum)))
    x_shuffled = x_sum[shuffle_indices]
    y_shuffled = y_sum[shuffle_indices]

    # split into train and test sets
    # x = [N_Samples, max_document_length, EMBEDDING_SIZE]
    # y = [N_Samples, NUM_CLASSES]
    x_train = x_shuffled[Test_Size:]
    y_train = y_shuffled[Test_Size:]
    x_test = x_shuffled[:Test_Size]
    y_test = y_shuffled[:Test_Size]
    print(x_train.shape)
    print(x_test.shape)

    # input is a batch of sentences
    train_data_node = tf.placeholder(tf.float32,
                                     shape=(None, NUM_STEPS, EMBEDDING_SIZE))
    train_labels_node = tf.placeholder(tf.float32, shape=(None, NUM_CLASSES))
    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    is_training = tf.placeholder(tf.bool, name="is_training")

    fc1_weights = tf.Variable(
        tf.random_normal([2 * num_hidden, 200])
        # tf.truncated_normal([num_hidden, NUM_CLASSES], stddev=0.1, seed=SEED, dtype=tf.float32)
    )
    fc1_biases = tf.Variable(tf.constant(0.01, shape=[200], dtype=tf.float32))
    fc2_weights = tf.Variable(
        tf.random_normal([200, NUM_CLASSES])
        # tf.truncated_normal([num_hidden, NUM_CLASSES], stddev=0.1, seed=SEED, dtype=tf.float32)
    )
    fc2_biases = tf.Variable(
        tf.constant(0.01, shape=[NUM_CLASSES], dtype=tf.float32))

    # model
    def model(x):
        # current data input shape: (batch_size, n_steps, n_input)
        x = tf.transpose(x, [1, 0, 2])
        # (n_steps * batch_size, n_input)
        x = tf.reshape(x, [-1, EMBEDDING_SIZE])
        # get a list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, NUM_STEPS, x)

        # bidirectional LSTM
        fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        # add output projection
        # fw_cell = tf.nn.rnn_cell.OutputProjectionWrapper(fw_cell, output_projection_size)
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(
            fw_cell, output_keep_prob=dropout_keep_prob)
        bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        # add output projection
        # bw_cell = tf.nn.rnn_cell.OutputProjectionWrapper(bw_cell, output_projection_size)
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(
            bw_cell, output_keep_prob=dropout_keep_prob)
        if rnn_layer > 1:
            fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)
            bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
            fw_cell, bw_cell, x, dtype=tf.float32)
        # initial_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)

        # handle all outputs; each output = [batch_size, num_hidden*2]
        # sum all outputs:
        # merge_output = tf.matmul(tf.add_n(outputs), fc1_weights) + fc1_biases
        # dim-max: element-wise maximum over all time steps
        dim_max = outputs[0]
        for output in outputs:
            dim_max = tf.maximum(dim_max, output)

        # fc1 layer
        hidden = tf.matmul(dim_max, fc1_weights) + fc1_biases
        # add batch normalization
        # hidden = official_batch_norm_layer(hidden, 200, is_training, False, scope="fc1_batch_norm")
        fc1_output = tf.tanh(hidden)
        # fc2 layer
        merge_output = tf.matmul(fc1_output, fc2_weights) + fc2_biases
        # merge_output = [batch_size, num_classes]
        return merge_output

    # training computation
    # logits = [batch_size, num_classes]
    logits = model(train_data_node)
    # clip the logits before the cross entropy
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            tf.clip_by_value(logits, 1e-10, 1.0), train_labels_node))
    # L2 regularization for the fully connected parameters.
    regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                    tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
    loss += 0.05 * regularizers
    tf.scalar_summary('loss', loss)

    # optimizer
    global_step = tf.Variable(0, name="global_step", trainable=False)
    learning_rate = tf.Variable(start_learning_rate, name="learning_rate")
    tf.scalar_summary('lr', learning_rate)
    # Adam optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # evaluate model
    train_predict = tf.argmax(logits, 1)
    train_label = tf.argmax(train_labels_node, 1)
    # train accuracy
    train_correct_pred = tf.equal(train_predict, train_label)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_pred, tf.float32))
    tf.scalar_summary('acc', train_accuracy)
    merged = tf.merge_all_summaries()

    def compute_index(y_label, y_predict):
        # macro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "macro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='macro'),
            f1_score(y_label, y_predict, average='macro')))
        # micro
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "micro", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='micro'),
            f1_score(y_label, y_predict, average='micro')))
        # weighted
        print("{}: acc {:g}, recall {:g}, f1 {:g} ".format(
            "weighted", accuracy_score(y_label, y_predict),
            recall_score(y_label, y_predict, average='weighted'),
            f1_score(y_label, y_predict, average='weighted')))

    def dev_step(x_batch, y_batch, best_test_loss, sess):
        feed_dict = {
            train_data_node: x_batch,
            train_labels_node: y_batch,
            dropout_keep_prob: 1.0,
            is_training: False
        }
        # Run the graph and fetch some of the nodes.
        # Evaluation does not run train_op (train_op applies the gradient update).
        summary, step, losses, lr, acc, y_label, y_predict = sess.run(
            [
                merged, global_step, loss, learning_rate, train_accuracy,
                train_label, train_predict
            ],
            feed_dict=feed_dict)
        test_writer.add_summary(summary, step)
        time_str = datetime.datetime.now().isoformat()
        print("{}: step {}, loss {:g}, lr {:g}, acc {:g}".format(
            time_str, step, losses, lr, acc))
        # print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, losses, acc))
        # compute metrics
        compute_index(y_label, y_predict)

        new_best_test_loss = best_test_loss
        # decide whether the learning rate needs to decay
        if (step % steps_each_check < 100) and (step > 100):
            loss_delta = (best_test_loss
                          if best_test_loss is not None else 0) - losses
            if best_test_loss is not None and loss_delta < decay_delta:
                print('validation loss did not improve enough, decay learning rate')
                current_learning_rate = min_learning_rate if lr * learning_rate_decay < min_learning_rate else lr * learning_rate_decay
                if current_learning_rate == min_learning_rate:
                    print('It is already the smallest learning rate.')
                sess.run(learning_rate.assign(current_learning_rate))
                print('new learning rate is: ', current_learning_rate)
            else:
                # update
                new_best_test_loss = losses

        return new_best_test_loss

    # run the training
    with tf.Session() as sess:
        train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train',
                                              sess.graph)
        test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                          BATCH_SIZE, NUM_EPOCHS)
        # batch count
        batch_count = 0
        best_test_loss = None
        # training loop: for each batch ...
        for batch in batches:
            batch_count += 1
            if batch_count % EVAL_FREQUENCY == 0:
                print("\nEvaluation:")
                best_test_loss = dev_step(x_test, y_test, best_test_loss, sess)
                print("")
            else:
                if batch_count % META_FREQUENCY == 99:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.4,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes,
                    # recording full trace metadata for TensorBoard.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    train_writer.add_run_metadata(run_metadata,
                                                  'step%03d' % step)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))
                else:
                    x_batch, y_batch = zip(*batch)
                    feed_dict = {
                        train_data_node: x_batch,
                        train_labels_node: y_batch,
                        dropout_keep_prob: 0.4,
                        is_training: True
                    }
                    # Run the graph and fetch some of the nodes.
                    _, summary, step, losses, acc = sess.run(
                        [train_op, merged, global_step, loss, train_accuracy],
                        feed_dict=feed_dict)
                    train_writer.add_summary(summary, step)
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, losses, acc))

        train_writer.close()
        test_writer.close()
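# The "dim-max" loop in model() above takes an element-wise maximum of the Bi-LSTM
# outputs over all time steps.  The sketch below shows an equivalent, more compact
# form using the same TF 0.x API as this file; it is illustrative only, not a
# drop-in change to the author's code.
def dim_max_sketch(outputs):
    """Max-over-time pooling: list of [batch, 2*num_hidden] -> [batch, 2*num_hidden]."""
    stacked = tf.pack(outputs)                        # [num_steps, batch, 2*num_hidden]
    return tf.reduce_max(stacked, reduction_indices=0)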