# Imports assumed by the snippets in this section (TensorFlow 1.x-era code).
import os
import time
import datetime

import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score

# Project-specific modules; the exact module paths are assumptions:
# import data_helpers
# from text_cnn import TextCNN
# from text_rnn import TextRNN
# from text_rcnn import TextRCNN
# from text_clstm import TextCLSTM


def model_selection(model_name):
    """Instantiate the requested model.

    Relies on module-level globals: train_x, train_y, vocab_processor,
    max_document_length and FLAGS.
    """
    if model_name == "cnn":
        return TextCNN(sequence_length=train_x.shape[1],
                       num_classes=train_y.shape[1],
                       vocab_size=len(vocab_processor.vocabulary_),
                       embedding_size=FLAGS.embedding_size,
                       filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                       num_filters=FLAGS.num_filters,
                       l2_reg_lambda=FLAGS.l2_reg_lambda)
    elif model_name == "rnn":
        return TextRNN(sequence_length=max_document_length,
                       num_classes=train_y.shape[1],
                       vocab_size=len(vocab_processor.vocabulary_),
                       embedding_size=FLAGS.embedding_size,
                       learning_rate=FLAGS.learning_rate,
                       batch_size=FLAGS.batch_size,
                       decay_steps=FLAGS.decay_steps,
                       decay_rate=FLAGS.decay_rate,
                       is_training=FLAGS.is_training)
    elif model_name == "rcnn":
        return TextRCNN(sequence_length=train_x.shape[1],
                        num_classes=train_y.shape[1],
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=FLAGS.embedding_size,
                        context_embedding_size=FLAGS.context_embedding_size,
                        cell_type=FLAGS.cell_type,
                        hidden_size=FLAGS.hidden_size,
                        l2_reg_lambda=FLAGS.l2_reg_lambda)
    elif model_name == "clstm":
        return TextCLSTM(max_len=max_document_length,
                         num_classes=train_y.shape[1],
                         vocab_size=len(vocab_processor.vocabulary_),
                         embedding_size=FLAGS.embedding_size,
                         filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
                         num_filters=FLAGS.num_filters,
                         num_layers=FLAGS.num_layers,
                         l2_reg_lambda=FLAGS.l2_reg_lambda)
    else:
        raise NotImplementedError("%s is not implemented" % model_name)
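# A minimal usage sketch for model_selection(), not part of the original scripts.
# It builds the module-level globals the function reads, mirroring the preprocessing
# in train() below; the FLAGS values (train_dir, max_sentence_length, ...) are assumed
# to be defined elsewhere, and the variable names are illustrative only.

# x_text, train_y = data_helpers.load_data_and_labels(FLAGS.train_dir)
# max_document_length = FLAGS.max_sentence_length
# vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)
# train_x = np.array(list(vocab_processor.fit_transform(x_text)))
#
# model = model_selection("rcnn")  # or "cnn", "rnn", "clstm"; anything else raises NotImplementedError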
def train():
    with tf.device('/cpu:0'):
        x_text, y = data_helpers.load_data_and_labels(FLAGS.train_dir)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # => [27 39 40 41 42 1 43 0 0 ... 0]
    # dimension = FLAGS.max_sentence_length
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print("Text Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("x = {0}".format(x.shape))
    print("y = {0}".format(y.shape))
    print("")

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    # TODO: this is very crude; cross-validation would be better
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = TextRCNN(sequence_length=x_train.shape[1],
                            num_classes=y_train.shape[1],
                            vocab_size=len(vocab_processor.vocabulary_),
                            text_embedding_size=FLAGS.text_embedding_dim,
                            context_embedding_size=FLAGS.context_embedding_dim,
                            cell_type=FLAGS.cell_type,
                            hidden_size=FLAGS.hidden_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rcnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rcnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists,
            # so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained word2vec
            if FLAGS.word2vec:
                # Initialize the embedding matrix with random uniform values
                initW = np.random.uniform(-0.25, 0.25,
                                          (len(vocab_processor.vocabulary_),
                                           FLAGS.text_embedding_dim))
                # Load any vectors found in the word2vec binary file
                print("Load word2vec file {0}".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in range(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1).decode('latin-1')
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = vocab_processor.vocabulary_.get(word)
                        if idx != 0:
                            initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(rcnn.W_text.assign(initW))
                print("Successfully loaded the pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                # Train step
                feed_dict = {
                    rcnn.input_text: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, rcnn.loss, rcnn.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict_dev = {
                        rcnn.input_text: x_dev,
                        rcnn.input_y: y_dev,
                        rcnn.dropout_keep_prob: 1.0
                    }
                    summaries_dev, loss, accuracy, predictions = sess.run(
                        [dev_summary_op, rcnn.loss, rcnn.accuracy, rcnn.predictions],
                        feed_dict_dev)
                    dev_summary_writer.add_summary(summaries_dev, step)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
                    print("(2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n".format(
                        f1_score(np.argmax(y_dev, axis=1), predictions,
                                 labels=np.array(range(1, 19)), average="macro")))

                # Model checkpoint
                if step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
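# Both training loops above call data_helpers.batch_iter, which is not shown in this
# section. The sketch below shows what such a helper typically looks like; the shuffle
# argument and exact behaviour are assumptions, not taken from this project's
# data_helpers module.
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Yield mini-batches over `data` for `num_epochs` epochs, reshuffling each epoch."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]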
def train():
    with tf.device('/cpu:0'):
        x_text, y = data_helpers.load_data_and_labels(FLAGS.pos_dir, FLAGS.neg_dir)

    # Build vocabulary
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    x = np.array(list(text_vocab_processor.fit_transform(x_text)))
    print("Text Vocabulary Size: {:d}".format(len(text_vocab_processor.vocabulary_)))
    print("x = {0}".format(x.shape))
    print("y = {0}".format(y.shape))
    print("")
    print(type(x))  # debug output
    print(type(y))

    # The first half of the data is positive, the second half negative
    pos_arrays, neg_arrays = np.array_split(x, 2)
    pos_labels, neg_labels = np.array_split(y, 2)

    # Randomly shuffle data. Each np.random.shuffle call applies a different
    # permutation, but the labels within each half are identical (all positive
    # or all negative), so features and labels stay consistent.
    np.random.seed(10)
    total = len(y) // 2  # examples per class
    np.random.shuffle(pos_arrays)
    np.random.shuffle(neg_arrays)
    np.random.shuffle(pos_labels)
    np.random.shuffle(neg_labels)

    # Split each half into train/dev/test using negative indices
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(total))
    test_sample_index = -1 * int(FLAGS.test_sample_percentage * float(total))

    training_pos = pos_arrays[:dev_sample_index + test_sample_index]
    val_pos = pos_arrays[dev_sample_index + test_sample_index:test_sample_index]
    test_pos = pos_arrays[test_sample_index:]

    training_neg = neg_arrays[:dev_sample_index + test_sample_index]
    val_neg = neg_arrays[dev_sample_index + test_sample_index:test_sample_index]
    test_neg = neg_arrays[test_sample_index:]

    training_pos_label = pos_labels[:dev_sample_index + test_sample_index]
    val_pos_label = pos_labels[dev_sample_index + test_sample_index:test_sample_index]
    test_pos_label = pos_labels[test_sample_index:]

    training_neg_label = neg_labels[:dev_sample_index + test_sample_index]
    val_neg_label = neg_labels[dev_sample_index + test_sample_index:test_sample_index]
    test_neg_label = neg_labels[test_sample_index:]

    x_train = np.concatenate((training_pos, training_neg))
    x_dev = np.concatenate((val_pos, val_neg))
    x_test = np.concatenate((test_pos, test_neg))
    y_train = np.concatenate((training_pos_label, training_neg_label))
    y_dev = np.concatenate((val_pos_label, val_neg_label))
    y_test = np.concatenate((test_pos_label, test_neg_label))

    # Shuffle the combined training set so positive and negative examples are mixed
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    print("Train/Dev/Test split: {:d}/{:d}/{:d}\n".format(
        len(y_train), len(y_dev), len(y_test)))

    # Previous crude split kept for reference (TJ test)
    # TODO: this is very crude; cross-validation would be better
    '''
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))    # TJ test
    test_sample_index = -1 * int(FLAGS.test_sample_percentage * float(len(y)))  # TJ test
    x_train, x_dev, x_test = x_shuffled[:dev_sample_index + test_sample_index], x_shuffled[dev_sample_index + test_sample_index:test_sample_index], x_shuffled[test_sample_index:]  # TJ test
    y_train, y_dev, y_test = y_shuffled[:dev_sample_index + test_sample_index], y_shuffled[dev_sample_index + test_sample_index:test_sample_index], y_shuffled[test_sample_index:]  # TJ test
    '''

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = TextRCNN(sequence_length=x_train.shape[1],
                            num_classes=y_train.shape[1],
                            vocab_size=len(text_vocab_processor.vocabulary_),
                            word_embedding_size=FLAGS.word_embedding_dim,
                            context_embedding_size=FLAGS.context_embedding_dim,
                            cell_type=FLAGS.cell_type,
                            hidden_size=FLAGS.hidden_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rcnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs-ns", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rcnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy)

            # Train summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Test summaries (TJ)
            test_summary_op = tf.summary.merge([loss_summary, acc_summary])
            test_summary_dir = os.path.join(out_dir, "summaries", "test")
            test_summary_writer = tf.summary.FileWriter(test_summary_dir, sess.graph)

            # Checkpoint directory. TensorFlow assumes this directory already exists,
            # so we need to create it.
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            text_vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained word2vec
            if FLAGS.word2vec:
                # Initialize the embedding matrix with random uniform values
                initW = np.random.uniform(-0.25, 0.25,
                                          (len(text_vocab_processor.vocabulary_),
                                           FLAGS.word_embedding_dim))
                # Load any vectors found in the word2vec binary file
                print("Load word2vec file {0}".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in range(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1).decode('latin-1')
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = text_vocab_processor.vocabulary_.get(word)
                        if idx != 0:
                            initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(rcnn.W_text.assign(initW))
                print("Successfully loaded the pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                # Train step
                feed_dict = {
                    rcnn.input_text: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, train_summary_op, rcnn.loss, rcnn.accuracy],
                    feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict_dev = {
                        rcnn.input_text: x_dev,
                        rcnn.input_y: y_dev,
                        rcnn.dropout_keep_prob: 1.0
                    }
                    # Precision, recall, f1 (TJ)
                    summaries_dev, loss, accuracy, precision, recall, f1 = sess.run(
                        [dev_summary_op, rcnn.loss, rcnn.accuracy,
                         rcnn.precision, rcnn.recall, rcnn.f1],
                        feed_dict_dev)
                    dev_summary_writer.add_summary(summaries_dev, step)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}, precision {:g}, recall {:g}, f1 {:g}\n".format(
                        time_str, step, loss, accuracy, precision, recall, f1))

                # Test (TJ)
                if step % FLAGS.evaluate_every == 0:
                    print("\nTesting:")
                    feed_dict_test = {
                        rcnn.input_text: x_test,
                        rcnn.input_y: y_test,
                        rcnn.dropout_keep_prob: 1.0
                    }
                    # Precision, recall, f1 (TJ)
                    summaries_test, loss, accuracy, precision, recall, f1 = sess.run(
                        [test_summary_op, rcnn.loss, rcnn.accuracy,
                         rcnn.precision, rcnn.recall, rcnn.f1],
                        feed_dict_test)
                    test_summary_writer.add_summary(summaries_test, step)

                    time_str = datetime.datetime.now().isoformat()
                    print("test part {}: step {}, loss {:g}, acc {:g}, precision {:g}, recall {:g}, f1 {:g}\n".format(
                        time_str, step, loss, accuracy, precision, recall, f1))

                # Model checkpoint
                if step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess, checkpoint_prefix, global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
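# The second train() expects the model to expose rcnn.precision, rcnn.recall and rcnn.f1
# as graph tensors, which are not defined in this section. The helper below is a minimal
# sketch of how such tensors could be built for the binary pos/neg task, treating class
# index 1 as the positive class; the function name and wiring are assumptions, not the
# project's actual TextRCNN code.
def add_binary_metrics(input_y, predictions):
    """Return (precision, recall, f1) tensors from one-hot labels and predicted indices."""
    actual = tf.argmax(input_y, axis=1)  # gold class indices
    tp = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(predictions, 1), tf.equal(actual, 1)), tf.float32))
    fp = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(predictions, 1), tf.equal(actual, 0)), tf.float32))
    fn = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(predictions, 0), tf.equal(actual, 1)), tf.float32))
    epsilon = 1e-7  # avoid division by zero when a class is absent from the evaluated batch
    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)
    f1 = 2 * precision * recall / (precision + recall + epsilon)
    return precision, recall, f1

# Hypothetical usage inside the model constructor:
# self.precision, self.recall, self.f1 = add_binary_metrics(self.input_y, self.predictions)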