Example #1
def model_selection(model_name):
    if model_name == "cnn":
        return TextCNN(sequence_length=train_x.shape[1],
                       num_classes=train_y.shape[1],
                       vocab_size=len(vocab_processor.vocabulary_),
                       embedding_size=FLAGS.embedding_size,
                       filter_sizes=list(
                           map(int, FLAGS.filter_sizes.split(","))),
                       num_filters=FLAGS.num_filters,
                       l2_reg_lambda=FLAGS.l2_reg_lambda)
    elif model_name == "rnn":
        return TextRNN(sequence_length=max_document_length,
                       num_classes=train_y.shape[1],
                       vocab_size=len(vocab_processor.vocabulary_),
                       embedding_size=FLAGS.embedding_size,
                       learning_rate=FLAGS.learning_rate,
                       batch_size=FLAGS.batch_size,
                       decay_steps=FLAGS.decay_steps,
                       decay_rate=FLAGS.decay_rate,
                       is_training=FLAGS.is_training)
    elif model_name == "rcnn":
        return TextRCNN(sequence_length=train_x.shape[1],
                        num_classes=train_y.shape[1],
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=FLAGS.embedding_size,
                        context_embedding_size=FLAGS.context_embedding_size,
                        cell_type=FLAGS.cell_type,
                        hidden_size=FLAGS.hidden_size,
                        l2_reg_lambda=FLAGS.l2_reg_lambda)
    elif model_name == "clstm":
        return TextCLSTM(max_len=max_document_length,
                         num_classes=train_y.shape[1],
                         vocab_size=len(vocab_processor.vocabulary_),
                         embedding_size=FLAGS.embedding_size,
                         filter_sizes=list(
                             map(int, FLAGS.filter_sizes.split(","))),
                         num_filters=FLAGS.num_filters,
                         num_layers=FLAGS.num_layers,
                         l2_reg_lambda=FLAGS.l2_reg_lambda)
    else:
        raise NotImplementedError("%s is not implemented" % (model_name))
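A minimal usage sketch for this factory function, assuming a FLAGS.model string flag, the train_x/train_y/vocab_processor data prepared elsewhere in the script, and that every model class exposes a loss tensor (as TextRCNN does in Example #2); none of this appears in the snippet above:

# Hypothetical driver code, not part of the original example.
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        model = model_selection(FLAGS.model)  # "cnn", "rnn", "rcnn" or "clstm"
        global_step = tf.Variable(0, name="global_step", trainable=False)
        train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
            model.loss, global_step=global_step)
        sess.run(tf.global_variables_initializer())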
Example #2
def train():
    with tf.device('/cpu:0'):
        x_text, y = data_helpers.load_data_and_labels(FLAGS.train_dir)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
    # dimension = FLAGS.max_sentence_length
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    print("Text Vocabulary Size: {:d}".format(len(
        vocab_processor.vocabulary_)))

    print("x = {0}".format(x.shape))
    print("y = {0}".format(y.shape))
    print("")

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
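    # For example, with len(y) == 8000 and dev_sample_percentage == 0.1,
    # dev_sample_index == -800: the first 7200 shuffled examples become the
    # training set and the last 800 the dev set.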
    print("Train/Dev split: {:d}/{:d}\n".format(len(y_train), len(y_dev)))

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = TextRCNN(sequence_length=x_train.shape[1],
                            num_classes=y_train.shape[1],
                            vocab_size=len(vocab_processor.vocabulary_),
                            text_embedding_size=FLAGS.text_embedding_dim,
                            context_embedding_size=FLAGS.context_embedding_dim,
                            cell_type=FLAGS.cell_type,
                            hidden_size=FLAGS.hidden_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rcnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rcnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained word2vec
            if FLAGS.word2vec:
                # Initialize the embedding matrix with values drawn uniformly
                # from [-0.25, 0.25]
                initW = np.random.uniform(-0.25, 0.25, (len(
                    vocab_processor.vocabulary_), FLAGS.text_embedding_dim))
                # Load vectors for in-vocabulary words from the word2vec binary
                print("Loading word2vec file {0}".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
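                    # Each record in the binary file is a space-terminated word
                    # followed by layer1_size float32 values; vectors for words
                    # outside our vocabulary are read and discarded below.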
                    for line in range(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1).decode('latin-1')
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = vocab_processor.vocabulary_.get(word)
                        if idx != 0:
                            initW[idx] = np.fromstring(f.read(binary_len),
                                                       dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(rcnn.W_text.assign(initW))
                print("Success to load pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)

                # Train
                feed_dict = {
                    rcnn.input_text: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rcnn.loss,
                    rcnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict_dev = {
                        rcnn.input_text: x_dev,
                        rcnn.input_y: y_dev,
                        rcnn.dropout_keep_prob: 1.0
                    }
                    summaries_dev, loss, accuracy, predictions = sess.run([
                        dev_summary_op, rcnn.loss, rcnn.accuracy,
                        rcnn.predictions
                    ], feed_dict_dev)
                    dev_summary_writer.add_summary(summaries_dev, step)

                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))
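                    # labels=range(1, 19) restricts the macro-average to the 18
                    # directed relation classes, leaving out class 0 (the
                    # "Other" class, per the message below), as in the official
                    # SemEval-2010 Task 8 scoring.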
                    print(
                        "(2*9+1)-Way Macro-Average F1 Score (excluding Other): {:g}\n"
                        .format(
                            f1_score(np.argmax(y_dev, axis=1),
                                     predictions,
                                     labels=np.array(range(1, 19)),
                                     average="macro")))

                # Model checkpoint
                if step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
Example #3
def train():
    with tf.device('/cpu:0'):
        x_text, y = data_helpers.load_data_and_labels(FLAGS.pos_dir,
                                                      FLAGS.neg_dir)

    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        FLAGS.max_sentence_length)
    x = np.array(list(text_vocab_processor.fit_transform(x_text)))
    print("Text Vocabulary Size: {:d}".format(
        len(text_vocab_processor.vocabulary_)))

    print("x = {0}".format(x.shape))
    print("y = {0}".format(y.shape))
    print("")
    print(type(x))
    print(type(y))
    # load_data_and_labels is expected to return all positive examples followed
    # by all negative examples, so splitting each array in half separates the
    # two classes.
    pos_arrays, neg_arrays = np.array_split(x, 2)
    pos_labels, neg_labels = np.array_split(y, 2)

    # Randomly shuffle data
    np.random.seed(10)
    total = len(y) // 2  # number of examples per class
    # Note: features and labels are shuffled independently here; examples and
    # labels stay aligned only because every label within pos_labels (and
    # within neg_labels) is identical.
    np.random.shuffle(pos_arrays)
    np.random.shuffle(neg_arrays)
    np.random.shuffle(pos_labels)
    np.random.shuffle(neg_labels)
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(total))
    test_sample_index = -1 * int(FLAGS.test_sample_percentage * float(total))
    # dev_sample_index and test_sample_index are negative offsets from the end
    # of each per-class array.
    training_pos = pos_arrays[:dev_sample_index + test_sample_index]
    val_pos = pos_arrays[dev_sample_index + test_sample_index:test_sample_index]
    test_pos = pos_arrays[test_sample_index:]

    training_neg = neg_arrays[:dev_sample_index + test_sample_index]
    val_neg = neg_arrays[dev_sample_index + test_sample_index:test_sample_index]
    test_neg = neg_arrays[test_sample_index:]

    training_pos_label = pos_labels[:dev_sample_index + test_sample_index]
    val_pos_label = pos_labels[dev_sample_index + test_sample_index:test_sample_index]
    test_pos_label = pos_labels[test_sample_index:]

    training_neg_label = neg_labels[:dev_sample_index + test_sample_index]
    val_neg_label = neg_labels[dev_sample_index + test_sample_index:test_sample_index]
    test_neg_label = neg_labels[test_sample_index:]

    x_train = np.concatenate((training_pos, training_neg))
    x_dev = np.concatenate((val_pos, val_neg))
    x_test = np.concatenate((test_pos, test_neg))

    y_train = np.concatenate((training_pos_label, training_neg_label))
    y_dev = np.concatenate((val_pos_label, val_neg_label))
    y_test = np.concatenate((test_pos_label, test_neg_label))

    # Reshuffle the combined training set so positive and negative examples mix
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    print("Train/Dev/Test split: {:d}/{:d}/{:d}\n".format(
        len(y_train), len(y_dev), len(y_test)))  #TJ test

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    '''
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))  #TJ test
    test_sample_index = -1 * int(FLAGS.test_sample_percentage * float(len(y)))  #TJ test
    x_train, x_dev, x_test  = x_shuffled[:dev_sample_index+test_sample_index], x_shuffled[dev_sample_index+test_sample_index:test_sample_index], x_shuffled[test_sample_index:] #TJ test
    y_train, y_dev, y_test = y_shuffled[:dev_sample_index+test_sample_index], y_shuffled[dev_sample_index+test_sample_index:test_sample_index], y_shuffled[test_sample_index:] #TJ test
    '''
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            rcnn = TextRCNN(sequence_length=x_train.shape[1],
                            num_classes=y_train.shape[1],
                            vocab_size=len(text_vocab_processor.vocabulary_),
                            word_embedding_size=FLAGS.word_embedding_dim,
                            context_embedding_size=FLAGS.context_embedding_dim,
                            cell_type=FLAGS.cell_type,
                            hidden_size=FLAGS.hidden_size,
                            l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
                rcnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs-ns", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rcnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rcnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Test summaries-TJ
            test_summary_op = tf.summary.merge([loss_summary, acc_summary])
            test_summary_dir = os.path.join(out_dir, "summaries", "test")
            test_summary_writer = tf.summary.FileWriter(
                test_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            text_vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Pre-trained word2vec
            if FLAGS.word2vec:
                # Initialize the embedding matrix with values drawn uniformly
                # from [-0.25, 0.25]
                initW = np.random.uniform(
                    -0.25, 0.25, (len(text_vocab_processor.vocabulary_),
                                  FLAGS.word_embedding_dim))
                # Load vectors for in-vocabulary words from the word2vec binary
                print("Loading word2vec file {0}".format(FLAGS.word2vec))
                with open(FLAGS.word2vec, "rb") as f:
                    header = f.readline()
                    vocab_size, layer1_size = map(int, header.split())
                    binary_len = np.dtype('float32').itemsize * layer1_size
                    for line in range(vocab_size):
                        word = []
                        while True:
                            ch = f.read(1).decode('latin-1')
                            if ch == ' ':
                                word = ''.join(word)
                                break
                            if ch != '\n':
                                word.append(ch)
                        idx = text_vocab_processor.vocabulary_.get(word)
                        if idx != 0:
                            initW[idx] = np.fromstring(f.read(binary_len),
                                                       dtype='float32')
                        else:
                            f.read(binary_len)
                sess.run(rcnn.W_text.assign(initW))
                print("Success to load pre-trained word2vec model!\n")

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                # Train
                feed_dict = {
                    rcnn.input_text: x_batch,
                    rcnn.input_y: y_batch,
                    rcnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rcnn.loss,
                    rcnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % FLAGS.display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    feed_dict_dev = {
                        rcnn.input_text: x_dev,
                        rcnn.input_y: y_dev,
                        rcnn.dropout_keep_prob: 1.0
                    }  #TJ precision, recall, f1
                    summaries_dev, loss, accuracy, precision, recall, f1 = sess.run(
                        [
                            dev_summary_op, rcnn.loss, rcnn.accuracy,
                            rcnn.precision, rcnn.recall, rcnn.f1
                        ], feed_dict_dev)

                    dev_summary_writer.add_summary(summaries_dev, step)

                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "{}: step {}, loss {:g}, acc {:g}, precision {:g}, recall {:g}, f1 {:g}\n"
                        .format(time_str, step, loss, accuracy, precision,
                                recall, f1))

                # Test - TJ
                if step % FLAGS.evaluate_every == 0:
                    print("\nTesting:")
                    feed_dict_test = {
                        rcnn.input_text: x_test,
                        rcnn.input_y: y_test,
                        rcnn.dropout_keep_prob: 1.0
                    }  #TJ precision, recall, f1
                    summaries_test, loss, accuracy, precision, recall, f1 = sess.run(
                        [
                            test_summary_op, rcnn.loss, rcnn.accuracy,
                            rcnn.precision, rcnn.recall, rcnn.f1
                        ], feed_dict_test)

                    test_summary_writer.add_summary(summaries_test, step)

                    time_str = datetime.datetime.now().isoformat()
                    print(
                        "test part {}: step {}, loss {:g}, acc {:g}, precision {:g}, recall {:g}, f1 {:g}\n"
                        .format(time_str, step, loss, accuracy, precision,
                                recall, f1))

                # Model checkpoint
                if step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))