Example #1
 def load_data_and_labels(self):
     # Load data
     print("Loading embedding and dataset...")
     embedding_name = None
     dataset_name = self.cfg["datasets"]["default"]
     if FLAGS.enable_word_embeddings and self.cfg['word_embeddings'][
             'default'] is not None:
         embedding_name = self.cfg['word_embeddings']['default']
         embedding_dimension = self.cfg['word_embeddings'][embedding_name][
             'dimension']
     else:
         embedding_dimension = FLAGS.embedding_dim
     datasets = None
     if dataset_name == "mrpolarity":
         datasets = data_helpers.get_datasets_mrpolarity(
             self.cfg["datasets"][dataset_name]["positive_data_file"]
             ["path"], self.cfg["datasets"][dataset_name]
             ["negative_data_file"]["path"])
     elif dataset_name == "20newsgroup":
         datasets = data_helpers.get_datasets_20newsgroup(
             subset="train",
             categories=self.cfg["datasets"][dataset_name]["categories"],
             shuffle=self.cfg["datasets"][dataset_name]["shuffle"],
             random_state=self.cfg["datasets"][dataset_name]
             ["random_state"])
     elif dataset_name == "localdata":
         datasets = data_helpers.get_datasets_localdata(
             container_path=self.cfg["datasets"][dataset_name]
             ["container_path"],
             categories=self.cfg["datasets"][dataset_name]["categories"],
             shuffle=self.cfg["datasets"][dataset_name]["shuffle"],
             random_state=self.cfg["datasets"][dataset_name]
             ["random_state"])
     print("Loaded dataset: {}. [Embedding: {}].\nLoading labels".format(
         dataset_name, embedding_name))
     x_text, y = data_helpers.load_data_labels(datasets)
     self.x_text = x_text
     self.y = y
     self.embedding_name = embedding_name
     self.embedding_dimension = embedding_dimension
     self.dataset_name = dataset_name
     print("Done load_data_and_labels()")
Example #2
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "abstract":
        datasets = data_helpers.get_datasets_abstract("data/")
    elif dataset_name == 'intents':
        datasets = data_helpers.get_datasets_intentst("data/")
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
Example #3
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
# Load data via the positive/negatives test data path (command parameter)
if FLAGS.positive_test and FLAGS.negative_test:
    datasets = data_helpers.get_datasets_mrpolarity(FLAGS.positive_test,
                                                    FLAGS.negative_test)
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    dataset_name = cfg["datasets"]["default"]

# Load data via default configuration
if FLAGS.eval_train and datasets is None:
    if dataset_name == "mrpolarity" or dataset_name == "argoUML":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["test_positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["test_negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
import os
import time
import datetime
import numpy as np
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import gensim
from sklearn.model_selection import train_test_split

positive_data = 'positive.txt'
negative_data = 'negative.txt'
word2vec_model_path = 'filtered.bin'
word2vec_binary = True

print('Loading data...')
dataset = data_helpers.get_datasets_mrpolarity(positive_data, negative_data)
print('Data successfully loaded!\n')

print('Making labels...')
x_data, y = data_helpers.load_data_labels(dataset)

print(len(y))

max_document_length = max([len(x.split(" ")) for x in x_data])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_data)))

#shuffling data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
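
The VocabularyProcessor used above (from the deprecated tensorflow.contrib.learn) builds a word-to-id table and maps each text to a fixed-length vector of ids, padding with 0. A rough pure-Python sketch of that behaviour, not the actual API:

def simple_vocab_transform(texts, max_len):
    # Build a 1-based word-to-id vocabulary (0 is reserved for padding),
    # then map each text to a fixed-length list of word ids.
    vocab = {}
    rows = []
    for text in texts:
        ids = []
        for word in text.split(" ")[:max_len]:
            if word not in vocab:
                vocab[word] = len(vocab) + 1
            ids.append(vocab[word])
        rows.append(ids + [0] * (max_len - len(ids)))
    return rows, vocab

rows, vocab = simple_vocab_transform(["the cat sat", "the dog"], max_len=4)
print(rows)  # [[1, 2, 3, 0], [1, 4, 0, 0]]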
Example #5
def main():
    import time
    start_time = time.time()

    FLAGS = flagClass()

    with open("config.yml", 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)

    dataset_name = cfg["datasets"]["default"]
    if FLAGS.enable_word_embeddings and cfg['word_embeddings'][
            'default'] is not None:
        embedding_name = cfg['word_embeddings']['default']
        embedding_dimension = cfg['word_embeddings'][embedding_name][
            'dimension']
    else:
        embedding_dimension = FLAGS.embedding_dim

    # Data Preparation
    # ==================================================

    # Load data

    print("Loading data...")
    datasets = None
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == 'spamham':
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["spam_file"]["path"],
            cfg["datasets"][dataset_name]["ham_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="train",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "dbpedia":
        datasets = data_helpers.get_datasets_dbpedia(
            cfg["datasets"][dataset_name]["train_file"]["path"],
            cfg["datasets"][dataset_name]["train_file"]["limit"])
    elif dataset_name == "email":
        datasets = data_helpers.get_datasets_email(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "localdata":
        datasets = data_helpers.get_datasets_localdata(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary

    # To limit memory usage, you can truncate each input text to its first 40
    # words; prior work (reportedly on the IMDB dataset) found the first 40
    # words to be representative enough of a sentence's content for
    # classification. Keep exactly one of the two lines below uncommented.

    # max_document_length = max([len(x.split(" ")) for x in x_text])
    max_document_length = 40  # read up to 40 words from each sentence
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('Sequence_length={}'.format(x_train.shape[1]))
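    # The TODO above notes that this tail split is crude; a stratified
    # hold-out via scikit-learn would be one alternative (sketch only,
    # not used below, assuming scikit-learn is available):
    # from sklearn.model_selection import train_test_split
    # x_train, x_dev, y_train, y_dev = train_test_split(
    #     x, y, test_size=FLAGS.dev_sample_percentage,
    #     random_state=10, stratify=np.argmax(y, axis=1))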

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=embedding_dimension,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            if FLAGS.enable_word_embeddings and cfg['word_embeddings'][
                    'default'] is not None:
                vocabulary = vocab_processor.vocabulary_
                initW = None
                if embedding_name == 'word2vec':
                    # load embedding vectors from the word2vec
                    print("Load word2vec file {}".format(
                        cfg['word_embeddings']['word2vec']['path']))
                    initW = data_helpers.load_embedding_vectors_word2vec(
                        vocabulary, cfg['word_embeddings']['word2vec']['path'],
                        cfg['word_embeddings']['word2vec']['binary'])
                    print("word2vec file has been loaded")
                elif embedding_name == 'glove':
                    # load embedding vectors from the glove
                    print("Load glove file {}".format(
                        cfg['word_embeddings']['glove']['path']))
                    initW = data_helpers.load_embedding_vectors_glove(
                        vocabulary, cfg['word_embeddings']['glove']['path'],
                        embedding_dimension)
                    print("glove file has been loaded\n")
                sess.run(cnn.W.assign(initW))

            def train_step(x_batch, y_batch, learning_rate):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.learning_rate: learning_rate
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, learning_rate {:g}".
                      format(time_str, step, loss, accuracy, learning_rate))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy, gr = sess.run([
                    global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                    cnn.grad
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, gr {}".format(
                    time_str, step, loss, accuracy, gr))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            print("Number of epochs: {}".format(FLAGS.num_epochs))
            num_batches_per_epoch = int(
                (len(list(zip(x_train, y_train))) - 1) / FLAGS.batch_size) + 1
            print("Batches per epoch: {}".format(num_batches_per_epoch))
            print("Batch size: {}".format(FLAGS.batch_size))
            # Use a dynamic learning rate: start high to speed up early
            # training, then decay exponentially toward min_learning_rate.
            max_learning_rate = 0.005
            min_learning_rate = 0.0001
            decay_speed = FLAGS.decay_coefficient * len(
                y_train) / FLAGS.batch_size
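            # Each batch t then uses
            #   lr(t) = min_lr + (max_lr - min_lr) * exp(-t / decay_speed),
            # i.e. training starts at max_learning_rate and decays smoothly
            # toward min_learning_rate.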
            # Training loop. For each batch...
            counter = 0
            for batch in batches:
                learning_rate = min_learning_rate + (
                    max_learning_rate - min_learning_rate) * math.exp(
                        -counter / decay_speed)
                counter += 1
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch, learning_rate)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
    print("runtime was " + str(time.time() - start_time))