    def test_session(self, session, inds):  # for debugging purposes only
        #take random sample of training set:

        # inds = np.random.choice(self.training_set, math.floor(0.01*len(self.training_set)), replace=False)
        small_s = generate_padded_seq(self.config.max_length, self.config.output_size, inds)
        small_s_y = [i[1:] for i in small_s]
        seq_len = [len(i) for i in inds]
        masks = get_masks(inds, self.config.max_length)

        feed = self.create_feed_dict(inputs_batch=small_s, labels_batch=small_s_y, dropout=self.config.drop_out, mask_batch=masks, seq_length=seq_len)

        loss = session.run(self.loss, feed_dict=feed)
        return loss
    def train_on_batch(self, sess, batch):
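        # pad the batch to max_length, then build next-token targets by dropping the
        # first element of each padded sequence (standard language-model x/y shift)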

        batch_x = generate_padded_seq(self.config.max_length, self.config.output_size, batch)

        batch_y = [i[1:] for i in batch_x]

        seq_len = [len(i) for i in batch]

        masks = get_masks(batch, self.config.max_length)

        feed = self.create_feed_dict(inputs_batch=batch_x, labels_batch=batch_y, dropout=self.config.drop_out, mask_batch=masks, seq_length=seq_len)

        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        # loss = sess.run(self.error, feed_dict=feed)
        # pred = sess.run(self.probs, feed_dict=feed)
        # return pred
        return _, loss
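
    # Hedged sketch (an assumption, not part of the original class): run_epoch is
    # invoked as m.run_epoch(sess, train) in the train() functions below, but its body
    # is not included in this listing. A minimal version consistent with train_on_batch
    # above would iterate over consecutive mini-batches of the training data:
    def run_epoch(self, sess, train_data):
        epoch_loss = 0.
        n_batches = 0
        for indices in get_batch(len(train_data), self.config.batch_size):
            _, loss = self.train_on_batch(sess, train_data[indices])
            epoch_loss += loss
            n_batches += 1
        return epoch_loss / max(n_batches, 1)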
Example #3
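# Hedged sketches (assumptions, not the original helpers): the train() functions in this
# listing call several utilities that are not shown. The minimal versions below are
# guesses consistent with how they are called; in particular, the padding index (the
# snippets pass config.output_size) and the shuffle/split behaviour are assumed. np and
# cPickle are imported as in the surrounding snippets.
def get_embeddings(embed_path):
    # load a pickled (vocab_size x embed_size) embedding matrix
    with open(embed_path, 'rb') as f:
        return cPickle.load(f)


def get_data(path):
    # load pickled (subreddit, tokenized_post) pairs
    with open(path, 'rb') as f:
        return cPickle.load(f)


def generate_padded_seq(max_length, pad_idx, batch):
    # right-pad every index sequence to max_length with pad_idx
    return [list(seq) + [pad_idx] * (max_length - len(seq)) for seq in batch]


def get_masks(batch, max_length):
    # 1.0 over real tokens, 0.0 over padding positions
    return [[1.0] * len(seq) + [0.0] * (max_length - len(seq)) for seq in batch]


def get_dev_test_sets(dev_size, test_size, training_indices):
    # shuffle the index range and carve off dev and test index sets
    shuffled = np.random.permutation(training_indices)
    dev_inds = shuffled[:dev_size]
    test_inds = shuffled[dev_size:dev_size + test_size]
    train_inds = shuffled[dev_size + test_size:]
    return train_inds, dev_inds, test_inds


def get_batch(n_examples, batch_size):
    # yield consecutive index arrays of at most batch_size elements
    indices = np.arange(n_examples)
    for start in range(0, n_examples, batch_size):
        yield indices[start:start + batch_size]
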
def train(args):
    n_epochs = 30
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    # embeddings = np.load('./data/final_large_weights.npy')
    # embeddings = np.vstack([embeddings, np.zeros(embeddings.shape[1])])
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # vocabs = collections.defaultdict(str)

    # with open('./data/large_vocab') as csvfile:
    #     vocab = csv.reader(csvfile)
    #     for v in vocab:
    #         vocabs[v[1]] = v[0]

    #get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)

    vocabs = collections.defaultdict(str, vocabs)
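    # vocabs presumably maps token -> vocabulary index; wrapping it in a defaultdict
    # makes unseen tokens fall back to the empty string rather than raising a KeyError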

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    # subsample_y = [get_indices(j) for j[1:] in all_dat['personalfinance']][0:100]

    #seq_length, max_length, embed_size, output_size
    max_length = max([len(se) for se in sample])
    config_file = Config(max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=128,
                         drop_out=args.dropout,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize)

    idx = np.arange(len(sample))

    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)

    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]
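    # sample is an object array of token-index lists, so fancy-indexing it with the
    # train/dev/test index arrays selects the corresponding posts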

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            # loss = m.test_session(sess, train)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)

                m.run_epoch(sess, np.array(train))

                # evaluate test perplexity (computed on the dev split)
                test_size = len(dev)

                total_perplexity = 0
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):

                    total_batches += 1
                    test_batch = dev[indices]
                    masks = get_masks(test_batch, config_file.max_length)

                    seq_len = [len(i) for i in test_batch]
                    batch_x = generate_padded_seq(config_file.max_length,
                                                  config_file.output_size,
                                                  test_batch)
                    batch_y = [i[1:] for i in batch_x]
                    feed = m.create_feed_dict(inputs_batch=batch_x,
                                              labels_batch=batch_y,
                                              dropout=config_file.drop_out,
                                              mask_batch=masks,
                                              seq_length=seq_len)
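                    # note: the training drop_out value is fed at evaluation time here;
                    # depending on how create_feed_dict interprets it, dropout may need
                    # to be disabled (e.g. keep probability 1.0) for an unbiased
                    # perplexity estimate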

                    perplexities = sess.run(m.error, feed_dict=feed)
                    total_perplexity += perplexities
                    # seq_inds = np.arange(len(seq_len))
                    # print "Average Perplexity Across Entire Set: " + str(sum([np.prod(perplexities[i][0:seq_len[i]])**(-1/seq_len[i]) for i in seq_inds])/len(seq_inds))
                    print "Epoch: " + str(
                        epoch +
                        1) + " average test perplexity for batch " + str(
                            k + 1) + ':' + str(perplexities)

                if (total_perplexity / total_batches) < best_perplexity:
                    best_perplexity = (total_perplexity / total_batches)
                    print "New Best Perplexity: " + str(best_perplexity)
            saver.save(
                sess, "./code/trainer/models/" + r + "/epoch_" +
                str(epoch + 1) + ".ckpt")

            with open('./code/trainer/diag/diagnostics.csv', 'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([
                    args.subreddit,
                    str(best_perplexity),
                    str(config_file.hidden_unit_size),
                    str(config_file.learning_rate),
                    str(config_file.embed_size)
                ])

    def train_on_batch_single(self, sess, batch):

        total_loss = 0.

        max_len = max(len(case) for case in batch)
        padded = generate_padded_seq(max_len, self.config.output_size, batch)
        masks = np.matrix(get_masks(batch, max_len))
        batch_x = [i[:-1] for i in padded]
        batch_y = [i[1:] for i in padded]

        #make the batches into a matrix so that we can have easier time feeding

        batch_x_mat = np.matrix(batch_x)
        batch_y_mat = np.matrix(batch_y)

        assert batch_x_mat.shape[1] == batch_y_mat.shape[1], (
            "x and y are not the same length. x: " + str(batch_x_mat.shape[1]) +
            ". y: " + str(batch_y_mat.shape[1]))

        for i in range(batch_x_mat.shape[1]):

            x = batch_x_mat[:,i]
            y = batch_y_mat[:,i]
            m = masks[:,i]
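            # (the remainder of this loop body is truncated in this listing; by analogy
            # with the per-slice evaluation loop in Example #5 below, it presumably
            # builds a one-step feed dict from x, y and m via self.create_feed_dict and
            # accumulates the resulting loss into total_loss)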
Example #5
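# Hedged sketch (an assumption, not the original helper): get_sequence, used in the
# evaluation loop below, appears to split the time axis of the padded x/y matrices into
# consecutive column-index chunks of length sequence_length.
def get_sequence(max_len, sequence_length):
    steps = np.arange(max_len - 1)  # x and y each have max_len - 1 columns after the shift
    return [steps[i:i + sequence_length]
            for i in range(0, len(steps), sequence_length)]
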
def train(args):
    n_epochs = 100
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    # embeddings = np.load('./data/final_large_weights.npy')
    # embeddings = np.vstack([embeddings, np.zeros(embeddings.shape[1])])
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # vocabs = collections.defaultdict(str)

    # with open('./data/large_vocab') as csvfile:
    #     vocab = csv.reader(csvfile)
    #     for v in vocab:
    #         vocabs[v[1]] = v[0]

    #get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)

    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    # subsample_y = [get_indices(j) for j[1:] in all_dat['personalfinance']][0:100]
    max_length = max(len(i) for i in sample)

    #seq_length, max_length, embed_size, output_size
    config_file = Config(drop_out=args.dropout,
                         max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=256,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize,
                         num_layers=args.numlayers,
                         sequence_length=args.seqlength,
                         peepholes=args.peephole)

    idx = np.arange(len(sample))

    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)

    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init)
            # loss = m.test_session(sess, train)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)

                m.run_epoch(sess, np.array(train))

                # evaluate test perplexity (computed on the dev split)
                test_size = len(dev)
                total_perplexity = 0
                total_batches = 0

                for k, indices in enumerate(get_batch(test_size, 100)):

                    total_batches += 1
                    test_batch = dev[indices]

                    max_len = max(len(case) for case in test_batch)
                    padded = generate_padded_seq(max_len,
                                                 config_file.output_size,
                                                 test_batch)
                    masks = np.matrix(get_masks(test_batch, max_len))

                    batch_x = [i[:-1] for i in padded]
                    batch_y = [i[1:] for i in padded]

                    batch_x_mat = np.matrix(batch_x)
                    batch_y_mat = np.matrix(batch_y)

                    # batch_perplexity = 0

                    batch_loss = 0.
                    sequences = get_sequence(
                        max_len, sequence_length=config_file.sequence_length)

                    for bat in sequences:
                        x = batch_x_mat[:, bat]
                        y = batch_y_mat[:, bat]
                        batch_mask = masks[:, bat]
                        feed = m.create_feed_dict(inputs_batch=x,
                                                  labels_batch=y,
                                                  dropout=config_file.drop_out,
                                                  mask_batch=batch_mask,
                                                  seq_length=[1] * len(test_batch))

                        loss = sess.run(m.loss, feed_dict=feed)
                        # perplexities = sess.run(m.error, feed_dict=feed)
                        # print "Single word-pair perplexity: " + str(perplexities)
                        batch_loss += loss
                    batch_loss = batch_loss / len(sequences)
                    batch_perplexity = batch_loss**2
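                    # note: perplexity is conventionally exp(mean cross-entropy); this
                    # example squares the averaged loss instead, so the values logged
                    # below track relative progress rather than standard perplexity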
                    total_perplexity += batch_perplexity

                    print "Epoch " + str(
                        epoch + 1) + " Total test perplexity for batch " + str(
                            k + 1) + ' :' + str(batch_perplexity)

                if total_perplexity < best_perplexity:
                    best_perplexity = total_perplexity
                    print "New Best Perplexity: " + str(best_perplexity)
            saver.save(
                sess, "./code/trainer/models/" + r.lower() + "/single_epoch_" +
                str(epoch + 1) + ".ckpt")

            with open('./code/trainer/diag/diagnostics_new_final.csv',
                      'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([
                    args.subreddit,
                    str(config_file.peephole),
                    str(best_perplexity),
                    str(config_file.drop_out),
                    str(config_file.hidden_unit_size),
                    str(config_file.learning_rate),
                    str(config_file.embed_size),
                    str(config_file.sequence_length)
                ])
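
# Hedged usage sketch (not part of the original listing): both train(args) entry points
# expect an argparse-style namespace; the flag names and defaults below are assumptions
# inferred from the attributes accessed above (args.subreddit, args.dropout, etc.).
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--subreddit', default='personalfinance')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--learningrate', type=float, default=0.001)
    parser.add_argument('--hiddensize', type=int, default=256)
    parser.add_argument('--numlayers', type=int, default=1)   # Example #5 only
    parser.add_argument('--seqlength', type=int, default=35)  # Example #5 only
    parser.add_argument('--peephole', action='store_true')    # Example #5 only
    train(parser.parse_args())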