def train(args):
    n_epochs = 30
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    # embeddings = np.load('./data/final_large_weights.npy')
    # embeddings = np.vstack([embeddings, np.zeros(embeddings.shape[1])])
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # get vocab (alternative CSV loader kept for reference):
    # vocabs = collections.defaultdict(str)
    # with open('./data/large_vocab') as csvfile:
    #     vocab = csv.reader(csvfile)
    #     for v in vocab:
    #         vocabs[v[1]] = v[0]
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])

    # seq_length, max_length, embed_size, output_size
    max_length = max([len(se) for se in sample])
    config_file = Config(max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=128,
                         drop_out=args.dropout,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize)
    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                m.run_epoch(sess, np.array(train))

                # evaluate perplexity on the dev set
                test_size = len(dev)
                total_perplexity = 0
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):
                    total_batches += 1
                    test_batch = dev[indices]
                    masks = get_masks(test_batch, config_file.max_length)
                    seq_len = [len(i) for i in test_batch]
                    batch_x = generate_padded_seq(config_file.max_length,
                                                  config_file.output_size,
                                                  test_batch)
                    batch_y = [i[1:] for i in batch_x]
                    feed = m.create_feed_dict(inputs_batch=batch_x,
                                              labels_batch=batch_y,
                                              dropout=config_file.drop_out,
                                              mask_batch=masks,
                                              seq_length=seq_len)
                    perplexities = sess.run(m.error, feed_dict=feed)
                    total_perplexity += perplexities
                    print "Epoch: " + str(epoch + 1) + \
                        " average test perplexity for batch " + \
                        str(k + 1) + ': ' + str(perplexities)
                if (total_perplexity / total_batches) < best_perplexity:
                    best_perplexity = total_perplexity / total_batches
                    print "New Best Perplexity: " + str(best_perplexity)
                    saver.save(sess,
                               "./code/trainer/models/" + r + "/epoch_" +
                               str(epoch + 1) + ".ckpt")
            with open('./code/trainer/diag/diagnostics.csv', 'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([args.subreddit,
                                       str(best_perplexity),
                                       str(config_file.hidden_unit_size),
                                       str(config_file.learning_rate),
                                       str(config_file.embed_size)])
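# NOTE: the CLI wiring for these entry points is not shown in this section.
# The block below is a minimal, hypothetical argparse sketch, assuming flag
# names that mirror the `args` attributes referenced above (subreddit,
# dropout, learningrate, hiddensize); the project's actual parser may differ.
import argparse


def build_train_arg_parser():
    """Hypothetical parser producing the `args` namespace used by train()."""
    parser = argparse.ArgumentParser(
        description='Train an RNN-LSTM language model on posts from one subreddit.')
    parser.add_argument('--subreddit', required=True,
                        help='subreddit whose posts are used for training, e.g. askreddit')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout value passed to Config.drop_out')
    parser.add_argument('--learningrate', type=float, default=0.001,
                        help='optimizer learning rate')
    parser.add_argument('--hiddensize', type=int, default=256,
                        help='LSTM hidden unit size')
    return parser


# Example usage (hypothetical):
#     args = build_train_arg_parser().parse_args()
#     train(args)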
def generate(args):
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)
    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    model = args.model
    model_path = './code/trainer/models/' + model + '/'
    # max_length is 1 because we want one word generated at a time
    c = Config(max_length=1,
               embed_size=embeddings.shape[1],
               output_size=embeddings.shape[0],
               batch_size=36,
               drop_out=1)

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=c)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            saver.restore(session, tf.train.latest_checkpoint(model_path))
            all_sentences = []
            # for sent in range(args.numsentences):
            current_word = '<start>'
            sentence = [current_word]
            # get index of <start> token:
            while current_word != '<end>':
                current_ind = vocabs[current_word]
                x = [[current_ind]]
                feed = m.create_feed_dict(inputs_batch=x, seq_length=[1])
                preds = session.run(m.last_state, feed_dict=feed)
                largest_10_inds = preds.argsort()[::-1][:args.numwords]
                largest_10_unscaled_p = preds[largest_10_inds]
                scaled_p = largest_10_unscaled_p / sum(largest_10_unscaled_p)
                current_ind = np.random.choice(largest_10_inds, p=scaled_p)
                current_word = vocabs_reversed[current_ind]
                sentence.append(current_word)
            # all_sentences.append(' '.join(sentence[1:-1]))
            print sentence
def generator(args):
    choices = ['askreddit', 'lifeprotips', 'nottheonion', 'news', 'science',
               'trees', 'tifu', 'personalfinance', 'mildlyinteresting',
               'interestingasfuck']
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)
    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    # max_length is 1 because we want one word generated at a time
    c = Config(max_length=1,
               embed_size=embeddings.shape[1],
               output_size=embeddings.shape[0],
               batch_size=36,
               drop_out=1)

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=c)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            print "Hello, please select the subreddit from which you would like to generate a post:" \
                  "\n 1. AskReddit \n 2. LifeProTips \n 3. nottheonion \n 4. news \n 5. science" \
                  "\n 6. trees \n 7. tifu \n 8. personalfinance \n 9. mildlyinteresting \n 10. interestingasfuck"
            while True:
                try:
                    subr = raw_input("subreddit: ")
                    subr = subr.lower()
                    if subr not in choices:
                        print "Please input a correct subreddit."
                        continue
                    else:
                        model_path = './code/trainer/models/' + subr + '/'
                        saver.restore(session,
                                      tf.train.latest_checkpoint(model_path))
                        current_word = '<start>'
                        sentence = [current_word]
                        # get index of <start> token:
                        while current_word != '<end>':
                            current_ind = vocabs[current_word]
                            x = [[current_ind]]
                            feed = m.create_feed_dict(inputs_batch=x,
                                                      seq_length=[1])
                            preds = session.run(m.last_state, feed_dict=feed)
                            largest_inds = preds.argsort()[::-1][:100]  # top 100
                            largest_unscaled_p = preds[largest_inds]
                            scaled_p = largest_unscaled_p / sum(largest_unscaled_p)
                            current_ind = np.random.choice(largest_inds, p=scaled_p)
                            current_word = vocabs_reversed[current_ind]
                            sentence.append(current_word)
                        print ' '.join(sentence[1:-1])
                        continue
                except EOFError:
                    continue
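# generate() and generator() above repeat the same decoding step: keep the
# top-k scored words, renormalize their scores into a distribution, and sample
# the next index. The helper below is a hypothetical refactoring of that step,
# assuming `preds` is a 1-D numpy array of non-negative per-word scores as in
# the loops above; it is illustrative only and not part of the original code.
import numpy as np


def sample_top_k(preds, k):
    """Sample a vocabulary index from the k highest-scoring entries of preds."""
    top_inds = preds.argsort()[::-1][:k]   # indices of the k largest scores
    top_scores = preds[top_inds]
    probs = top_scores / top_scores.sum()  # renormalize into a distribution
    return np.random.choice(top_inds, p=probs)


# Example usage (mirrors the loop bodies above):
#     current_ind = sample_top_k(preds, args.numwords)
#     current_word = vocabs_reversed[current_ind]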
def train(args):
    n_epochs = 20
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    raw = np.array([['<start>'] * args.numpad + j + ['<end>'] * args.numpad
                    for j in all_dat[r]])
    sample = np.array([get_indices(j) for j in raw])
    max_length = max(len(i) for i in sample)

    # seq_length, max_length, embed_size, output_size
    config_file = Config(drop_out=args.dropout,
                         max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=128,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize,
                         num_layers=args.numlayers,
                         numpad=args.numpad)
    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                m.run_epoch(sess, np.array(train), args.numpad)

                # evaluate test perplexity on the dev set
                test_size = len(dev)
                total_perplexity = 0.
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):
                    batch_perplexity = 0.
                    total_batches += 1
                    test_batch = dev[indices]
                    batch_x = []
                    batch_y = []
                    for post in test_batch:
                        for idx in range(len(post) - args.numpad - 1):
                            batch_x.append(post[idx:idx + args.numpad + 1])
                            try:
                                batch_y.append(post[idx + args.numpad + 2])
                            except IndexError:
                                batch_y.append(post[idx + args.numpad + 1])
                    # make the batches into matrices so they are easier to feed
                    batch_x_mat = np.matrix(batch_x)
                    batch_y_mat = np.array(batch_y).flatten()
                    feed = m.create_feed_dict(inputs_batch=batch_x_mat,
                                              labels_batch=batch_y_mat,
                                              dropout=config_file.drop_out)
                    perplexities = sess.run(m.error, feed_dict=feed)
                    print "Single word-pair perplexity: " + str(perplexities)
                    batch_perplexity += perplexities
                    total_perplexity += batch_perplexity
                    print "Epoch " + str(epoch + 1) + \
                        " Total test perplexity for batch " + \
                        str(k + 1) + ': ' + str(batch_perplexity)
                if total_perplexity < best_perplexity:
                    best_perplexity = total_perplexity
                    print "New Best Perplexity: " + str(best_perplexity)
                    saver.save(sess,
                               "./code/trainer/models/" + r.lower() +
                               "/single_epoch_" + str(epoch + 1) + ".ckpt")
            with open('./code/trainer/diag/diagnostics_single_backprop_new.csv',
                      'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([args.subreddit,
                                       str(best_perplexity),
                                       str(config_file.drop_out),
                                       str(config_file.hidden_unit_size),
                                       str(config_file.learning_rate),
                                       str(config_file.embed_size)])
def generate(args):
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)
    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    model = args.model.lower()
    model_path = './code/trainer/models/' + model + '/'
    # max_length is 1 because we want one word generated at a time
    c = Config(max_length=1,
               embed_size=embeddings.shape[1],
               output_size=embeddings.shape[0],
               batch_size=36,
               num_layers=args.numlayers,
               drop_out=1,
               sequence_length=args.seqlength,
               hidden_unit_size=args.hiddensize,
               peepholes=args.peephole)

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=c)
        saver = tf.train.Saver()
        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)
            saver.restore(session, tf.train.latest_checkpoint(model_path))
            all_sentences = []
            for sent in range(args.numsentences):
                current_word = '<start>'
                sentence = [current_word]
                # start from a zeroed LSTM cell state and hidden state
                cell = np.zeros((1, c.hidden_unit_size))
                h = np.zeros((1, c.hidden_unit_size))
                while current_word != '<end>':
                    current_ind = vocabs[current_word]
                    x = [[current_ind]]
                    returned = session.run(m.next_word,
                                           feed_dict={
                                               m.input_placeholder: x,
                                               m.cell_state: cell,
                                               m.hidden_state: h
                                           })
                    preds = returned[0]
                    cell = returned[1][0]
                    h = returned[1][1]
                    largest_10_inds = preds.argsort()[::-1][:args.numwords]
                    largest_10_unscaled_p = preds[largest_10_inds]
                    scaled_p = largest_10_unscaled_p / sum(largest_10_unscaled_p)
                    current_ind = np.random.choice(largest_10_inds, p=scaled_p)
                    current_word = vocabs_reversed[current_ind]
                    # resample if <end> is drawn before the sentence has 7 tokens
                    while len(sentence) < 7 and current_word == "<end>":
                        current_ind = np.random.choice(largest_10_inds, p=scaled_p)
                        current_word = vocabs_reversed[current_ind]
                    sentence.append(current_word)
                all_sentences.append(' '.join(sentence[1:-1]))
            with open('./code/trainer/diag/sentences.csv', 'a') as sentence_csv:
                csvwriter = csv.writer(sentence_csv)
                for sentence in all_sentences:
                    csvwriter.writerow([args.model,
                                        sentence,
                                        args.seqlength,
                                        args.hiddensize,
                                        args.peephole])
def train(args):
    n_epochs = 20
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    max_length = max(len(i) for i in sample)

    # seq_length, max_length, embed_size, output_size
    config_file = Config(drop_out=args.dropout,
                         max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=256,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize,
                         num_layers=args.numlayers,
                         sequence_length=args.seqlength,
                         peepholes=args.peephole)
    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                dev_loss = m.run_epoch(sess, np.array(train), np.array(dev))
                perplexity = 2 ** dev_loss
                print "Perplexity for Epoch " + str(epoch + 1) + ": " + str(perplexity)
                if perplexity < best_perplexity:
                    best_perplexity = perplexity
                    print "New Best Perplexity: " + str(perplexity)
                    saver.save(sess,
                               "./code/trainer/models/" + r.lower() +
                               "/single_epoch_" + str(epoch + 1) + "_" +
                               str(args.seqlength) + "_" +
                               str(args.peephole) + ".ckpt")
            with open('./code/trainer/diag/diagnostics_new_final.csv',
                      'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([args.subreddit,
                                       str(config_file.peephole),
                                       str(best_perplexity),
                                       str(config_file.drop_out),
                                       str(config_file.hidden_unit_size),
                                       str(config_file.learning_rate),
                                       str(config_file.embed_size),
                                       str(config_file.sequence_length)])
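# The conversion above assumes m.run_epoch() returns an average cross-entropy
# measured in bits (log base 2), so perplexity = 2 ** dev_loss. If the model's
# loss were natural-log cross-entropy (e.g. from
# tf.nn.sparse_softmax_cross_entropy_with_logits), the matching conversion
# would instead be:
#     perplexity = np.exp(dev_loss)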
def train(args):
    n_epochs = 100
    embeddings = get_embeddings(
        embed_path='./data/new_embeddings_final_filtered.pkl')
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # get vocab:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    max_length = max(len(i) for i in sample)

    # seq_length, max_length, embed_size, output_size
    config_file = Config(drop_out=args.dropout,
                         max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=256,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize,
                         num_layers=args.numlayers,
                         sequence_length=args.seqlength,
                         peepholes=args.peephole)
    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                m.run_epoch(sess, np.array(train))

                # evaluate test perplexity on the dev set
                test_size = len(dev)
                total_perplexity = 0
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):
                    total_batches += 1
                    test_batch = dev[indices]
                    max_len = max(len(case) for case in test_batch)
                    padded = generate_padded_seq(max_len,
                                                 config_file.output_size,
                                                 test_batch)
                    masks = np.matrix(get_masks(test_batch, max_len))
                    batch_x = [i[:-1] for i in padded]
                    batch_y = [i[1:] for i in padded]
                    batch_x_mat = np.matrix(batch_x)
                    batch_y_mat = np.matrix(batch_y)
                    batch_loss = 0.
                    sequences = get_sequence(
                        max_len, sequence_length=config_file.sequence_length)
                    for bat in sequences:
                        x = batch_x_mat[:, bat]
                        y = batch_y_mat[:, bat]
                        batch_mask = masks[:, bat]
                        feed = m.create_feed_dict(inputs_batch=x,
                                                  labels_batch=y,
                                                  dropout=config_file.drop_out,
                                                  mask_batch=batch_mask,
                                                  seq_length=[1] * len(test_batch))
                        loss = sess.run(m.loss, feed_dict=feed)
                        batch_loss += loss
                    batch_loss = batch_loss / len(sequences)
                    # perplexity from the mean cross-entropy loss
                    batch_perplexity = 2 ** batch_loss
                    total_perplexity += batch_perplexity
                    print "Epoch " + str(epoch + 1) + \
                        " Total test perplexity for batch " + \
                        str(k + 1) + ': ' + str(batch_perplexity)
                if total_perplexity < best_perplexity:
                    best_perplexity = total_perplexity
                    print "New Best Perplexity: " + str(best_perplexity)
                    saver.save(sess,
                               "./code/trainer/models/" + r.lower() +
                               "/single_epoch_" + str(epoch + 1) + ".ckpt")
            with open('./code/trainer/diag/diagnostics_new_final.csv',
                      'a') as diag_out:
                csv_diag_out = csv.writer(diag_out)
                csv_diag_out.writerow([args.subreddit,
                                       str(config_file.peephole),
                                       str(best_perplexity),
                                       str(config_file.drop_out),
                                       str(config_file.hidden_unit_size),
                                       str(config_file.learning_rate),
                                       str(config_file.embed_size),
                                       str(config_file.sequence_length)])
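# get_sequence() is used above to slice the padded time axis into consecutive
# windows of config_file.sequence_length columns; its definition is not shown
# in this section. The sketch below is a hypothetical implementation of that
# contract, assuming it simply returns lists of column indices covering
# 0..max_len-1; the project's actual helper may differ.
def get_sequence_sketch(max_len, sequence_length):
    """Return consecutive column-index windows covering the padded time axis."""
    windows = []
    for start in range(0, max_len, sequence_length):
        windows.append(range(start, min(start + sequence_length, max_len)))
    return windows


# Example: get_sequence_sketch(7, 3) -> [[0, 1, 2], [3, 4, 5], [6]]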