def make_tensorboard(tf_graphdir="/tmp/artificial_hotel_reviews/a4_graph",
                     V=100, H=1024, num_layers=2):
    reload(rnnlm)
    TF_GRAPHDIR = tf_graphdir

    # Clear old log directory.
    shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

    lm = rnnlm.RNNLM(V=V, H=H, num_layers=num_layers)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()
    lm.BuildSamplerGraph()

    summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)
    return summary_writer
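# Example usage (a minimal sketch; the V/H values shown here are illustrative,
# not taken from an actual run). Once the writer exists, TensorBoard can be
# pointed at the same log directory, e.g.
#   tensorboard --logdir /tmp/artificial_hotel_reviews/a4_graph
writer = make_tensorboard(V=50000, H=1024, num_layers=2)
writer.close()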
def setUp(self):
    sequence = ["a", "b", "c", "d"]
    self.vocab = vocabulary.Vocabulary(sequence)
    ids = self.vocab.words_to_ids(sequence)
    self.train_ids = np.array(ids * 10000, dtype=int)
    self.test_ids = np.array(ids * 100, dtype=int)

    model_params = dict(V=self.vocab.size, H=10,
                        softmax_ns=2, num_layers=1)
    self.lm = rnnlm.RNNLM(**model_params)
    self.lm.BuildCoreGraph()
    self.lm.BuildTrainGraph()
    self.lm.BuildSamplerGraph()
    # For toy model, ignore sampled softmax.
    self.lm.train_loss_ = self.lm.loss_
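# A minimal smoke test sketched against this fixture (the method name and the
# single-token input are illustrative additions, not from the original suite):
# it only checks that the initial hidden state can be computed for a batch.
def test_initial_state_runs(self):
    w = np.array([[self.vocab.words_to_ids(["a"])[0]]])
    with self.lm.graph.as_default():
        initializer = tf.global_variables_initializer()
    with tf.Session(graph=self.lm.graph) as session:
        session.run(initializer)
        h = session.run(self.lm.initial_h_, {self.lm.input_w_: w})
        self.assertIsNotNone(h)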
def generate_text(trained_filename, model_params):
    # Same as above, but as a batch.
    #max_steps = 20
    max_steps = 50
    num_samples = 10
    random_seed = 42

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildSamplerGraph()

    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability.
        tf.set_random_seed(random_seed)

        # Load the trained model.
        saver.restore(session, trained_filename)

        # Make initial state for a batch with batch_size = num_samples,
        # seeded with the start-of-review token.
        #w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
        w = np.repeat([[char_dict.get('<SOR>')]], num_samples, axis=0)
        h = session.run(lm.initial_h_, {lm.input_w_: w})

        # Take one step for each sequence on each iteration.
        for i in range(max_steps):
            h, y = sample_step(lm, session, w[:, -1:], h)
            w = np.hstack((w, y))

        # Print generated sequences, stopping each at the end-of-review token.
        for row in w:
            for i, word_id in enumerate(row):
                #print(vocab.id_to_word[word_id], end=" ")
                print(ids_to_words[word_id], end="")
                #if (i != 0) and (word_id == vocab.START_ID):
                if (i != 0) and (word_id == char_dict.get("<EOR>")):
                    break
            print("")
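# Example invocation (a sketch; the hyperparameters are illustrative and
# char_dict / ids_to_words from the character-level pipeline are assumed to be
# in scope; the checkpoint path is the default written by run_training below):
sample_model_params = dict(V=len(char_dict), H=1024,
                           softmax_ns=len(char_dict), num_layers=2)
generate_text("/tmp/artificial_hotel_reviews/a4_model/rnnlm_trained",
              sample_model_params)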
def loadAndPreprocessData():
    '''
    Read all the words to create a vocabulary
    '''
    all_tokens = []
    indir = '../preprocess/'
    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('canonicalized_words_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        w = line.rstrip()
                        if w != '':
                            all_tokens.append(w)
    print('Processed all tokens: ', len(all_tokens))

    tokens_dict = Counter()
    for w in all_tokens:
        # Collapse digit tokens into a single DG marker.
        if w.startswith('DG') and w.endswith('DG'):
            w = 'DG'
        tokens_dict[w] += 1

    '''
    Remove noisy tokens - see notebook for exploratory analysis.
    The first ~2500 tokens when sorted by key are noisy, like "!!!!" or
    "* * * *" - e.g. the end of a chapter.
    '''
    noisy_tokens = sorted(tokens_dict)[0:2507]
    print('Identified noisy tokens - some examples: ', noisy_tokens[0:30])

    '''
    Clean up the tokens now that we know the noisy tokens,
    and then generate the vocab
    '''
    noisy_tokens = set(noisy_tokens)
    words = [w for w in all_tokens if w not in noisy_tokens]
    # TODO: Should make V configurable.
    V = 50000
    vocab = vocabulary.Vocabulary((word for word in words), size=V)
    print('Vocabulary created with size: ', vocab.size)

    '''
    Read in the sentences already parsed from the ~3000-book Gutenberg subset
    '''
    sents = []
    indir = '../preprocess/'
    books = []
    for root, dirs, filenames in os.walk(indir):
        for filename in filenames:
            if filename.startswith('parsed_sents_'):
                with open(indir + filename, 'r') as f:
                    for line in f.readlines():
                        sents.append(line.rstrip())
    print('Parsed sentences loaded into memory: ', len(sents))
    print('The 10,000th sentence is: ', sents[10000])

    '''
    Prepare training and test sentences
    '''
    split = 0.8
    shuffle = True
    sentences = np.array(sents, dtype=object)
    fmt = (len(sentences), sum(map(len, sentences)))
    print("Loaded %d sentences (%g tokens)" % fmt)

    if shuffle:
        rng = np.random.RandomState(shuffle)  # the shuffle flag doubles as the RNG seed
        rng.shuffle(sentences)  # in-place

    train_frac = 0.8
    split_idx = int(train_frac * len(sentences))
    train_sentences = sentences[:split_idx]
    test_sentences = sentences[split_idx:]

    fmt = (len(train_sentences), sum(map(len, train_sentences)))
    print("Training set: %d sentences (%d tokens)" % fmt)
    fmt = (len(test_sentences), sum(map(len, test_sentences)))
    print("Test set: %d sentences (%d tokens)" % fmt)

    '''
    Apply the vocab to the train and test sentences and convert words to ids
    to start training
    '''
    ## Preprocess sentences: convert words to ids based on the vocab created above.
    ## Do this in batches to avoid crashes due to insufficient memory.
    batch_size = 50000
    num_of_batches = int(round(len(train_sentences) / batch_size))
    print('Preprocessing train sentences - number of batches: ', num_of_batches)
    train_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        if i % 15 == 0:
            print('Completed Batches: ', i)
        train_id_batches.append(
            utils.preprocess_sentences(train_sentences[start:end], vocab))
        start = end
        end += batch_size

    # Flatten the nested lists for a 1D tensor.
    temp = utils.flatten(train_id_batches)
    train_ids = utils.flatten(temp)
    print('Train sentences converted to their IDs, including start, end and unknown word tokens')

    # Repeat the same with the test data.
    batch_size = 50000
    num_of_batches = int(round(len(test_sentences) / batch_size))
    if num_of_batches > 10:
        num_of_batches = 10
    print('Preprocessing test sentences - number of batches: ', num_of_batches)
    test_id_batches = []
    start = 0
    end = start + batch_size
    for i in range(num_of_batches):
        print('Batch: ', i)
        test_id_batches.append(
            utils.preprocess_sentences(test_sentences[start:end], vocab))
        start = end
        end += batch_size

    test_ids = utils.flatten(utils.flatten(test_id_batches))
    print('Test sentences converted to their IDs, including start, end and unknown word tokens')

    # Training parameters.
    max_time = 40
    batch_size = 64
    learning_rate = 0.01
    num_epochs = 3

    # Model parameters.
    model_params = dict(V=vocab.size, H=100, softmax_ns=200, num_layers=1)

    TF_SAVEDIR = "tf_saved"
    checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
    trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

    # Will print status every this many seconds.
    print_interval = 120

    # Clear old log directory.
    shutil.rmtree("tf_summaries", ignore_errors=True)

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()

    # Explicitly add global initializer and variable saver to LM graph.
    with lm.graph.as_default():
        initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Clear old save directory.
    shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
    if not os.path.isdir(TF_SAVEDIR):
        os.makedirs(TF_SAVEDIR)

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability.
        tf.set_random_seed(42)
        session.run(initializer)

        for epoch in range(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = utils.batch_generator(train_ids, batch_size, max_time)
            print("[epoch %d] Starting epoch %d" % (epoch, epoch))

            #### YOUR CODE HERE ####
            # Run a training epoch.
            run_epoch(lm, session, bi, train=True, learning_rate=learning_rate)
            #### END(YOUR CODE) ####

            print("[epoch %d] Completed in %s" % (
                epoch, utils.pretty_timedelta(since=t0_epoch)))

            # Save a checkpoint.
            saver.save(session, checkpoint_filename, global_step=epoch)

            ##
            # score_dataset will run a forward pass over the entire dataset
            # and report perplexity scores. This can be slow (around 1/2 to
            # 1/4 as long as a full epoch), so you may want to comment it out
            # to speed up training on a slow machine. Be sure to run it at the
            # end to evaluate your score.
            print("[epoch %d]" % epoch, end=" ")
            score_dataset(lm, session, train_ids, name="Train set")
            print("[epoch %d]" % epoch, end=" ")
            score_dataset(lm, session, test_ids, name="Test set")
            print("")

        # Save final model.
        saver.save(session, trained_filename)
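# A direct invocation (sketch): the function above reads the preprocessed
# Gutenberg files, builds the vocabulary, trains for num_epochs, and leaves
# checkpoints under tf_saved/; it does not return anything.
loadAndPreprocessData()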
def setUp(self):
    model_params = dict(V=512, H=100, num_layers=1)
    self.lm = rnnlm.RNNLM(**model_params)
    self.lm.BuildCoreGraph()
    self.lm.BuildSamplerGraph()
            lm.dropout_keep_prob_: 1.0,
            lm.learning_rate_: 0.1
        })
    #### END(YOUR CODE) ####
    # Note indexing here:
    #   [batch_size, max_time, 1] -> [batch_size, 1]
    return final_h, samples[:, -1, :]


# Same as above, but as a batch.
max_steps = 20
num_samples = 10
random_seed = 42

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildSamplerGraph()

with lm.graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability.
    tf.set_random_seed(random_seed)

    # Load the trained model.
    saver.restore(session, trained_filename)

    # Make initial state for a batch with batch_size = num_samples.
    w = np.repeat([[vocab.START_ID]], num_samples, axis=0)
def run_training(train_ids, test_ids, model_params, max_time=100,
                 batch_size=256, learning_rate=0.002, num_epochs=20,
                 tf_savedir="/tmp/artificial_hotel_reviews/a4_model"):
    #V = len(words_to_ids.keys())

    # Training parameters
    ## add parameter sets for each attack/defense configuration
    #max_time = 25
    #batch_size = 100
    #learning_rate = 0.01
    #num_epochs = 10

    # Model parameters
    #model_params = dict(V=vocab.size,
    #                    H=200,
    #                    softmax_ns=200,
    #                    num_layers=2)
    #model_params = dict(V=len(words_to_ids.keys()),
    #                    H=1024,
    #                    softmax_ns=len(words_to_ids.keys()),
    #                    num_layers=2)
    #model_params = dict(V=V, H=H, softmax_ns=softmax_ns, num_layers=num_layers)

    #TF_SAVEDIR = "/tmp/artificial_hotel_reviews/a4_model"
    TF_SAVEDIR = tf_savedir
    checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
    trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

    # Will print status every this many seconds.
    #print_interval = 5
    print_interval = 30

    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    lm.BuildTrainGraph()

    # Explicitly add global initializer and variable saver to LM graph.
    with lm.graph.as_default():
        initializer = tf.global_variables_initializer()
        saver = tf.train.Saver()

    # Clear old save directory.
    shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
    if not os.path.isdir(TF_SAVEDIR):
        os.makedirs(TF_SAVEDIR)

    with tf.Session(graph=lm.graph) as session:
        # Seed RNG for repeatability.
        tf.set_random_seed(42)
        session.run(initializer)

        # Check trainable variables (debugging aid):
        #variables_names = [v.name for v in tf.trainable_variables()]
        #values = session.run(variables_names)
        #for k, v in zip(variables_names, values):
        #    print("Variable: ", k)
        #    print("Shape: ", v.shape)
        #    print(v)

        for epoch in range(1, num_epochs + 1):
            t0_epoch = time.time()
            bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
            print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))

            # Run a training epoch.
            run_epoch(lm, session, batch_iterator=bi, train=True,
                      verbose=True, tick_s=10, learning_rate=learning_rate)

            print("[epoch {:d}] Completed in {:s}".format(
                epoch, utils.pretty_timedelta(since=t0_epoch)))

            # Save a checkpoint.
            saver.save(session, checkpoint_filename, global_step=epoch)

            ##
            # score_dataset will run a forward pass over the entire dataset
            # and report perplexity scores. This can be slow (around 1/2 to
            # 1/4 as long as a full epoch), so you may want to comment it out
            # to speed up training on a slow machine. Be sure to run it at the
            # end to evaluate your score.
            #print("[epoch {:d}]".format(epoch), end=" ")
            #score_dataset(lm, session, train_ids, name="Train set")
            print("[epoch {:d}]".format(epoch), end=" ")
            score_dataset(lm, session, test_ids, name="Test set")
            print("")

        # Save final model.
        saver.save(session, trained_filename)

    return trained_filename
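# Example invocation (a sketch; the hyperparameters mirror the commented-out
# parameter set above, and train_ids / test_ids are assumed to be the id
# arrays produced by the preprocessing code earlier in this file):
hotel_model_params = dict(V=len(words_to_ids.keys()), H=1024,
                          softmax_ns=len(words_to_ids.keys()), num_layers=2)
trained_filename = run_training(train_ids, test_ids,
                                model_params=hotel_model_params,
                                max_time=100, batch_size=256,
                                learning_rate=0.002, num_epochs=20)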