def loadEmbeddings(filepath=DEFAULT_FILE_PATH, dimensions=50):
    '''
    Read the embedding mapping.

    Every non-blank line of the file is expected to hold a token followed by
    its whitespace-separated vector components. Blank lines are skipped.

    Params:
        filepath: path of the embedding text file.
        dimensions: number of vector components expected on every line.
    Returns:
        embeddings: numpy array of shape (number of non-blank lines, dimensions);
                    row i holds the vector read from the i-th non-blank line.
        tok2id: dict mapping each token to its row index. When a token occurs
                more than once, the index of its last occurrence is kept.
    Raises:
        RuntimeError: a line does not carry exactly `dimensions` components.
        ValueError: a component cannot be parsed as a float.
    '''
    # First pass: count the rows so that the matrix can be allocated up front.
    with open(filepath, "r") as fin:
        count = sum(1 for line in fin if line.strip())
    vprint(True, "Vocabulary size = " + str(count), color="BLUE")

    embeddings = np.zeros((count, dimensions))
    tok2id = {}
    row_index = 0
    # Second pass: fill the matrix and the token -> row index table.
    with open(filepath) as ifs:
        for line in ifs:
            line = line.strip()
            if not line:
                continue
            row = line.split()
            token = row[0]
            data = [float(x) for x in row[1:]]
            if len(data) != dimensions:
                # The sizes used to be dumped with print statements; they are
                # now part of the error itself.
                raise RuntimeError(
                    "wrong number of dimensions: expected " + str(dimensions)
                    + ", got " + str(len(data))
                    + " for token " + repr(token))
            tok2id[token] = row_index
            embeddings[row_index] = np.asarray(data)
            row_index += 1
    return embeddings, tok2id
def main():
    '''
    Entry point used for unit testing this module.

    Parses the command-line options listed below, constructs a
    HierLSTMTransModel from them and prints the arguments the model reports.

        --hidden_size           size of the RNN cell hidden state
        --num_layers            number of stacked RNN layers (default 1, i.e. no stacking)
        --input_seq_length      maximum length of each input sequence, or larger
        --target_seq_length     maximum length of each target sequence, or larger
        --input_embedding_size  embedding size of the input vectors
        --output_vocab_size     size of the output vocabulary
        --batch_size            number of sequences in each batch
        --optimizer_choice      "rms" (default), "adam" or "grad_desc"
        --learning_rate         learning rate
        --grad_clip             maximum gradient amplitude allowed (default None)
        -t / --test             unit-test flag
        -v / --verbose          verbosity flag
    '''
    # (flags, keyword arguments) of every option, in registration order.
    option_table = [
        # RNN cell hidden state's size
        (('--hidden_size',),
         dict(type=int, default=96,
              help='size of RNN cell hidden state')),
        # Number of stacked RNN layers. Only a single layer implemented
        (('--num_layers',),
         dict(type=int, default=1,
              help='number of stacked RNN layers')),
        # Larger than the max length of each input sequence
        (('--input_seq_length',),
         dict(type=int, default=20,
              help='maximum length of each input sequence or larger')),
        # Larger than the max of each target sequence
        (('--target_seq_length',),
         dict(type=int, default=20,
              help='maximum length of each target sequence or larger')),
        # Embedding size of input
        (('--input_embedding_size',),
         dict(type=int, default=96,
              help='embedding size of input vectors')),
        # Size of the output vocabulary
        (('--output_vocab_size',),
         dict(type=int, default=92,
              help='size of output vocabulary')),
        # Batch size
        (('--batch_size',),
         dict(type=int, default=100,
              help='number of sequences in a batch')),
        # Choice of optimizer
        (('--optimizer_choice',),
         dict(type=str, default='rms',
              help='rms (defualt), adam, grad_desc')),
        # Learning rate
        (('--learning_rate',),
         dict(type=float, default=0.002,
              help='Learning rate')),
        # Gradient clip, i.e. maximum value of gradient amplitude allowed
        (('--grad_clip',),
         dict(type=float, default=None,
              help='gradient upbound, i.e. maximum value of gradient amplitute allowed')),
        # Model unit testing flag, default to False
        (('-t', '--test'),
         dict(action='store_true',
              help='only set to true when performing unit test')),
        # Verbosity flag, default to False
        (('-v', '--verbose'),
         dict(action='store_true',
              help='only set to true when you want verbosity')),
    ]

    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    # Parse the arguments, and construct the model
    args = parser.parse_args()
    model = HierLSTMTransModel(args)

    print("arguments:")
    vprint(args.verbose, model.get_args().__dict__, color=None)
def __init__(self, batch_size=10, seq_lengths=None, token_sizes=None,
             usage="train", if_testing=False):
    '''
    Initialization of a data loader.

    Params:
        batch_size: size of batch.
        seq_lengths: an integer list, [input_seq_length, target_seq_length];
                     defaults to [2, 3].
        token_sizes: an integer list, [input_embedding_size, target_word_index_size],
                     where target_word_index_size = 1; defaults to [20, 1].
        usage: what this data loader is used for: "train", "dev" or "test".
        if_testing: if true, the embeddings are not loaded and the number of
                    batches is forced to 15 (testing mode; the original comment
                    says this mode returns random data).
    Returns:
        None
    Raises:
        ValueError: `usage` is not one of "train", "dev", "test".
    '''
    # train samples 84973
    # dev samples   10614
    # test samples  10617

    # Every split currently reads the dev file; the per-split files
    # ('../data/train_data.txt', '../data/test_data.txt') are disabled in the
    # original code. "dev" is documented as a valid usage but used to leave
    # data_file unset, so it is mapped explicitly here.
    data_paths = {
        "train": '../data/dev_new_data.txt',
        "dev": '../data/dev_new_data.txt',
        "test": '../data/dev_new_data.txt',
    }
    if usage not in data_paths:
        raise ValueError("Unsupported usage: " + str(usage)
                         + ' (expected "train", "dev" or "test")')

    # None sentinels instead of list defaults: a default list would be shared
    # by every instance created without that argument.
    if seq_lengths is None:
        seq_lengths = [2, 3]
    if token_sizes is None:
        token_sizes = [20, 1]

    self.batch_size = batch_size
    self.seq_lengths = seq_lengths
    self.complex_length = seq_lengths[0]
    self.simple_length = seq_lengths[1]
    self.token_sizes = token_sizes
    self.embedding_size = token_sizes[0]
    self.usage = usage
    self.if_testing = if_testing
    # 10614 is the number of dev samples (changed from 102696 % batch_size).
    self.num_batches = int(10614 / batch_size)

    vprint(True, "Loading embeddings...", color="BLUE")

    # NOTE(review): the handle is kept open on purpose -- presumably other
    # methods of the loader read from it; confirm that it is closed there.
    self.data_file_path = data_paths[usage]
    self.data_file = open(self.data_file_path)

    if self.if_testing:
        # Testing code. Returns random numbers. Do not touch..
        self.num_batches = 15
    else:
        # Real work
        self.embeddings, self.tok2id = loadEmbeddings(
            filepath=DEFAULT_FILE_PATH, dimensions=self.embedding_size)
        vprint(True, "Finished loading embeddings", color="BLUE")
def loadDict(filepath=DEFAULT_FILE_PATH, dimensions=50):
    '''
    Read the index -> token table of an embedding file.

    The first whitespace-separated field of every non-blank line is taken as
    the token; tokens are numbered from 0 in file order.

    Params:
        filepath: path of the embedding text file.
        dimensions: unused; kept so existing callers keep working.
    Returns:
        id2tok: dict mapping each row index to its token.
    '''
    id2tok = {}
    # One pass is enough: the vocabulary size is the size of the table.
    with open(filepath, "r") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            id2tok[len(id2tok)] = line.split()[0]
    vprint(True, "Vocabulary size = " + str(len(id2tok)), color="BLUE")
    return id2tok
def __init__(self, args, training=True):
    '''
    Initialization function for the class Model: builds the encoder/decoder
    graph, the cost and (for training sessions) the train operator.

    Params:
        args: contains arguments required for the Model creation --
            args.hidden_size (i.e. the size of hidden state)
            args.num_layers (default = 1, i.e. no stacking),
            args.input_seq_length,
            args.target_seq_length,
            args.input_embedding_size,
            args.output_vocab_size,
            args.target_token_size (=1, target token is target word's index)
            args.batch_size (i.e. the number of sequences in each batch),
            args.optimizer_choice (default = "rms", also could be "adam", "grad_desc"),
            args.learning_rate,
            args.grad_clip
            args.test
            args.verbose
        training: indicates whether this is a training session; gradients and
                  the train operator are only built when it is true.
    Returns:
        None
    Raises:
        ValueError: args.num_layers is not a positive integer, or (training
                    only) args.optimizer_choice is not supported.

    NOTE Each cell's input is batch_size x 1 x input_embedding_size
    NOTE Each cell's output is batch_size x 1 x hidden_size (needs to be converted)
    '''
    # Store the arguments, and print the important argument values
    self.args = args
    verbose = self.args.verbose
    print("VanillaLSTMTransModel initializer is called..\n"
          + "Time: " + time.ctime() + "\n"
          + " args.hidden_size (H) = " + str(self.args.hidden_size) + "\n"
          + " args.input_embedding_size (Di) = " + str(self.args.input_embedding_size) + "\n"
          + " args.output_vocab_size (Vo) = " + str(self.args.output_vocab_size) + "\n"
          + " args.num_layers = " + str(self.args.num_layers) + "\n"
          + " args.optimizer_choice = " + self.args.optimizer_choice + "\n"
          + " args.learning_rate = " + str(self.args.learning_rate) + "\n"
          + " args.grad_clip = " + str(self.args.grad_clip) + "\n")
    if training:
        print("This is a training session..")
    else:
        print("This is a session other than training..")
    print("Input batch size = " + str(self.args.batch_size) + "\n")

    # Validate the layer count before building anything. The type is checked
    # first so that a non-integer never reaches the numeric comparison.
    num_layers = self.args.num_layers
    if not isinstance(num_layers, int) or num_layers <= 0:
        raise ValueError(
            "Specified number of layers is non-positive or is not an integer.")

    def make_lstm_cell():
        # hidden_size is the dimension of the hidden state. A new cell object
        # is returned on every call because stacked layers must not share one.
        return tf.nn.rnn_cell.LSTMCell(
            num_units=self.args.hidden_size,
            initializer=tf.contrib.layers.xavier_initializer(),
            state_is_tuple=True)

    def make_recurrent_unit():
        # Single cell, or a stack with one distinct cell per layer.
        if num_layers == 1:
            return make_lstm_cell()
        return tf.nn.rnn_cell.MultiRNNCell(
            [make_lstm_cell() for _ in range(num_layers)],
            state_is_tuple=True)

    # convert cell's outputs (batch_size x hidden_size for each cell) to batch_size x output_vocab_size
    # y_hat = softmax(tf.matmul(cell_output, output_ws)); there is no bias term for now
    with tf.variable_scope("vanLSTM_decoder/decoder_accessory"):
        self.output_ws = tf.get_variable(
            "output_ws",
            [self.args.hidden_size, self.args.output_vocab_size])
    output_affine_map_lambda = lambda cell_output_: tf.matmul(
        cell_output_, self.output_ws)
    output_converter_lambda = lambda cell_output_: tf.nn.softmax(
        logits=output_affine_map_lambda(cell_output_),
        dim=-1)  # -1: last dimension
    self.output_affine_map_lambda = output_affine_map_lambda
    self.output_converter_lambda = output_converter_lambda

    # Multi-layer RNN construction, if more than one layer
    if num_layers >= 2:
        vprint(True, "Stacked RNN: number of layers = " + str(num_layers))
    encoder_cell = make_recurrent_unit()
    decoder_cell = make_recurrent_unit()
    # TODO: (improve) Dropout layer can be added here

    # Store the recurrent unit
    self.encoder_cell = encoder_cell
    self.decoder_cell = decoder_cell

    # Create encoder and decoder RNNChain instances
    # NOTE(review): the encoder is also given name="vanLSTM_decoder"; this looks
    # like a copy-paste slip, but what RNNChain does with `name` is not visible
    # here, so it is left unchanged -- confirm.
    encoder = RNNChain(self.encoder_cell,
                       name="vanLSTM_decoder",
                       scope="vanLSTM_encoder")
    decoder = RNNChain(self.decoder_cell,
                       name="vanLSTM_decoder",
                       scope="vanLSTM_decoder")
    self.encoder = encoder
    self.decoder = decoder

    # Input data contains sequences of input tokens of input_embedding_size dimension
    self.input_data = tf.placeholder(
        tf.float32,
        [None, self.args.input_seq_length, self.args.input_embedding_size])
    # Target data contains sequences of output tokens of target_token_size dimension (=1)
    self.target_data = tf.placeholder(
        tf.int32,
        [None, self.args.target_seq_length, self.args.target_token_size])
    # Target lengths list: one integer per sequence of the batch, the number
    # of non-padding tokens of that sequence.
    self.target_lens_list = tf.placeholder(tf.int32, [None])
    # Learning rate
    self.lr = tf.Variable(self.args.learning_rate,
                          trainable=False,
                          name="learning_rate")
    # Initial cell state of LSTM (initialized with zeros)
    self.initial_state = encoder_cell.zero_state(
        batch_size=self.args.batch_size, dtype=tf.float32)

    # Preprocessing the information got from placeholders.
    # First, target_lens_list does not need any further actions.
    target_lens_list = self.target_lens_list
    # Second, input_data and target_data need reshaping.
    # Split inputs and targets according to sequences: a 3D Tensor, num_of_seq x seq_length x Di/Vo
    # -> list of size seq_length, each of whose element is of num_of_seq x 1 x Di/Vo
    if tf.__version__.startswith('0.'):
        # TensorFlow 0.x keyword names.
        input_data_temp = tf.split(split_dim=1,
                                   num_split=self.args.input_seq_length,
                                   value=self.input_data)
        target_data_temp = tf.split(split_dim=1,
                                    num_split=self.args.target_seq_length,
                                    value=self.target_data)
    else:
        # TensorFlow 1.x renamed the keywords: split(value, num_or_size_splits, axis).
        input_data_temp = tf.split(
            value=self.input_data,
            num_or_size_splits=self.args.input_seq_length,
            axis=1)
        target_data_temp = tf.split(
            value=self.target_data,
            num_or_size_splits=self.args.target_seq_length,
            axis=1)
    # Squeeze: list of size seq_length, each of which is num_of_seq x 1 x Di/Vo
    # -> list of size seq_length, each of which is num_of_seq x Di/Vo
    input_data_list = [
        tf.squeeze(input=list_member, axis=[1])
        for list_member in input_data_temp
    ]
    target_data_list = [
        tf.squeeze(input=list_member, axis=[1])
        for list_member in target_data_temp
    ]
    del input_data_temp, target_data_temp

    ## This is where the LSTM models differ from each other in substance.
    ## The other code might also differ but they are not substantial.
    # call the encoder
    vprint(True, "Building encoder...", color="MAG")
    encoder_start_time = time.time()
    _, self.encoder_final_state = encoder.run(
        inputs=input_data_list,
        chain_length=None,
        cell_input_size=[
            self.args.batch_size, self.args.input_embedding_size
        ],
        initial_state=self.initial_state,
        feed_previous=False,
        verbose=verbose)
    # NOTE(review): this stores the *initial* state under the name
    # encoder_end_state; kept as in the original -- confirm it is intended.
    self.encoder_end_state = self.initial_state
    encoder_end_time = time.time()
    vprint(True,
           " -- Encoder built. Time used: "
           + str(encoder_end_time - encoder_start_time) + " s",
           color="MAG")

    # call the decoder
    vprint(True, "Building decoder...", color="MAG")
    decoder_start_time = time.time()
    # cell_outputs is list of length target_seq_length, each element is batch_size x hidden_size
    self.cell_outputs, _ = decoder.run(
        inputs=input_data_list,
        chain_length=self.args.target_seq_length,
        cell_input_size=[
            self.args.batch_size, self.args.output_vocab_size
        ],
        initial_state=self.encoder_final_state,
        feed_previous=True,
        loop_func=self.output_converter_lambda,
        verbose=verbose)
    decoder_end_time = time.time()
    vprint(True,
           " -- Decoder built. Time used: "
           + str(decoder_end_time - decoder_start_time) + " s",
           color="MAG")

    vprint(True, "Building output converter...", color="MAG")
    converter_start_time = time.time()
    # output_data is softmaxed. It is a list of length target_seq_length,
    # each element is batch_size x output_vocab_size
    self.output_data = [
        output_converter_lambda(cell_output_)
        for cell_output_ in self.cell_outputs
    ]
    converter_end_time = time.time()
    vprint(True,
           " -- Converter built. Time used: "
           + str(converter_end_time - converter_start_time) + " s",
           color="MAG")

    # Compute the cost scalar: specifically, the average cost per sequence
    vprint(True, "Building cost calculator...", color="MAG")
    sum_of_cost = self.get_sum_of_cost(cell_outputs=self.cell_outputs,
                                       targets=target_data_list,
                                       targets_lens=target_lens_list)
    self.cost = tf.div(sum_of_cost, self.args.batch_size)
    print("\n[DEBUG] self.cost: ")
    print(self.cost)

    # We only deal with back-propagation during training phase.
    if not training:
        return

    # Get trainable_variables list and count them.
    vprint(True, "\nAggregating all trainable variables...", color="BLUE")
    trainable_vars = tf.trainable_variables()
    num_trainable_components = 0
    vprint(True,
           "\nNumber of trainable Tensors = " + str(len(trainable_vars)),
           color="GREEN")
    for var in trainable_vars:
        num_trainable_components += np.prod(var.get_shape().as_list())
        vprint(True,
               " " + str(var.name)
               + "\t" + str(var.get_shape())
               + " x " + str(var.dtype.name),
               color="GREEN")
    vprint(True,
           "Number of trainable scalar components = "
           + str(num_trainable_components),
           color="GREEN")
    # (lower bound, message), largest first; the first bound that is reached
    # wins. Nothing is printed below one thousand components.
    magnitude_notes = (
        (1e9, " -- that is in the order of 10e9 to 10e-Infinity: billions or higher"),
        (1e8, " -- that is in the order of 10e8: hundreds of millions\n"),
        (1e7, " -- that is in the order of 10e7: tens of millions\n"),
        (1e6, " -- that is in the order of 10e6: millions\n"),
        (1e5, " -- that is in the order of 10e5: hundreds of thousands\n"),
        (1e4, " -- that is in the order of 10e4: tens of thousands\n"),
        (1e3, " -- that is in the order of 10e3: thousands\n"),
    )
    for lower_bound, note in magnitude_notes:
        if num_trainable_components >= lower_bound:
            vprint(True, note, color="GREEN")
            break
    self.num_of_trainable_components = num_trainable_components

    # Compute the gradient of cost with respect of the trainable variables.
    vprint(True,
           "Calculating gradient expressions for all trainable variables. Be patient...",
           color="BLUE")
    grad_start_time = time.time()
    # self.gradients holds one entry per trainable variable, in the same
    # order: the gradient tensor, or None when the cost does not depend on it.
    self.gradients = tf.gradients(self.cost, trainable_vars)
    grad_end_time = time.time()
    vprint(True,
           " -- Finished calculating gradient expressions. Time used: "
           + str(grad_end_time - grad_start_time) + " s",
           color="BLUE")

    # A hack: when testing, elements in gradients may ALL be None, and it
    # causes problems in clip_by_global_norm().
    # This is just for validation of the code.
    if self.args.test:
        print("TESTING TESTING TESTING")
        for i, gradient in enumerate(self.gradients):
            # Identity test: `== None` must not be applied to a tensor.
            if gradient is None:
                self.gradients[i] = tf.zeros(
                    shape=trainable_vars[i].get_shape(),
                    dtype=tf.float32)

    # Clip the gradients if a global-norm bound was given.
    if self.args.grad_clip is not None:
        clipped_grads, _ = tf.clip_by_global_norm(self.gradients,
                                                  self.args.grad_clip)
    else:
        clipped_grads = self.gradients

    # Using RMSprop by default, inspired by the LSTM paper of Dr. Alahi,
    # Prof. Saverese, and Prof. Fei-Fei Li
    optimizer_classes = {
        "rms": tf.train.RMSPropOptimizer,
        "adam": tf.train.AdamOptimizer,
        "grad_desc": tf.train.GradientDescentOptimizer,
    }
    if self.args.optimizer_choice not in optimizer_classes:
        raise ValueError("Optimizer not supported: "
                         + self.args.optimizer_choice)
    optimizer = optimizer_classes[self.args.optimizer_choice](self.lr)

    # Train operator. Apply gradients. If a gradient of a variable is None,
    # it will be weeded out.
    self.train_op = optimizer.apply_gradients(
        list(zip(clipped_grads, trainable_vars)))
def __init__(self, args, model_choice="V", if_testing=False):
    '''
    Instantiate a model, a save file, and a log text file.

    Params:
        args: contains arguments required for the model creation. When
              args.continue_training is set, the arguments saved in
              ../RUN_<abbr>/args.pkl are used for the model instead.
        model_choice: specify the choice of model, default to "VanillaLSTMTransModel".
            V:    VanillaLSTMTransModel
            AV:   AttenVanillaLSTMTransModel
            H:    HierLSTMTransModel, i.e. Hierarchical LSTM Model
            A/AH: AttenHierLSTMTransModel, i.e. Hierarchical LSTM Model with Attention
            The full class names are accepted as well.
        if_testing: stored as self.if_testing.
    Returns:
        None
    Raises:
        ValueError: the model choice is not supported, or (continuing
                    training) the saved arguments cannot be loaded.
        OSError: first-time, non-test training and the run directory already exists.
    '''
    # Every accepted spelling -> abbreviation used for the run directory.
    # The same table serves first-time and continued training, so a run saved
    # under ../RUN_AH is found again when training is continued.
    model_aliases = {
        "VanillaLSTMTransModel": "V", "V": "V",
        "AttenVanillaLSTMTransModel": "AV", "AV": "AV",
        "HierLSTMTransModel": "H", "H": "H",
        "AttenHierLSTMTransModel": "AH", "AH": "AH", "A": "AH",
    }
    if model_choice not in model_aliases:
        raise ValueError("Model choice: " + str(model_choice)
                         + " is not supported")
    model_abbr = model_aliases[model_choice]

    def build_model(model_args):
        # The classes are looked up only when needed, so a class that is not
        # available matters only if it is the one actually requested.
        if model_abbr == "V":
            return VanillaLSTMTransModel(model_args)
        if model_abbr == "AV":
            return AttenVanillaLSTMTransModel(model_args)
        if model_abbr == "H":
            return HierLSTMTransModel(model_args)
        return AttenHierLSTMTransModel(model_args)

    # Save the args
    self.args = args
    self.if_testing = if_testing
    # Directory to save things
    self.directory = "../RUN_" + model_abbr
    args_path = os.path.join(self.directory, 'args.pkl')

    # Instantiate a model
    build_start_time = time.time()
    if not args.continue_training:
        # First time training
        vprint(True, "Trainer is called. First time training.")
        vprint(True,
               "\033[1;m" + "Building computation graph for the model..." + "\033[0;m",
               color="CYAN")
        self.model = build_model(args)
        if args.test:
            # Test runs reuse the directory; create it only when it is missing.
            if not os.path.isdir(self.directory):
                os.makedirs(self.directory)
        else:
            # Fails if the directory already exists, as before.
            os.mkdir(self.directory)
    else:
        # Continuing training
        vprint(True, "Trainer is called. Continuing training.")
        try:
            # NOTE: pickle.load can execute arbitrary code; only load
            # args.pkl files written by this trainer.
            with open(args_path, 'rb') as f:
                saved_args = pickle.load(f)
            # Don't forget this line below
            saved_args.continue_training = True
        except Exception:
            raise ValueError(
                "The specified model is either not trained, damaged, or in a wrong path. It should be ../RUN_"
                + model_abbr + "/args.pkl")
        self.args = saved_args
        vprint(True,
               "\033[1;m" + "Rebuilding computation graph for the model..." + "\033[0;m",
               color="CYAN")
        self.model = build_model(saved_args)
    build_end_time = time.time()
    vprint(True,
           "\033[1;m" + "Graph built. Time used: "
           + str(build_end_time - build_start_time) + " seconds" + "\033[0;m",
           color="CYAN")

    # Save the arguments on first-time training only; when continuing from a
    # previous training they are not written again.
    if not args.continue_training:
        # Binary write mode: appending would leave a stale pickle first in the file.
        with open(args_path, 'wb') as f:
            pickle.dump(args, f)
        vprint(True,
               "Arguments saved to file: " + self.directory + "/args.pkl")

    # Append a header to both logs; the files are created if not found.
    for log_name in ('log.txt', 'reduced_log.txt'):
        with open(os.path.join(self.directory, log_name), 'a') as log_file:
            log_file.write("Log file: " + self.directory + '\n')
def train(self, num_epochs=100, save_every_batch=400):
    '''
    Train the model.

    Params:
        num_epochs: number of epochs, default to 100
        save_every_batch: period of saving,
            epoch * data_loader.get_num_batches() + batch_index, default to 400.
            NOTE in the current implementation, this argument is unused.
            The model is saved after each epoch instead.
    Returns:
        None
    Raises:
        ValueError: continuing training, but no checkpoint is found in the
                    run directory.
    '''
    args = self.args
    decay_rate = 0.95  # You may modify it yourself. decay_rate in (0,1]
    log_path = os.path.join(self.directory, 'log.txt')
    reduced_log_path = os.path.join(self.directory, 'reduced_log.txt')

    # Both logs are closed even when training stops with an exception.
    with open(log_path, 'a') as log, open(reduced_log_path, 'a') as reduced_log:
        data_loader = Dataloader(
            batch_size=args.batch_size,
            seq_lengths=[args.input_seq_length, args.target_seq_length],
            token_sizes=[args.input_embedding_size, args.target_token_size],
            if_testing=self.if_testing)
        num_batches = data_loader.get_num_batches()
        total_steps = num_epochs * num_batches

        # Tic
        train_start_time = time.time()
        vprint(True, "")
        with tf.Session() as sess:
            # Add all the variables to the registration list of variables to be saved
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
            if not args.continue_training:
                # Initialize all variables in the computational graph
                # r0.11 or earlier: sess.run(tf.initialize_all_variables())
                sess.run(tf.global_variables_initializer())
            else:
                # Access the checkpoint file
                ckpt = tf.train.get_checkpoint_state(
                    checkpoint_dir=self.directory, latest_filename=None)
                if ckpt is None or not ckpt.model_checkpoint_path:
                    raise ValueError("No checkpoint found in " + self.directory)
                saver.restore(sess=sess, save_path=ckpt.model_checkpoint_path)
                print(ckpt.model_checkpoint_path)

            train_loss = 0.0
            # For each epoch
            for e in range(num_epochs):
                # Reset data loader so that it reads from the beginning.
                data_loader.reset()
                print(args.continue_training)
                vprint(args.continue_training,
                       "\033[1;mContinued training\033[0;m", color="MAG")
                vprint(True,
                       "\033[1;mStepped in epoch e = " + str(e + 1) + "\033[0;m",
                       color="MAG")
                # Assign the learning rate (decayed according to the epoch number)
                sess.run(tf.assign(self.model.lr,
                                   args.learning_rate * (decay_rate ** e)))
                # Get the initial state of the encoder
                state = sess.run(self.model.initial_state)

                # For each batch in this epoch
                for b in range(num_batches):
                    global_step = e * num_batches + b + 1
                    vprint(True,
                           "Stepped in epoch = " + str(e + 1)
                           + ", batch b = " + str(b + 1),
                           color="MAG")
                    # Tic
                    batch_start_time = time.time()
                    # Get the input (x) and target (y) data of the current batch
                    vprint(True, "Getting batch.. b = " + str(b + 1), color="MAG")
                    # x: input batch. It is a list of length batch_size, each element of which
                    #    is of size input_seq_length x input_embedding_size
                    # y: target batch. It is a list of length batch_size, each element of which
                    #    is of size target_seq_length x target_token_size (=1)
                    # yl: target sequences' lengths. It is a list of length batch_size,
                    #    each element of which is an integer.
                    x, y, yl = data_loader.next_batch()
                    vprint(True, "Got batch. Run the session...", color="MAG")
                    # Feed the input and target data and the initial cell state
                    feed = {self.model.input_data: x,
                            self.model.target_data: y,
                            self.model.target_lens_list: yl,
                            self.model.initial_state: state}
                    # Run one training step and fetch the loss on this batch.
                    # Best effort: a failing batch is reported and skipped; the
                    # previous train_loss is then the one that gets logged.
                    try:
                        _, train_loss = sess.run(
                            [self.model.train_op, self.model.cost],
                            feed_dict=feed)
                    except Exception as exception_msg:
                        vprint(True, "sess.run() runtime error.", color="RED")
                        print(exception_msg)
                    # Toc
                    batch_end_time = time.time()
                    # Print something and write to log
                    log_entry = "epoch {}/{}, global step number {}/{}, \n train_loss = {:.5f}, \n time/batch = {:.3f} s \n".format(
                        e + 1, num_epochs, global_step, total_steps,
                        train_loss, batch_end_time - batch_start_time)
                    reduced_log_entry = "{} {} {} {} {:.5f}\n".format(
                        e + 1, num_epochs, global_step, total_steps,
                        train_loss)
                    # Print on screen
                    vprint(True, log_entry, color=None)
                    # Append to log.txt and reduced_log.txt.
                    log.write(log_entry)
                    reduced_log.write(reduced_log_entry)

                # Save the model after each epoch
                checkpoint_path = os.path.join(self.directory, 'model.ckpt')
                time_stamp_integer = int(time.time())
                saver.save(sess, checkpoint_path,
                           global_step=time_stamp_integer)
                saved_message = "Saved to {}".format(
                    checkpoint_path + "-" + str(time_stamp_integer))
                print(saved_message)
                # Newline added so the next log entry starts on its own line.
                log.write(saved_message + "\n")

        train_end_time = time.time()
        vprint(True,
               "\033[1;m" + "\nTraining finished. Time used: "
               + str(train_end_time - train_start_time) + " seconds" + "\033[0;m",
               color="CYAN")
        log.write("Training finished.\n")
def run(self, inputs, chain_length, initial_state, cell_input_size=None,
        feed_previous=False, loop_func=None, verbose=False):
    '''
    Link this RNN chain into the computational graph.

    Params:
        inputs: list of Tensors, each element is the input of one cell. Must
                not be None. If feed_previous is true its content is ignored:
                the chain is unrolled for chain_length steps instead.
        chain_length: length of this RNN chain when feed_previous is true.
                When feed_previous is false it is unused (can be None) and the
                chain has one cell per element of inputs.
        initial_state: the initial state handed to the first cell.
        cell_input_size: required when feed_previous is true: a two-integer
                list or tuple [batch_size, input_vector_size], the shape of the
                all-zero input given to the first cell. Unused otherwise.
        feed_previous: if true, a cell's input is the previous cell's output
                processed by loop_func, except the first cell, whose input is a
                zero tensor of shape cell_input_size.
        loop_func: a function that converts a cell's output into the next
                cell's input; required when feed_previous is true.
        verbose: verbosity flag
    Returns:
        outputs: list with one tensor per cell, the cells' immediate outputs.
                 NOTE NOT converted to yhat
        final_state: the cell state in the end of this cell segment
    Raises:
        ValueError: inputs is None, or feed_previous is true and loop_func,
                    chain_length or cell_input_size is missing or malformed.
    '''
    # All argument checks come first, before anything is added to the graph.
    if inputs is None:
        raise ValueError("RNNChain::run()'s inputs should not be None")
    if feed_previous:
        if loop_func is None:
            raise ValueError(
                "feed_previous is True, but loop_func is not given")
        if chain_length is None:
            raise ValueError(
                "feed_previous is True, but chain_length is not given")
        if (not isinstance(cell_input_size, (list, tuple))
                or len(cell_input_size) != 2):
            raise ValueError(
                "cell_input_size should be a two-integer list, [batch_size, input_vector_size]"
            )
        # This is a hack. Reassign inputs: only the number of steps matters,
        # each cell's real input is produced inside the loop below.
        inputs = range(chain_length)

    cell = self.cell
    scope = self.scope
    vprint(
        verbose,
        "\n[INFO] an rnn_segement.run() is linked into the computational graph in scope "
        + scope,
        color="g")

    outputs = []
    with tf.variable_scope(scope):
        cell_state = initial_state
        vprint(verbose, self.get_info_str(cell_state))

        prev_cell_output_yhat = None
        if feed_previous:
            # TODO: (improve) assume the input to the first cell is a tensor of zero, for now
            prev_cell_output_yhat = tf.zeros(list(cell_input_size))

        for i, cell_input in enumerate(inputs):
            if feed_previous:
                # in this case, ignore cell_input's value and reassign it
                cell_input = prev_cell_output_yhat
            if i > 0:
                # the cell is reused: declare variable reuse from its second use on
                tf.get_variable_scope().reuse_variables()

            vprint(verbose,
                   "\n[INFO] BEFORE CELL " + scope + "\n" + str(i)
                   + " cell_input: Tensor "
                   + str(cell_input.get_shape().as_list()),
                   color="b")
            vprint(verbose, self.get_info_str(cell_state), color="b")

            # cell_output: batch_size x hidden_size, cell_state's dimension depends on cell's type
            cell_output, cell_state = cell(cell_input, cell_state)

            vprint(verbose,
                   "[INFO] AFTER CELL " + scope + "\n" + str(i)
                   + " cell_output: Tensor "
                   + str(cell_output.get_shape().as_list()),
                   color="cyan")
            vprint(verbose, self.get_info_str(cell_state), color="cyan")

            # append the cell's output to the output sequence
            outputs.append(cell_output)
            if feed_previous:
                prev_cell_output_yhat = loop_func(cell_output)
                vprint(verbose,
                       "\n[INFO] PREV_CELL_OUTPUT " + scope + "\n" + str(i)
                       + " prev_cell_output_yhat: "
                       + str(prev_cell_output_yhat.get_shape().as_list()),
                       color="YELLOW")

    # the last cell's state is the final state
    final_state = cell_state
    # NOTE "outputs" are the cells' immediate outputs, not converted to yhat.
    self.outputs = outputs
    self.final_state = final_state
    return outputs, final_state
'-v', '--verbose', action='store_true', help='only set to true when you want verbosity') # Do NOT Touch # Continuing training flag, default to False parser.add_argument( '-c', '--continue_training', action='store_true', help='if set, then continue training from the previous checkpoint' ) # Do NOT Touch # Parse the arguments, and construct the model args = parser.parse_args() # You may perform testing without actually loading data by calling: python run.py -t vprint(True, "run.py -- arg.test = " + str(args.test), color="CYAN") if args.test == True: # Test the code using artificial numbers, without actually loading the data. ############################## RNN Configuration ############################## args.hidden_size = 16 args.num_layers = 2 ################################## Data Info ################################## args.input_seq_length = 21 args.target_seq_length = 23 args.input_embedding_size = 16 args.output_vocab_size = 150 ############################### Trainer Settings ############################## args.epochs = 5 args.batch_size = 8 args.grad_clip = 15 args.learning_rate = 0.05
def __init__(self, args, training=True):
    '''
    Initialization function for the class Model.
    Builds the four LSTM cells (word/sentence encoder and decoder), wraps them
    in RNNChain instances, creates the output projection and the placeholders.

    Params:
        args: contains arguments required for the Model creation --
            args.hidden_size (i.e. the size of hidden state)
            args.num_layers (a positive integer; 1 means no stacking),
            args.input_seq_length,
            args.target_seq_length,
            args.input_embedding_size,
            args.output_vocab_size,
            args.target_token_size (=1, target token is target word's index)
            args.input_num_sent (second dimension of the target_lens_list placeholder),
            args.batch_size (i.e. the number of sequences in each batch;
                             overwritten with 1 when training is false),
            args.optimizer_choice,
            args.learning_rate,
            args.grad_clip
            args.test
            args.verbose
        training: indicates whether this is a training session
    Returns:
        None
    Raises:
        ValueError: if args.num_layers is not a positive integer.

    NOTE Each cell's input is batch_size x 1 x input_embedding_size
    NOTE Each cell's output is batch_size x 1 x hidden_size (needs to be converted)
    '''
    if not training:
        # NOTE this mutates the caller's args object
        args.batch_size = 1

    # Store the arguments, and print the important argument values
    self.args = args
    verbose = self.args.verbose

    # Validate before any graph construction, and check the type first so that
    # a non-integer never reaches the numeric comparison.
    num_layers = self.args.num_layers
    if not isinstance(num_layers, int) or num_layers <= 0:
        raise ValueError("Specified number of layers is non-positive or is not an integer.")

    # NOTE(review): the banner below still names VanillaLSTMTransModel although
    # this initializer builds "hierLSTM_*" chains -- confirm the intended text.
    summary_lines = [
        "VanillaLSTMTransModel initializer is called..",
        "Time: " + time.ctime(),
        " args.hidden_size (H) = " + str(self.args.hidden_size),
        " args.input_embedding_size (Di) = " + str(self.args.input_embedding_size),
        " args.output_vocab_size (Vo) = " + str(self.args.output_vocab_size),
        " args.num_layers = " + str(self.args.num_layers),
        " args.optimizer_choice = " + self.args.optimizer_choice,
        " args.learning_rate = " + str(self.args.learning_rate),
        " args.grad_clip = " + str(self.args.grad_clip),
    ]
    print("\n".join(summary_lines) + "\n")

    if training:
        print("This is a training session..")
    else:
        print("This is a session other than training..")
    print("Input batch size = " + str(self.args.batch_size) + "\n")

    def make_lstm_cell():
        # LSTMCell (unlike BasicLSTMCell) accepts an `initializer`, which is
        # what the xavier initializer argument requires. The previous
        # BasicLSTMCell call also misspelled `state_is_tuple`.
        return tf.nn.rnn_cell.LSTMCell(num_units=self.args.hidden_size,
                                       initializer=tf.contrib.layers.xavier_initializer(),
                                       state_is_tuple=True)

    def make_recurrent_unit():
        # Single layer: a plain LSTM cell. Several layers: a stack in which
        # every layer is its own cell object (a list built with `[cell] * n`
        # would put the very same object in every layer).
        if num_layers == 1:
            return make_lstm_cell()
        layers = [make_lstm_cell() for _ in range(num_layers)]
        return tf.nn.rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    # convert cell's outputs (batch_size x hidden_size for each cell) to batch_size x output_vocab_size
    # y_hat = softmax(tf.matmul(cell_output, output_ws)); there is no bias term, for now
    with tf.variable_scope("vanLSTM_decoder/decoder_accessory"):
        self.output_ws = tf.get_variable("output_ws",
                                         [self.args.hidden_size, self.args.output_vocab_size])
        output_affine_map_lambda = lambda cell_output_: tf.matmul(cell_output_, self.output_ws)
        output_converter_lambda = lambda cell_output_: tf.nn.softmax(
            logits=output_affine_map_lambda(cell_output_), dim=-1)  # -1: last
        self.output_affine_map_lambda = output_affine_map_lambda
        self.output_converter_lambda = output_converter_lambda

    # Multi-layer RNN construction, if more than one layer
    if num_layers >= 2:
        vprint(True, "Stacked RNN: number of layers = " + str(self.args.num_layers))
    word_encoder_cell = make_recurrent_unit()
    sent_encoder_cell = make_recurrent_unit()
    sent_decoder_cell = make_recurrent_unit()
    word_decoder_cell = make_recurrent_unit()
    # TODO: (improve) Dropout layer can be added here

    # Store the recurrent unit
    self.word_encoder_cell = word_encoder_cell
    self.sent_encoder_cell = sent_encoder_cell
    self.sent_decoder_cell = sent_decoder_cell
    self.word_decoder_cell = word_decoder_cell

    # Create encoder and decoder RNNChain instances
    self.word_encoder = RNNChain(self.word_encoder_cell,
                                 name="hierLSTM_word_encoder",
                                 scope="hierLSTM_word_encoder")
    self.sent_encoder = RNNChain(self.sent_encoder_cell,
                                 name="hierLSTM_sent_encoder",
                                 scope="hierLSTM_sent_encoder")
    # NOTE(review): the scope below is spelled "heir..." while its name is
    # "hier...". It is left untouched because variable names depend on it.
    self.sent_decoder = RNNChain(self.sent_decoder_cell,
                                 name="hierLSTM_sent_decoder",
                                 scope="heirLSTM_sent_decoder")
    self.word_decoder = RNNChain(self.word_decoder_cell,
                                 name="hierLSTM_word_decoder",
                                 scope="hierLSTM_word_decoder")

    # Input data contains sequences of input tokens of input_embedding_size dimension
    self.input_data = tf.placeholder(
        tf.float32, [None, self.args.input_seq_length, self.args.input_embedding_size])
    # Target data contains sequences of output tokens of target_token_size dimension (=1)
    self.target_data = tf.placeholder(
        tf.int32, [None, self.args.target_seq_length, self.args.target_token_size])
    # Target lengths: one row of input_num_sent integers per sequence in the batch
    self.target_lens_list = tf.placeholder(tf.int32, [None, self.args.input_num_sent])

    # Learning rate
    self.lr = tf.Variable(self.args.learning_rate, trainable=False, name="learning_rate")

    # Initial cell state of LSTM (initialized with zeros)
    # TODO: (improve) might use xavier initializer?
    self.initial_word_state = word_encoder_cell.zero_state(batch_size=self.args.batch_size,
                                                           dtype=tf.float32)
    self.initial_sent_state = sent_encoder_cell.zero_state(batch_size=self.args.batch_size,
                                                           dtype=tf.float32)

    # Preprocessing the information got from placeholders.
    # First, target_lens_list does not need any further actions.
    target_lens_list = self.target_lens_list
def __init__(self, args, model_choice="V", test_batch_size=1):
    '''
    Instantiate a tester: rebuild the chosen model from its pickled arguments
    and restore its parameters from the checkpoint in the model's run directory.

    Params:
        args: contains arguments required for the model rebuilding.
        model_choice: specify the choice of model, default to "VanillaLSTMTransModel".
            Either the full class name or its abbreviation is accepted:
            V:  VanillaLSTMTransModel
            AV: AttenVanillaLSTMTransModel
            H:  HierLSTMTransModel, i.e. Hierarchical LSTM Model
            AH: AttenHierLSTMTransModel, i.e. Hierarchical LSTM Model with Attention
        test_batch_size: the size of batch in testing (not necessarily the same as training batch size),
            default to 1.
    Returns:
        None.
    Raises:
        ValueError: if model_choice is not supported, if args.pkl cannot be
            loaded from the run directory, or if no checkpoint is found there.
    '''
    self.args = args
    self.args.model_choice = model_choice
    self.test_batch_size = test_batch_size

    # Normalize the choice to its abbreviation once; everything below keys on it.
    if model_choice in ("VanillaLSTMTransModel", "V"):
        model_abbr = "V"
    elif model_choice in ("AttenVanillaLSTMTransModel", "AV"):
        model_abbr = "AV"
    elif model_choice in ("HierLSTMTransModel", "H"):
        model_abbr = "H"
    elif model_choice in ("AttenHierLSTMTransModel", "AH"):
        model_abbr = "AH"
    else:
        raise ValueError("Model choice: " + str(model_choice) + " is not supported")

    self.directory = "../RUN_" + model_abbr
    args_path = os.path.join(self.directory, 'args.pkl')
    try:
        # NOTE pickle.load executes code embedded in the file; args.pkl must
        # come from a trusted training run. Pickles are binary, hence 'rb'.
        with open(args_path, 'rb') as f:
            saved_args = pickle.load(f)
        saved_args.batch_size = self.test_batch_size
    except Exception as err:
        # Keep raising ValueError for callers, but report the real path and cause.
        raise ValueError(
            "This model is either not trained, damaged, or in a wrong path, which should be "
            + args_path + " (" + repr(err) + ")")

    # Instantiate a model with the saved args
    vprint(True,
           "\033[1;m" + "Testing. Rebuilding computation graph for the model..." + "\033[0;m",
           color="CYAN")
    if model_abbr == "V":
        model = VanillaLSTMTransModel(saved_args, training=False)
    elif model_abbr == "AV":
        model = AttenVanillaLSTMTransModel(saved_args, training=False)
    elif model_abbr == "H":
        model = HierLSTMTransModel(saved_args, training=False)
    else:  # "AH" -- the only remaining validated choice
        model = AttenHierLSTMTransModel(saved_args, training=False)
    self.model = model

    # Get the checkpoint file to load the model. Done before a session is
    # opened so that a missing checkpoint does not leave a session behind.
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir=self.directory, latest_filename=None)
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise ValueError("No checkpoint was found in " + self.directory)

    # Instantiate a TensorFlow interactive session
    sess = tf.InteractiveSession()
    # Initiate a TensorFlow saver
    saver = tf.train.Saver(tf.global_variables())

    # Load the model parameters into session
    vprint(True, "\033[1;m" + "Loading the model parameters..." + "\033[0;m", color="CYAN")
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Link the session to Tester.
    self.sess = sess
def test(self, printout=False, if_testing=False):
    '''
    Read in a sequence and output a sequence.
    Runs the test batches through the restored model, and writes the true
    labels, the predicted token indices and the losses into the run directory
    (test_true_label.txt, test_results.txt and test_log.txt respectively).

    Params:
        printout: currently unused.
        if_testing: currently unused; the data loader is configured from
            self.args.test instead.
    Returns:
        None. self.total_loss, self.loss_on_each_seq and self.num_batches
        are updated as a side effect.
    '''
    # For this testing session
    self.total_loss = 0.0
    self.loss_on_each_seq = []

    # Acquire the model and session
    model = self.model
    sess = self.sess

    data_loader = Dataloader(batch_size=self.test_batch_size,
                             seq_lengths=[self.args.input_seq_length,
                                          self.args.target_seq_length],
                             token_sizes=[self.args.input_embedding_size,
                                          self.args.target_token_size],
                             usage="test",
                             if_testing=self.args.test)
    # Reset the pointers in the data loader object
    data_loader.reset()
    # self.num_batches = data_loader.get_num_batches()
    self.num_batches = 1

    results_path = os.path.join(self.directory, 'test_results.txt')
    labels_path = os.path.join(self.directory, 'test_true_label.txt')
    log_path = os.path.join(self.directory, 'test_log.txt')

    # Results and true labels are rewritten on every run, the log is appended.
    # The context managers close all three files even if a batch fails.
    with open(results_path, 'w') as rf, \
            open(labels_path, 'w') as rtf, \
            open(log_path, 'a') as lf:
        rf.write("Test log file: " + self.directory + '\n')

        for b in range(self.num_batches):
            test_start_time = time.time()

            # First, Make predictions
            vprint(True, "\nGetting batch.. b = " + str(b + 1), color="MAG")
            # x: input batch, y: target batch, yl: target sequences' lengths;
            # each has one entry per sequence of the test batch.
            x, y, yl = data_loader.next_batch()

            # Feed into the model and get out the prediction
            vprint(True, "Got batch. Making a batch of prediction...", color="MAG")
            feed = {
                model.input_data: x,
                model.target_data: y,
                model.target_lens_list: yl
            }
            # output_data is softmaxed. It is indexed as output_data[token position][sequence in batch]
            test_loss, output_data = sess.run([model.cost, model.output_data], feed_dict=feed)

            # One line of true labels per sequence of the batch
            for k in range(self.test_batch_size):
                for target_token in y[k]:
                    rtf.write(str([int(target_token)]) + " ")
                rtf.write("\n")

            print("test_loss = " + str(test_loss))
            self.loss_on_each_seq.append(test_loss)
            self.total_loss += test_loss

            # Second, document the predictions: one line per sequence of the batch
            for k in range(self.test_batch_size):
                for i, step_output in enumerate(output_data):
                    # If too long - I do not care what the output is beyond a certain length limit
                    # Allow the output to exceed a little bit - maybe 1.2 times
                    if i > yl[k] * 1.2:
                        break
                    # Distribution over the output vocabulary for this token
                    word_prob = step_output[k]
                    # sess.run already returned NumPy data, so take the argmax
                    # directly instead of adding a tf.argmax op to the graph
                    # (and evaluating it) for every single token.
                    word_index = int(word_prob.argmax())
                    rf.write(str([word_index]) + " ")
                rf.write("\n")
            rf.write("\n")

            test_end_time = time.time()
            vprint(True,
                   "time/batch = {:.3f}".format(test_end_time - test_start_time) + " s",
                   color=None)

        # NOTE that during testing, batch_size = 1. i.e. test the sequences one by one.
        average_loss = self.total_loss / self.num_batches
        vprint(True, "Total loss of this model is " + str(average_loss))
        lf.write("Total loss: " + str(average_loss) + "\n")
        lf.write("Loss on each sequence: \n")
        for seq_loss in self.loss_on_each_seq:
            lf.write(str(seq_loss) + "\n")