def genflow(emb_path, emb_format, first_n):

    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)

    # Take the first n most frequent word vectors for a subset.
    # Set to 0 to take the entire embedding.
    first_n = 0

    # Preprocess.
    vectors_matrix, label_df = process_embedding(emb_path, emb_format,
                                                 first_n, None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Number of rows in the embedding.
    num_inputs = shape[0]
    num_outputs = num_inputs

    # Dimensionality of the embedding file.
    num_hidden = shape[1]

    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)
    new_emb_path = str(os.path.join(parent,
                                    "random__source--" + source_name +
                                    "__" + timestamp + ".bin"))
    print("Writing to: ", new_emb_path)

    # RUN THE GENERATION PROCESS
    # (A hypothetical sketch of this three-argument `epoch` target follows
    # this function.)
    eval_process = mp.Process(name="eval",
                              target=epoch,
                              args=(vectors_matrix, label_df, new_emb_path))
    eval_process.start()
    eval_process.join()

    return
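
# NOTE: in this file, `epoch` is used as a process target taking only
# (vectors_matrix, label_df, new_emb_path); it is defined elsewhere in the
# codebase. Given the "random__source--..." output name above, it is assumed
# to write a random embedding with the same shape and vocabulary, roughly as
# sketched below. The function name and the uniform distribution are
# assumptions; numpy as np and pyemblib are assumed to be imported at module
# top, as in the rest of this codebase.
def _random_epoch_sketch(vectors_matrix, label_df, new_emb_path):
    # Sample a random matrix with the same shape as the source embedding.
    rand_vectors = np.random.uniform(-1, 1, vectors_matrix.shape)
    # Pair each original label with a random row and write it out.
    emb_dict = {label_df[i]: rand_vectors[i] for i in range(len(label_df))}
    pyemblib.write(emb_dict, new_emb_path, mode=pyemblib.Mode.Binary)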
def neighborflow(emb_path, model_path, batch_size, epochs, learning_rate,
                 keep_prob, num_processes):

    print_sleep_interval = 1
    model_index_path = model_path + ".index"
    retrain = True
    check_valid_file(emb_path)

    if os.path.isfile(model_index_path):
        print("There is already a model saved with this name. ")
        time.sleep(print_sleep_interval)
        sys.stdout.flush()
        retrain = False

    # Take the first $n$ most frequent word vectors for a subset.
    # Set to 0 to take the entire embedding.
    first_n = 0

    # NOTE: process_embedding takes an embedding format as its second
    # argument elsewhere in this codebase; it was missing here, and
    # Word2Vec is assumed.
    vectors_matrix, label_df = process_embedding(emb_path,
                                                 pyemblib.Format.Word2Vec,
                                                 first_n, None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Number of rows in the embedding.
    num_inputs = shape[0]
    num_outputs = num_inputs

    # Dimensionality of the embedding file.
    num_hidden = shape[1]

    print("Learning rate is: ", learning_rate)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Probability of outputting a nonzero value in the dropout layer, so the
    # input to the dropout layer goes to zero (1 - keep_prob) of the time.
    print("Dropout layer keep_prob is: ", keep_prob)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # HYPERPARAMETERS
    num_batches = num_inputs // batch_size  # floor division
    print("Defining hyperparameters: ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Epochs: ", epochs)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Batch size: ", batch_size)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Number of batches: ", num_batches)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Clears the default graph stack.
    tf.reset_default_graph()

    # PLACEHOLDER
    # "tf.float32" means the data type is a 32-bit float. The shape is
    # in the form [<rows>, <columns>], and "None" means it can be any
    # value. So this placeholder can have any number of rows, and must
    # have "num_inputs" columns.
    print("Initializing placeholder. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    X = tf.placeholder(tf.float32, shape=[None, num_inputs])

    # WEIGHTS
    print("Initializing weights. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # We use a variance scaling initializer so that it is capable of
    # adapting its scale to the shape of the weight tensors.
    initializer = tf.variance_scaling_initializer()
    input_weights = tf.Variable(initializer([num_inputs, num_hidden]),
                                dtype=tf.float32)
    output_weights = tf.Variable(initializer([num_hidden, num_outputs]),
                                 dtype=tf.float32)

    # BIAS
    input_bias = tf.Variable(tf.zeros(num_hidden))
    output_bias = tf.Variable(tf.zeros(num_outputs))

    # ACTIVATION
    act_func = tf.nn.relu

    print("Initializing layers and defining loss function. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    #===================================================================

    # LAYERS
    # The argument of act_func is a Tensor, and the variable
    # "hidden_layer" itself is also a Tensor. This hidden layer just
    # computes the element-wise relu.
    hidden_layer = act_func(tf.matmul(X, input_weights) + input_bias)

    # With probability keep_prob, outputs the input element scaled up
    # by 1 / keep_prob, otherwise outputs 0. The scaling is so that the
    # expected sum is unchanged.
    dropout_layer = tf.nn.dropout(hidden_layer, keep_prob=keep_prob)

    output_layer = tf.matmul(dropout_layer, output_weights) + output_bias

    # We define our loss function: minimize the mean absolute
    # reconstruction error.
    loss_vectors = tf.abs(output_layer - X)
    reduce_mean = tf.reduce_mean(X)  # (unused)
    loss = tf.reduce_mean(tf.abs(output_layer - X))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # UNIT NORM THE EMBEDDING
    print("Unit norming the embedding. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    norms_matrix = np.linalg.norm(vectors_matrix, axis=1)
    norms_matrix[norms_matrix == 0] = 1
    vectors_matrix = vectors_matrix / np.expand_dims(norms_matrix, -1)
    print(vectors_matrix.shape)

    # We read the numpy array "vectors_matrix" into tf as a Tensor.
    embedding_tensor = tf.constant(vectors_matrix)
    print("shape of emb_tens is: ", embedding_tensor.get_shape().as_list())
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    embedding_unshuffled = embedding_tensor
    emb_transpose_unshuf = tf.transpose(embedding_unshuffled)
    emb_transpose_unshuf = tf.cast(emb_transpose_unshuf, tf.float32)

    emb_transpose = tf.transpose(embedding_tensor)
    emb_transpose = tf.cast(emb_transpose, tf.float32)

    #===================================================================

    with open("loss_log_20K.txt", "a") as f:
        f.write("\n")
        f.write("=====================================================")
        f.write("\n")

    # This is where we'll add the dataset shuffler.
    # NOTE: tf.random_shuffle returns a shuffled copy; the result is
    # discarded here, so the embedding is not actually shuffled.
    tf.random_shuffle(embedding_tensor)

    if retrain:
        for step in tqdm(range(epochs)):
            print("this is the ", step, "th epoch.")

            # We instantiate the queues.
            seed_queue = mp.Queue()
            manager = mp.Manager()
            batch_queue = manager.Queue()

            # Each Process takes from an input queue and outputs to an
            # output queue. All batch-generation processes read from the
            # same input queue, and what they read is just an integer
            # which corresponds to an iteration.
            for iteration in tqdm(range(num_batches)):
                seed_queue.put(iteration)

            # Put in "p" halt seeds to tell the processes when to end.
            for i in range(3):
                seed_queue.put(-1)

            new_emb_path = ""

            # CREATE MATRIXMULT PROCESSES
            # (A sketch of the assumed `mkproc` helper follows this
            # function.)
            batch_args = (embedding_tensor, emb_transpose, label_df,
                          batch_size, seed_queue, batch_queue)
            print("About to start the batch processes. ")
            allprocs = [mkproc(next_batch, batch_args)
                        for x in range(num_processes)]

            # RUN THE TRAINING PROCESS
            train_process = mp.Process(
                name="train",
                target=epoch,
                args=(embedding_tensor, num_batches, step, batch_queue,
                      train, loss, loss_vectors, hidden_layer, X, init,
                      saver, model_path, new_emb_path, retrain))
            train_process.start()
            print("queue is full. ")

            # Terminate the batch-generation processes.
            for process in allprocs:
                process.terminate()

            # Then join them, i.e. wait for them to end.
            for process in allprocs:
                process.join()

            print("batch generation functions joined. ")
            train_process.join()
            print("train joined. ")

    #===================================================================

    # NOTE: Program hangs when I try to run from saved model.
    '''
    # Later, launch the model, use the saver to restore variables from
    # disk, and do some work with the model.
    with tf.Session() as sess:

        # Restore variables from disk.
        saver.restore(sess, model_path)
        print("Model restored.")

        # Check the values of the variables.
        print(embedding_tensor.shape)
        # hidden_out = hidden_layer.eval(feed_dict={X: })
        # for row in hidden_out:
        #     print(row)
    '''

    eval_batch_size = 100

    # HYPERPARAMETERS
    eval_num_batches = num_inputs // eval_batch_size  # floor division
    print("Defining hyperparameters: ")
    print("Eval batch size: ", eval_batch_size)
    print("Number of batches: ", eval_num_batches)

    # We instantiate the queues.
    seed2_queue = mp.Queue()
    batch2_queue = mp.Queue()

    # Each Process takes from an input queue and outputs to an output
    # queue. All batch-generation processes read from the same input
    # queue, and what they read is just an integer which corresponds to
    # an iteration.
    for iteration in tqdm(range(eval_num_batches)):
        seed2_queue.put(iteration)

    print("seed queue size: ", seed2_queue.qsize())

    # CREATE MATRIXMULT PROCESSES
    batch_args = (embedding_unshuffled, emb_transpose_unshuf, label_df,
                  eval_batch_size, seed2_queue, batch2_queue)
    print("About to start the batch processes. ")
    allprocs = [mkproc(next_batch, batch_args) for x in range(num_processes)]

    # The name of the embedding to save,
    # something like "~/<path>/steve.txt".
    new_emb_path = "/homes/3/whitaker.213/eleven_embedding.txt"

    # NOTE: "step" is only defined inside the training loop above; when no
    # retraining happened, default it to 0 for this single evaluation pass
    # (as trainflow does).
    if not retrain:
        step = 0

    retrain = False

    # RUN THE EVALUATION PROCESS
    eval_process = mp.Process(
        name="eval",
        target=epoch,
        args=(embedding_unshuffled, eval_num_batches, step, batch2_queue,
              train, loss, loss_vectors, hidden_layer, X, init, saver,
              model_path, new_emb_path, retrain))
    eval_process.start()
    print("queue is full. ")

    # Terminate the batch-generation processes.
    for process in allprocs:
        process.terminate()

    # Then join them, i.e. wait for them to end.
    for process in allprocs:
        process.join()

    eval_process.join()

    return
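
# NOTE: `mkproc` is defined elsewhere in this codebase. Based on how it is
# used above (the returned objects are later terminated and joined like
# multiprocessing.Process instances, and batch generation begins as soon as
# they are created), it is assumed to look roughly like the sketch below;
# the name "_mkproc_sketch" is hypothetical and only illustrates the assumed
# behavior.
def _mkproc_sketch(func, args):
    # Create and immediately start a worker process running func(*args).
    proc = mp.Process(target=func, args=args)
    proc.start()
    return proc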
import sys

import pyemblib

from preprocessing import process_embedding

if __name__ == "__main__":

    vecs, labels = process_embedding(sys.argv[1], pyemblib.Format.Word2Vec,
                                     1000000, None)

    with open('labels.txt', 'a') as the_file:
        for label in labels:
            the_file.write(label + '\n')
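
# Example invocation (the script name is hypothetical; the argument is the
# path to a Word2Vec-format embedding). It appends the labels of the first
# 1,000,000 vectors to labels.txt:
#
#     python dump_labels.py /path/to/embedding.bin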
def trainflow(emb_path, batch_size, epochs, learning_rate, keep_prob,
              num_processes):

    # Pandas behaves funny when a batch_size of 1 is used.
    assert batch_size > 1

    emb_format = pyemblib.Format.Word2Vec
    print_sleep_interval = 0.5

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    model_path = "../AE_models/" + source_name + ".ckpt"

    # Take the first n most frequent word vectors for a subset.
    # Set to 0 to take the entire embedding.
    first_n = 0

    model_index_path = model_path + ".index"
    new_emb_path = str(
        os.path.join(
            parent, "distAE-" + "__source--" + source_name + "__" +
            "time--" + timestamp + ".bin"))

    retrain = True
    check_valid_file(emb_path)

    if os.path.isfile(model_index_path):
        print("There is already a model saved with this name. ")
        time.sleep(print_sleep_interval)
        sys.stdout.flush()
        retrain = False

    # Take the first $n$ most frequent word vectors for a subset.
    # Set to 0 to take the entire embedding.
    # Set size of distance vector target
    # (i.e. dimensionality of distance vectors).
    first_n = 10000

    dist_target, useless_labels = process_embedding(emb_path, emb_format,
                                                    first_n, None)
    vectors_matrix, label_df = process_embedding(emb_path, emb_format,
                                                 0, None)

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Number of rows in the embedding.
    num_inputs = shape[0]
    num_outputs = num_inputs

    # Dimensionality of the embedding file.
    num_hidden = shape[1]

    print("Learning rate is: ", learning_rate)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Probability of outputting a nonzero value in the dropout layer, so the
    # input to the dropout layer goes to zero (1 - keep_prob) of the time.
    print("Dropout layer keep_prob is: ", keep_prob)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # HYPERPARAMETERS
    num_batches = num_inputs // batch_size  # floor division
    print("Defining hyperparameters: ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Epochs: ", epochs)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Batch size: ", batch_size)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    print("Number of batches: ", num_batches)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Clears the default graph stack.
    tf.reset_default_graph()

    # PLACEHOLDER
    # "tf.float32" means the data type is a 32-bit float. The shape is
    # in the form [<rows>, <columns>], and "None" means it can be any
    # value. So this placeholder can have any number of rows, and must
    # have "num_inputs" columns.
    print("Initializing placeholder. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # X = tf.placeholder(tf.float32, shape=[None, num_inputs])
    '''
    We used to have the above here, but we change the dimensionality of
    the distance vectors to first_n (10000, usually) so that we run
    things a bit faster. This reduces the target of our distance vector
    computation to pairwise with the first_n most frequent words.
    '''
    X = tf.placeholder(tf.float32, shape=[None, first_n])

    # WEIGHTS
    print("Initializing weights. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # We use a variance scaling initializer so that it is capable of
    # adapting its scale to the shape of the weight tensors.
    initializer = tf.variance_scaling_initializer()

    '''
    input_weights = tf.Variable(initializer([num_inputs, num_hidden]),
                                dtype=tf.float32)
    '''
    input_weights = tf.Variable(initializer([first_n, num_hidden]),
                                dtype=tf.float32)

    '''
    output_weights = tf.Variable(initializer([num_hidden, num_outputs]),
                                 dtype=tf.float32)
    '''
    output_weights = tf.Variable(initializer([num_hidden, first_n]),
                                 dtype=tf.float32)

    # BIAS
    input_bias = tf.Variable(tf.zeros(num_hidden))
    # output_bias = tf.Variable(tf.zeros(num_outputs))
    output_bias = tf.Variable(tf.zeros(first_n))

    # ACTIVATION
    act_func = tf.nn.relu

    print("Initializing layers and defining loss function. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    #===================================================================

    # LAYERS
    # The argument of act_func is a Tensor, and the variable
    # "hidden_layer" itself is also a Tensor. This hidden layer just
    # computes the element-wise relu.
    hidden_layer = act_func(tf.matmul(X, input_weights) + input_bias)

    # With probability keep_prob, outputs the input element scaled up
    # by 1 / keep_prob, otherwise outputs 0. The scaling is so that the
    # expected sum is unchanged.
    dropout_layer = tf.nn.dropout(hidden_layer, keep_prob=keep_prob)

    output_layer = tf.matmul(dropout_layer, output_weights) + output_bias

    # We define our loss function: minimize the mean absolute
    # reconstruction error.
    loss_vectors = tf.abs(output_layer - X)
    reduce_mean = tf.reduce_mean(X)  # (unused)
    loss = tf.reduce_mean(tf.abs(output_layer - X))
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train = optimizer.minimize(loss)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    # UNIT NORM THE EMBEDDING
    print("Unit norming the embedding. ")
    time.sleep(print_sleep_interval)
    sys.stdout.flush()
    norms_matrix = np.linalg.norm(vectors_matrix, axis=1)
    norms_matrix[norms_matrix == 0] = 1
    vectors_matrix = vectors_matrix / np.expand_dims(norms_matrix, -1)
    print(vectors_matrix.shape)

    # We read the numpy array "dist_target" into tf as a Tensor (we used
    # to do this with "vectors_matrix").
    # embedding_tensor = tf.constant(vectors_matrix)
    dist_target_tensor = tf.constant(dist_target)

    # Not doing this anymore due to memory constraints.
    embedding_tensor = vectors_matrix
    print("shape of emb_tens is: ", embedding_tensor.shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    embedding_unshuffled = np.copy(embedding_tensor)
    # emb_transpose_unshuf = np.transpose(embedding_unshuffled)
    # emb_transpose_unshuf = tf.cast(emb_transpose_unshuf, tf.float32)

    emb_transpose = tf.transpose(dist_target_tensor)
    emb_transpose = tf.cast(emb_transpose, tf.float32)

    #===================================================================

    with open("./logs/loss_log_" + source_name + ".txt", "w") as f:
        f.write("\n")
        f.write("=====================================================")
        f.write("\n")

    # Dataset shuffler.
    np.random.shuffle(embedding_tensor)

    if retrain:
        for step in tqdm(range(epochs)):
            print("this is the ", step, "th epoch.")

            with open("./logs/loss_log_" + source_name + ".txt", "w") as f:
                f.write("\n")
                f.write(
                    "=====================================================")
                f.write("\n")

            # We instantiate the queues.
            seed_queue = mp.Queue()
            manager = mp.Manager()
            batch_queue = manager.Queue()

            # Each Process takes from an input queue and outputs to an
            # output queue. All batch-generation processes read from the
            # same input queue, and what they read is just an integer
            # which corresponds to an iteration.
            # (A sketch of the assumed `next_batch` queue protocol follows
            # this function.)
            for iteration in tqdm(range(num_batches)):
                seed_queue.put(iteration)

            # Put in "p" halt seeds to tell the processes when to end.
            for i in range(3):
                seed_queue.put(-1)

            # CREATE MATRIXMULT PROCESSES
            batch_args = (embedding_tensor, emb_transpose, label_df,
                          batch_size, seed_queue, batch_queue)
            print("About to start the batch processes. ")
            allprocs = [mkproc(next_batch, batch_args)
                        for x in range(num_processes)]

            # RUN THE TRAINING PROCESS
            train_process = mp.Process(
                name="train",
                target=epoch,
                args=(embedding_tensor, num_batches, step, batch_queue,
                      train, loss, loss_vectors, hidden_layer, X, init,
                      saver, model_path, new_emb_path, source_name,
                      retrain))
            train_process.start()
            print("queue is full. ")

            # Join the batch-generation processes, i.e. wait for them to
            # end.
            for process in allprocs:
                process.join()

            # Then terminate them.
            for process in allprocs:
                process.terminate()

            print("batch generation functions joined. ")
            train_process.join()
            print("train joined. ")

    #===================================================================
    # THIS PORTION IS FOR SAVING THE RESULTANT EMBEDDING.
    #===================================================================

    # NOTE: Program hangs when I try to run from saved model.
    '''
    # Later, launch the model, use the saver to restore variables from
    # disk, and do some work with the model.
    with tf.Session() as sess:

        # Restore variables from disk.
        saver.restore(sess, model_path)
        print("Model restored.")

        # Check the values of the variables.
        print(embedding_tensor.shape)
        # hidden_out = hidden_layer.eval(feed_dict={X: })
        # for row in hidden_out:
        #     print(row)
    '''

    eval_batch_size = batch_size

    # HYPERPARAMETERS
    eval_num_batches = num_inputs // eval_batch_size  # floor division
    print("Defining hyperparameters: ")
    print("Eval batch size: ", eval_batch_size)
    print("Number of batches: ", eval_num_batches)

    # We instantiate the queues.
    seed2_queue = mp.Queue()
    batch2_queue = mp.Queue()

    # Each Process takes from an input queue and outputs to an output
    # queue. All batch-generation processes read from the same input
    # queue, and what they read is just an integer which corresponds to
    # an iteration.
    for iteration in tqdm(range(eval_num_batches)):
        seed2_queue.put(iteration)

    # Put in "p" halt seeds to tell the processes when to end.
    for i in range(3):
        seed2_queue.put(-1)

    print("seed queue size: ", seed2_queue.qsize())

    # CREATE MATRIXMULT PROCESSES
    batch_args = (embedding_unshuffled, emb_transpose, label_df,
                  eval_batch_size, seed2_queue, batch2_queue)
    print("About to start the batch processes. ")
    allprocs = [mkproc(next_batch, batch_args) for x in range(num_processes)]

    # The name of the embedding to save,
    # something like "~/<path>/steve.txt".
    # new_emb_path = "/homes/3/whitaker.213/eleven_embedding.txt"

    # Tells the program we want to save embedding vectors instead of
    # retraining model weights.
    retrain = False

    # First and only iteration.
    step = 0

    # RUN THE EVALUATION PROCESS
    eval_process = mp.Process(
        name="eval",
        target=epoch,
        args=(embedding_unshuffled, eval_num_batches, step, batch2_queue,
              train, loss, loss_vectors, hidden_layer, X, init, saver,
              model_path, new_emb_path, source_name, retrain))
    eval_process.start()
    print("queue is full. ")

    # Join the batch-generation processes, i.e. wait for them to end.
    for process in allprocs:
        process.join()

    # Then terminate them.
    for process in allprocs:
        process.terminate()

    eval_process.join()

    return
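
# NOTE: `next_batch` is defined elsewhere. The queue protocol it is expected
# to follow is implied by the code above: read integer iteration seeds from
# seed_queue, treat -1 as a halt seed, and push each generated batch onto
# batch_queue. The sketch below illustrates only that protocol; the real
# worker computes the distance-vector batches (the "MATRIXMULT" step), which
# is elided here, and the function name is hypothetical.
def _next_batch_sketch(embedding_tensor, emb_transpose, label_df,
                       batch_size, seed_queue, batch_queue):
    while True:
        iteration = seed_queue.get()
        if iteration == -1:
            # Halt seed: this worker is done.
            break
        start = iteration * batch_size
        rows = embedding_tensor[start:start + batch_size]
        # Placeholder payload; the real implementation would compute the
        # distance-vector targets for these rows before enqueueing.
        batch_queue.put((iteration, rows))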
def genflow(emb_path, emb_format, first_n):

    print_sleep_interval = 1
    print("checkpoint 1")
    check_valid_file(emb_path)
    sys.stdout.flush()

    source_name = os.path.splitext(os.path.basename(emb_path))[0]
    print("Source name:", source_name)
    sys.stdout.flush()

    # Take the first n most frequent word vectors for a subset.
    # Set to 0 to take the entire embedding.
    first_n = 0

    # Preprocess.
    print("About to preprocess. ")
    sys.stdout.flush()
    vectors_matrix, label_df = process_embedding(emb_path, emb_format,
                                                 first_n, None)
    print("Done preprocessing. ")
    sys.stdout.flush()

    # We get the dimensions of the input dataset.
    shape = vectors_matrix.shape
    print("Shape of embedding matrix: ", shape)
    time.sleep(print_sleep_interval)
    sys.stdout.flush()

    # Number of rows in the embedding.
    num_inputs = shape[0]
    num_outputs = num_inputs

    # Dimensionality of the embedding file.
    dim = shape[1]

    #===================================================================

    now = datetime.datetime.now()
    timestamp = now.strftime("%Y-%m-%d-%H%M")

    # The name of the embedding to save.
    parent = os.path.abspath(os.path.join(emb_path, "../"))
    check_valid_dir(parent)

    print("Is anything happening here?")
    sys.stdout.flush()

    transforms = get_config(dim)
    print("Got transforms. ")
    sys.stdout.flush()

    output_embedding_paths = []
    for i, transform in tqdm(enumerate(transforms)):
        func = transform[0]
        arglist = transform[1]
        new_emb_path = str(
            os.path.join(
                parent, "affine-" + str(i) + "__source--" + source_name +
                "__" + "time--" + timestamp + ".bin"))
        sys.stdout.flush()
        output_embedding_paths.append(new_emb_path)

        print("About to start generation.")
        sys.stdout.flush()
        transformed_vectors = func(vectors_matrix, arglist)  # shape [<num_inputs>,<dimensions>]
        print("labels shape: ", label_df.shape)
        sys.stdout.flush()

        # Creates the emb dict.
        dist_emb_dict = {}
        for j in tqdm(range(len(label_df))):
            emb_array_row = transformed_vectors[j]
            dist_emb_dict.update({label_df[j]: emb_array_row})
        sys.stdout.flush()
        print("Embedding dict created. ")
        sys.stdout.flush()

        # Saves the embedding.
        pyemblib.write(dist_emb_dict, new_emb_path,
                       mode=pyemblib.Mode.Binary)
        print("Embedding saved to: " + new_emb_path)

    # Write the output embedding names to a text file.
    outputlist_name = ("affine-outputlist__source--" + source_name +
                       "__time--" + timestamp + ".txt")
    outputlist_path = os.path.join(parent, outputlist_name)
    with open(outputlist_path, 'w') as f:
        for path in output_embedding_paths:
            f.write(path + "\n")

    return
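
# NOTE: `get_config` is imported from elsewhere. From the way `transforms`
# is consumed above (each entry unpacks to a transform function and an
# argument list, applied as func(vectors_matrix, arglist), with outputs
# saved under an "affine-" prefix), it is assumed to return a list of
# (function, arglist) pairs describing affine transforms of the embedding
# matrix. The sketch below is hypothetical, including its name and the
# particular transforms it returns.
def _get_config_sketch(dim):
    rng = np.random.RandomState(0)

    def affine(matrix, arglist):
        # arglist = [A, b]: map each row x to xA + b.
        A, b = arglist
        return matrix @ A + b

    return [
        (affine, [np.eye(dim), np.zeros(dim)]),              # identity
        (affine, [rng.standard_normal((dim, dim)), np.zeros(dim)]),  # random linear map
    ]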