def createName(self, title, currentPage, maxPage, extension=".png"):
    page = utility.getZeroFillNumberString(currentPage, maxPage)
    title = re.sub(" ", "", title)
    # return os.path.join(self.destination, f"{self.prefix}{title}_{page}{self.suffix}{extension}")
    name = os.path.join(self.destination, title, f"{self.prefix}{page}{self.suffix}{extension}")
    utility.makeDir(os.path.split(name)[0])
    return name
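# --- Hedged sketch, not part of the original module --------------------------
# createName relies on two helpers from the `utility` module that are not shown
# in this file. The definitions below are assumptions reconstructed from how the
# helpers are called (zero-padded page numbering, idempotent directory creation);
# the real implementations may differ.
import os

def _sketch_getZeroFillNumberString(current, maximum):
    # Pad `current` with leading zeros to the width of `maximum`, e.g. (3, 120) -> "003".
    return str(current).zfill(len(str(maximum)))

def _sketch_makeDir(path):
    # Create the directory (and any missing parents), then return the path.
    os.makedirs(path, exist_ok=True)
    return path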
def createLogDirectories(log_dir):
    timestamp = int(time.time())
    main_dir = os.path.join(log_dir, str(timestamp))
    train_log_dir = os.path.join(main_dir, "train")
    checkpoint_dir = os.path.join(main_dir, "checkpoints")
    utility.makeDir(train_log_dir)
    utility.makeDir(checkpoint_dir)
    return {"main_dir": main_dir,
            "train_log_dir": train_log_dir,
            "checkpoint_dir": checkpoint_dir}
def generateLogDirectories(log_directory):
    summary_directory = os.path.join(log_directory, "summaries")
    train_log_directory = os.path.join(summary_directory, "train")
    valid_log_directory = os.path.join(summary_directory, "valid")
    train_model_directory = os.path.join(log_directory, "models")
    train_model_file = os.path.join(train_model_directory, 'checkpoint')
    utility.makeDir(train_log_directory)
    utility.makeDir(train_model_directory)
    utility.makeDir(valid_log_directory)
    return {'summary_directory': summary_directory,
            'train_log_directory': train_log_directory,
            'valid_log_directory': valid_log_directory,
            'train_model_directory': train_model_directory,
            'train_model_file': train_model_file}
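# Hedged usage sketch for generateLogDirectories. The log root below is illustrative;
# the keys and sub-directory names come from the function above.
def _sketch_show_log_layout(log_root="classifier_runs/example_run"):
    dirs = generateLogDirectories(log_root)
    # Created on disk, relative to log_root:
    #   summaries/train   -> dirs['train_log_directory']  (train FileWriter target)
    #   summaries/valid   -> dirs['valid_log_directory']
    #   models            -> dirs['train_model_directory']
    # Not created on disk, used as the tf.train.Saver path prefix:
    #   models/checkpoint -> dirs['train_model_file']
    return dirs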
def executeTraining(train_dataset_merged, valid_dataset_merged, num_epochs, batch_size,
                    vocabulary_size, embedding_size, num_labels, hidden_size,
                    summary_frequency, embeddings_file, log_directory, num_checkpoints=5):
    graph = tf.Graph()
    with graph.as_default():
        tg = TrainingGraph(vocabulary_size, embedding_size, num_labels, hidden_size)
        precision_tf = tf.placeholder(shape=[], dtype=tf.float32, name='precision')
        recall_tf = tf.placeholder(shape=[], dtype=tf.float32, name='recall')
        f1_tf = tf.placeholder(shape=[], dtype=tf.float32, name='f1')

        train_batch = BatchGenerator(train_dataset_merged, batch_size, num_labels)
        valid_batch = BatchGenerator(valid_dataset_merged, len(valid_dataset_merged), num_labels)

        precision_summary = tf.summary.scalar('precision_summary', precision_tf)
        recall_summary = tf.summary.scalar('recall_summary', recall_tf)
        f1_summary = tf.summary.scalar('f1_summary', f1_tf)
        stat_summary = tf.summary.merge([precision_summary, recall_summary, f1_summary])
        stat_dict = {}

        summary_directory = os.path.join(log_directory, "summaries")
        train_log_directory = os.path.join(summary_directory, "train")
        valid_log_directory = os.path.join(summary_directory, "valid")
        train_model_directory = os.path.join(log_directory, "models")
        train_model_file = os.path.join(train_model_directory, 'checkpoint')
        utility.makeDir(train_log_directory)
        utility.makeDir(train_model_directory)
        utility.makeDir(valid_log_directory)

        train_summary_writer = tf.summary.FileWriter(train_log_directory, graph=graph)

        num_iters = (len(train_dataset_merged) // batch_size) * num_epochs
        feed_dict = {}
        embeddings = utility.loadEmbeddings(embeddings_file)
        feed_dict[tg.embeddings] = embeddings
        print("Will take {} iters".format(num_iters))

        # average loss of the system as a whole
        overall_avg_loss = 0.0
        with tf.Session(graph=graph) as session:
            session.run(tf.global_variables_initializer())
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=num_checkpoints)
            for i in range(num_iters):
                batch = train_batch.next_batch()
                feed_dict[tg.inp_x], lbls_batch = createInpOutListsFromBatch(batch)
                feed_dict[tg.labels] = np.transpose(lbls_batch, [1, 0])
                chk = False
                num_classifiers_to_test = len(tg.classifiers)
                # if 1 in feed_dict[tg.labels][0]:
                #     chk = True
                #     print(feed_dict[tg.labels])
                #     dec = [hotDecode(x) for x in feed_dict[tg.labels]]
                #     print(dec)
                #     lbls = np.array(feed_dict[tg.labels])
                #     print(lbls[:, dec[0][0] - 1])
                #     break
                # break

                # Store loss and predictions for each classifier. Loss is for the overall
                # average loss of the system as a whole; labels are stored to calculate accuracy.
                classifier_ops = []
                # loss across all classifiers
                net_loss = 0.0
                train_summary = None
                for j in range(num_classifiers_to_test):
                    if j == num_classifiers_to_test - 1:
                        cl, prediction, _, train_summary = session.run(
                            [tg.classifiers[j].loss, tg.classifiers[j].prediction,
                             tg.classifiers[j].optimizer, tg.all_summaries],
                            feed_dict=feed_dict)
                    else:
                        cl, prediction, _ = session.run(
                            [tg.classifiers[j].loss, tg.classifiers[j].prediction,
                             tg.classifiers[j].optimizer],
                            feed_dict=feed_dict)
                    # print(prediction)
                    classifier_ops.append(prediction)
                    # for x in range(len(prediction)):
                    #     print(prediction[x], "--", feed_dict[tg.labels][x])
                    # print(cl)
                    # break
                    net_loss += cl
                # print(prediction)
                # average of loss across all classifiers
                # net_loss /= len(tg.classifiers)
                # print(net_loss)
                # classifier_ops[0][4] = [-1]
                # break
                overall_avg_loss += net_loss

                # classifier_ops is num_classifiers x batch_size;
                # convert it to batch x num_classifiers.
                classifier_ops = np.transpose(classifier_ops, [1, 0, 2])
                # print(classifier_ops)

                # Store the one-hot vectors in a separate variable to get rid of the third
                # dimension while generating them: sort of a concatenate for free, at the
                # cost of extra memory.
                pred_hot_vec = []
                for j in range(len(classifier_ops)):
                    pred_hot_vec.append(hotEncodeDistribution(classifier_ops[j]))
                # memory cleanup, a bit aggressive
                del classifier_ops

                train_summary_writer.add_summary(train_summary, i)
                # print(pred_hot_vec)
                # print("labels")
                # print(feed_dict[tg.labels])
                # break

                f1, precision, recall = get_accuracy(pred_hot_vec, lbls_batch)
                stat_dict[precision_tf] = precision
                stat_dict[recall_tf] = recall
                stat_dict[f1_tf] = f1
                pre, rec, ef1, ss = session.run([precision_tf, recall_tf, f1_tf, stat_summary],
                                                feed_dict=stat_dict)
                train_summary_writer.add_summary(ss, i)
                print("step {}/{}: loss: {}, f1: {}".format(i, num_iters, net_loss, f1))

                if i % summary_frequency == 0:
                    save_loc = saver.save(session, train_model_file, global_step=i)
                    print("Saving model at {}".format(save_loc))
                    print((len(pred_hot_vec), len(pred_hot_vec[0])),
                          (len(lbls_batch), len(lbls_batch[0])))
                    for j in range(len(pred_hot_vec)):
                        print(pred_hot_vec[j], "--", lbls_batch[j][:num_classifiers_to_test])
                    print()
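# --- Hedged sketch, not part of the original module --------------------------
# hotEncodeDistribution is called above on one example's stack of per-classifier
# prediction distributions (after the transpose, a num_classifiers x num_outputs
# array) but is not defined in this file. One plausible reading, based on how the
# result is compared element-wise against lbls_batch[j][:num_classifiers_to_test],
# is to collapse each per-classifier distribution to a hard argmax decision; the
# real helper may differ.
import numpy as np

def _sketch_hotEncodeDistribution(distributions):
    # (num_classifiers, num_outputs) -> flat vector of one hard decision per classifier.
    return np.argmax(np.asarray(distributions), axis=1)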
parser = argparse.ArgumentParser()
parser.add_argument("--embedding-file", help="embeddings txt file to read from", required=True)
args = parser.parse_args()

res = loadDataset("../data/nodes.csv", "../data/groups.csv", "../data/group-edges.csv")
dataset = res['node2labels']
nodes = res['nodes']
labels = res['labels']

split_ratio = 0.75
random.shuffle(dataset)
splitBorder = int(len(dataset) * split_ratio)
train_dataset = dataset[:splitBorder]
valid_dataset = dataset[splitBorder:]
train_dataset_merged = collectNodesAndLabels(train_dataset)
valid_dataset_merged = collectNodesAndLabels(valid_dataset)

num_epochs = 1
batch_size = 5
hidden_size = 50
embedding_size = 128
summary_frequency = 10

# print(batch2string(train_batch.next_batch()))
# t = TrainingGraph(len(nodes), 128, len(labels), 15)

timestamp = int(time.time())
log_directory = os.path.join("classifier_runs", str(timestamp))
utility.makeDir(log_directory)
write_metadata = os.path.join(log_directory, "metadata.txt")
writeMeta(write_metadata, args.embedding_file, hidden_size)

executeTraining(train_dataset_merged, valid_dataset_merged, num_epochs, batch_size,
                len(nodes), embedding_size, len(labels), hidden_size,
                summary_frequency, args.embedding_file, log_directory)
# print(t.classifiers[0].loss)
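# Example invocation (the script and embeddings file names are illustrative only):
#   python train_classifier.py --embedding-file ../data/embeddings.txt
# Each run writes metadata.txt plus the TensorBoard summaries and checkpoints produced
# by executeTraining under classifier_runs/<unix timestamp>/.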
def setDestination(self, entry, dest):
    self.destination = self.path
    if dest != "":
        self.destination = utility.makeDir(dest)
configfile = os.path.join(args.model_directory, 'classify_config.txt')
config_dict = ast.literal_eval(readFile(configfile))
config = utility.ConfigProvider()
config.setDict(config_dict)
embedding_size = config.getOption('embedding_size')
model_directory = os.path.join(args.model_directory, 'models')

timestamp = int(time.time())
log_dir = os.path.join(args.log_dir, str(timestamp))
eval_log_dir = os.path.join(log_dir, 'eval_summaries')

nodeToLabel = None
if args.eval_file is not None:
    nodeToLabel = readLabelsFile(args.eval_file, args.eval_file_delim)

utility.makeDir(eval_log_dir)
meta_file_path = os.path.join(log_dir, "meta.txt")
classifier.writeMeta(meta_file_path, num_nodes=vocabulary_size, num_labels=num_labels,
                     hidden_size=hidden_size, embedding_size=embedding_size,
                     learned_from=args.model_directory, eval_file=args.eval_file)

graph = tf.Graph()
with graph.as_default():
    tg = classifier.TrainingGraph(vocabulary_size, embedding_size, num_labels, hidden_size)
    f1_tf = tf.placeholder(shape=[], name='f1', dtype=tf.float32)
    prec_tf = tf.placeholder(shape=[], name='precision', dtype=tf.float32)
    rec_tf = tf.placeholder(shape=[], name='recall', dtype=tf.float32)
    avg_f1_tf = tf.placeholder(shape=[], name='avg_f1', dtype=tf.float32)
    avg_prec_tf = tf.placeholder(shape=[], name='avg_prec', dtype=tf.float32)
    avg_rec_tf = tf.placeholder(shape=[], name='avg_rec', dtype=tf.float32)
    f1_summary = tf.summary.scalar("f1_summary", f1_tf)
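# --- Hedged sketch (comments only, not part of the original script) ----------
# readLabelsFile is used above to build nodeToLabel from a delimited evaluation
# file but is not defined in this excerpt. A minimal assumed parser would map
# each node id to the label ids that follow it on the same line, e.g.:
#
#   def readLabelsFile(path, delimiter):
#       node_to_label = {}
#       with open(path) as fh:
#           for line in fh:
#               parts = line.strip().split(delimiter)
#               if parts and parts[0]:
#                   node_to_label[parts[0]] = parts[1:]
#       return node_to_label
#
# The real implementation (header handling, type conversion) may differ.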
def createName(self, entry):
    number = utility.getZeroFillNumberString(self.createCount, self.num)
    name = os.path.join(self.destination, os.path.basename(entry.parent),
                        f"{self.prefix}{os.path.basename(entry.parent)}_{number}{self.suffix}{entry.suffix}")
    utility.makeDir(os.path.split(name)[0])
    self.createCount += 1
    return name
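# Illustrative call (all values are assumptions; entry is presumed to behave like a
# pathlib.Path, and getZeroFillNumberString is presumed to zero-pad createCount to the
# width of self.num):
#   with destination="out", prefix="", suffix="", num=120, createCount=7 and
#   entry = pathlib.Path("albums/holiday/img.jpg"),
#   createName(entry) would create "out/holiday" if missing and return
#   "out/holiday/holiday_007.jpg".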