def main():
    opt = parse_option()
    train_loader, n_data = set_loader(opt)
    model = set_model(opt, train_loader, n_data)
    optimizer, lr_scheduler = set_optimizer(opt, model, len(train_loader))
    writer = SummaryWriter(logdir=opt.tb_folder)

    for epoch in range(1, opt.epochs + 1):
        end = time.time()
        loss_byol = train(train_loader, model, optimizer, lr_scheduler, epoch, opt)
        print('epoch {}, total time {:.2f}s'.format(epoch, time.time() - end))

        writer.add_scalar('train loss', loss_byol, epoch)
        writer.add_scalar('learning_rate', lr_scheduler.get_lr(), epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model.model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model.model, optimizer, opt, opt.epochs, save_file)
def get_we_matrix_wi_encode_docs_w_fasttext(ftext_model_path, docs_text_main_folder, encoded_out_folder_docs):
    f = load_model(ftext_model_path)
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(docs_text_main_folder)):
        fp = os.path.join(docs_text_main_folder, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(open(fp, 'r').readlines())

    stoplist = load_indri_stopwords()
    print('encoding collection')
    encoded_docs_by_name = {}
    wi = {}
    we_matrix = []
    for dn, dt in tqdm(text_by_name.items()):
        tok_doc = util.tokenize(dt, stemming=False, stoplist=stoplist)
        encoded_doc = []
        for tok in tok_doc:
            if tok not in wi.keys():
                wv = f.get_word_vector(tok)
                wi[tok] = len(wi)
                we_matrix.append(wv)
            encoded_doc.append(wi[tok])
        util.save_model(encoded_doc, os.path.join(encoded_out_folder_docs, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, we_matrix
def run_model(which_task, model_params, train_iter, valid_iter, test_iter, save=False, model_file=''):
    '''
    Trains a single model with the given parameters and evaluates it on the test set.

    Takes:
        - string denoting which field is the label ("response" or "product")
        - dict of model parameters
        - train iterable of batches
        - validation iterable of batches
        - test iterable of batches
        - boolean to turn saving of the model state dict on or off
        - filename for the model state dict (in the models subdirectory)
    '''
    best_model, train_time = optimize_params(model_params, train_iter, valid_iter)

    # compute loss on test set
    test_loss = best_model.evaluate(test_iter, BATCH_SIZE)
    print("Loss of best model on testing set:", test_loss)

    # save state dict of the best model
    if save:
        optimized_dict = best_model.state_dict()
        try:
            util.save_model(optimized_dict, model_file)
        except Exception as e:
            print(e)

    return best_model, train_time, test_loss
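# Illustrative usage sketch for run_model (not part of the original source); the iterator
# names and the parameter dict below are assumptions used only to show the calling convention.
#
#   params = {'hidden_dim': 128, 'lr': 1e-3}
#   best_model, train_time, test_loss = run_model('response', params, train_iter, valid_iter,
#                                                 test_iter, save=True, model_file='response_model.pt')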
def postprocess(dataset, model, noise_type, noise_ratio, folders, y_test_noisy):
    log_dir = folders['logdir']
    loss, acc = model.evaluate(dataset.x_test, dataset.y_test, verbose=0)
    print('loss:', loss, '- acc:', acc)

    # calculate similarity of the given confusion matrix and the output confusion matrix
    pred = model.predict(dataset.x_test)
    pred_int = np.argmax(pred, axis=1)
    sim = 1 - distance.cosine(pred_int, y_test_noisy)
    print('Similarity is', sim)

    # plot confusion matrix
    plot_cm(model, dataset.x_test, dataset.y_test_int(), dataset.class_names,
            log_dir + '/cm.png',
            title='acc({}), similarity({})'.format(round(acc, 3), round(sim, 2)))

    # plot accuracies and losses for all models
    base_folder = folders['logbase_nr']
    plot_overall(base_folder)

    # save variables
    np.save(log_dir + 'preds.npy', pred_int)
    save_model(model, log_dir + 'model/model')
def save_model_outputs(model, _dataset, model_path):
    npy_path = model_path + 'npy/'
    create_folders(npy_path, model_path + 'model/')
    model_soft = Model(model.input, model.get_layer('features').output)

    # save softmax predictions
    pred = model.predict(_dataset.x_train)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'train_preds.npy', pred)
    np.save(npy_path + 'train_preds_int.npy', pred_int)
    pred = model.predict(_dataset.x_test)[:, :_dataset.num_classes]
    pred_int = np.argmax(pred, axis=1)
    np.save(npy_path + 'test_preds.npy', pred)
    np.save(npy_path + 'test_preds_int.npy', pred_int)

    # save logits
    logits_train = model_soft.predict(_dataset.x_train)[:, :_dataset.num_classes]
    logits_test = model_soft.predict(_dataset.x_test)[:, :_dataset.num_classes]
    np.save(npy_path + 'train_logits.npy', logits_train)
    np.save(npy_path + 'test_logits.npy', logits_test)

    # save confusion matrices
    cm_train = plot_cm(model, _dataset.x_train, _dataset.y_train_int(),
                       _dataset.class_names, model_path + 'train_cm.png')
    cm_test = plot_cm(model, _dataset.x_test, _dataset.y_test_int(),
                      _dataset.class_names, model_path + 'test_cm.png')
    np.save(npy_path + 'train_cm.npy', cm_train)
    np.save(npy_path + 'test_cm.npy', cm_test)

    # save distance matrices
    plot_dm(model_soft, _dataset.x_train, _dataset.y_train_int(),
            _dataset.class_names, model_path + 'train_dm.png')
    plot_dm(model_soft, _dataset.x_test, _dataset.y_test_int(),
            _dataset.class_names, model_path + 'test_dm.png')

    # save model
    plot_model(model, model_path + 'model/model.png')
    save_model(model, model_path + 'model/model')
    K.clear_session()
def train():
    images, labels = process_data('./data/train-images-idx3-ubyte',
                                  './data/train-labels-idx1-ubyte')
    train_set = Mnist(images, labels)
    # train_loader = DataLoader(train_set, batch_size=64,
    #                           shuffle=True, num_workers=8, pin_memory=True)
    train_loader = DataLoader(train_set, batch_size=64, shuffle=True)

    model = Convnet()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
    aver = Averager()

    for epoch in range(1, 11):
        model.train()
        for i, batch in enumerate(train_loader, 1):
            # image, label = [_.cuda() for _ in batch]
            image, label = batch
            score = model(image)
            loss = F.cross_entropy(score, label.long())
            acc = count_acc(score, label, aver)
            print('epoch %d batch %d acc: %f' % (epoch, i, acc))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # step the LR scheduler once per epoch, after the optimizer updates
        lr_scheduler.step()
        print('epoch %d acc: %f' % (epoch, aver.item()))

    save_model(model, 'model-1')
def compute_inverted_index(coll_folder, stemming, output_file_path_ii):
    if not os.path.isfile(output_file_path_ii):
        print('computing inverted index')
        inverted_idx = {}
        sw = util.load_indri_stopwords()
        doc_n = 0
        for filename in tqdm(os.listdir(coll_folder)):
            fp = os.path.join(coll_folder, filename)
            doc_id = filename.split(r'.')[0]
            if os.path.isfile(fp):
                doc_n += 1
                d = util.tokenize(' '.join(open(fp, 'r').readlines()), stemming, stoplist=sw)
                set_w_in_doc = set(d)
                for w in set_w_in_doc:
                    if w in inverted_idx.keys():
                        inverted_idx[w].append((doc_id, d.count(w)))
                    else:
                        inverted_idx[w] = [(doc_id, d.count(w))]
        util.save_model(inverted_idx, output_file_path_ii)
    else:
        inverted_idx = util.load_model(output_file_path_ii)
    return inverted_idx
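# Usage sketch (illustrative only; the folder paths and the example term below are assumptions).
# The returned index maps each term to a postings list of (doc_id, term_frequency) tuples,
# so document frequency and term frequency can be read off directly.
#
#   ii = compute_inverted_index('data/robust/corpus', True, 'data/robust/ii_model')
#   postings = ii.get('retrieval', [])   # e.g. [('FBIS3-10082', 4), ...]
#   doc_freq = len(postings)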
def main():
    model = None
    if config.model == 'Baseline':
        model = models.Baseline()

    if config.train:
        # Can train from existing weights (unless config.restart is True,
        # the most recent weights are loaded by default)
        weights_path = None
        if not config.restart:
            weights_path = os.path.join(
                config.model_save_dir, get_recent_weights_path(config.model_save_dir))
        model.build(weights_path)
        # train(model)
        simple_train(model)
        if config.save_model:
            save_model(model, config.model_save_path, config.model_weights_save_path, ext=".h5")
    else:
        # If we only care about predicting!
        # Make sure there are trained weights (most recent will be used by default)
        weights_path = os.path.join(
            config.model_save_dir, get_recent_weights_path(config.model_save_dir))
        model.build(weights_path)
        # pred(model)
        simple_pred(model)
def train(starting_epoch, model, optimizer, scheduler, criterion, trainer, evaluator, ENV):
    for epoch in range(starting_epoch, config.epochs):
        logger.info("=" * 20 + "Training" + "=" * 20)

        # Train
        ENV['global_step'] = trainer.train(epoch, ENV['global_step'], model, optimizer, criterion)
        scheduler.step()

        # Eval
        logger.info("=" * 20 + "Eval" + "=" * 20)
        evaluator.eval(epoch, ENV['global_step'], model, torch.nn.CrossEntropyLoss())
        payload = ('Eval Loss:%.4f\tEval acc: %.2f' %
                   (evaluator.loss_meters.avg, evaluator.acc_meters.avg * 100))
        logger.info(payload)
        ENV['train_history'].append(trainer.acc_meters.avg * 100)
        ENV['eval_history'].append(evaluator.acc_meters.avg * 100)
        ENV['curren_acc'] = evaluator.acc_meters.avg * 100
        ENV['best_acc'] = max(ENV['curren_acc'], ENV['best_acc'])

        # Reset Stats
        trainer._reset_stats()
        evaluator._reset_stats()

        # Save Model
        target_model = model.module if args.data_parallel else model
        util.save_model(ENV=ENV, epoch=epoch, model=target_model, optimizer=optimizer,
                        scheduler=scheduler, filename=checkpoint_path_file)
        logger.info('Model Saved at %s', checkpoint_path_file)
    return
def pre_compute_test_fd_pwe(qbn, dbn, w, run_to_rerank, max_q_len, max_d_len, fold, output_folder):
    test_fd_fp = os.path.join(output_folder, 'test_fd_' + str(fold))
    if not os.path.isfile(test_fd_fp):
        d_t_rerank_by_query = compute_docs_to_rerank_by_query(run_to_rerank, qbn.keys())
        for q_name in tqdm(qbn.keys()):
            if os.path.isfile(test_fd_fp + '_' + q_name):
                continue
            if q_name not in d_t_rerank_by_query.keys():
                continue
            d_names = d_t_rerank_by_query[q_name]
            docs = [dbn[dn] for dn in d_names if dn in dbn.keys()]
            d_lengths = [len(d) for d in docs]
            # padded_d = [pad(d, padding_value, max_d_len) for d in docs]
            q = qbn[q_name]
            all_sim_m = []
            d_batch = []
            for d in docs:
                d_batch.append(d)
                if len(d_batch) == 16:
                    all_sim_m.extend(
                        parallel_compute_sim_matrices([q] * len(d_batch), d_batch,
                                                      max_q_len, max_d_len, w))
                    d_batch = []
            # flush the last partial batch so every document gets a similarity matrix
            if len(d_batch) > 0:
                all_sim_m.extend(
                    parallel_compute_sim_matrices([q] * len(d_batch), d_batch,
                                                  max_q_len, max_d_len, w))
            util.save_model(([len(qbn[q_name])] * len(docs), d_lengths, d_names, q_name, all_sim_m),
                            test_fd_fp + '_' + q_name)
def read_collection(coll_main_folder, output_model_path, stemming, stoplist=None):
    if not os.path.isfile(output_model_path):
        if stoplist is None:
            stoplist = util.load_indri_stopwords()
        text_by_name = {}
        print('reading files in folder')
        pool = multiprocessing.Pool(8)
        fnames_list = os.listdir(coll_main_folder)
        doc_paths_list = [os.path.join(coll_main_folder, filename) for filename in fnames_list]

        print('processing collection')
        tokenized_docs = pool.starmap(
            util.tokenize,
            [(' '.join(open(fp, 'r').readlines()), stemming, stoplist) for fp in doc_paths_list])
        for i in range(len(fnames_list)):
            text_by_name[fnames_list[i].split(r'.')[0]] = tokenized_docs[i]

        print('saving model')
        util.save_model(text_by_name, output_model_path)
    else:
        print('loading model: %s' % output_model_path)
        text_by_name = util.load_model(output_model_path)
    return text_by_name
def encode_collection(text_by_name_p, word_dict_path, encoded_out_folder):
    # word_dict_path = '/media/alberto/DATA/BaiduNetdiskDownload/data/word_dict.txt'
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(open(fp, 'r').readlines())

    print('reading word2vec model')
    encoded_docs_by_name = {}
    wi = {}
    for line in tqdm(open(word_dict_path)):
        data = line.split()
        word = data[0].strip()
        wid = int(data[1].strip())
        if word not in wi.keys():
            wi[word] = wid

    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=False, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name
def compute_docs_to_rerank(dbn, qbn, we, gt_path):
    print('computing relevant docs by query')
    rel_docs_by_query = du.get_rel_docs_by_qry(gt_path)

    print('computing document representations')
    dbn_means = {}
    for k, v in tqdm(dbn.items()):
        if len(v) == 0:
            mean = np.zeros(50)
            dbn_means[k] = mean
        else:
            mean = np.mean([we[w] for w in v], axis=0)
            dbn_means[k] = mean / np.linalg.norm(mean)

    print('computing queries representations')
    qbn_means = {}
    for k, v in tqdm(qbn.items()):
        if len(v) == 0:
            mean = np.zeros(50)
            qbn_means[k] = mean
        else:
            mean = np.mean([we[w] for w in v], axis=0)
            qbn_means[k] = mean / np.linalg.norm(mean)

    doc_names = np.array(list(dbn_means.keys()))

    print('computing rankings')
    sorted_d_names_by_query = {}
    incremental_n_rel_docs_by_query = {}
    for qn, q in tqdm(qbn_means.items()):
        if qn not in rel_docs_by_query.keys():
            continue
        dists = [-1] * len(doc_names)
        for i in range(len(doc_names)):
            dn = doc_names[i]
            bonus = np.sum([10 for w in qbn[qn] if w in dbn[dn]])
            dists[i] = np.dot(dbn_means[dn], q) + bonus
        sorted_indices = np.argsort(-np.array(dists))
        sorted_dnames = doc_names[sorted_indices]
        sorted_d_names_by_query[qn] = sorted_dnames[0:8000]
        incremental_n_rel_docs_by_query[qn] = []
        rel_cnt = 0
        for i in range(len(sorted_dnames)):
            dn = sorted_dnames[i]
            if dn in rel_docs_by_query[qn]:
                rel_cnt += 1
            incremental_n_rel_docs_by_query[qn].append(rel_cnt)

    util.save_model(sorted_d_names_by_query, 'sorted_d_names_by_query.model')
    merged_incremental_rel_cnt = np.zeros(len(doc_names))
    # util.save_model(sorted_d_names_by_query, 'sorted_d_names_by_query_w_bonus.model')
    # util.save_model(merged_incremental_rel_cnt, 'merged_incremental_rel_cnt_w_bonus.model')

    print('preparing plot data')
    for q, cnts in tqdm(incremental_n_rel_docs_by_query.items()):
        for i in range(len(cnts)):
            merged_incremental_rel_cnt[i] += cnts[i]

    out = open('log.txt', 'w')
    for i in merged_incremental_rel_cnt:
        out.write(str(i) + '\n')
    out.close()
def main():
    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()
        train_iter = tqdm(enumerate(dataset.train_iter()))
        train_iter.set_description_str('Training')

        for it, mb in train_iter:
            output = model(mb.context, mb.response)
            loss = F.binary_cross_entropy_with_logits(output, mb.label)

            loss.backward()
            # clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()

            if it > 0 and it % 1000 == 0:
                # Validation
                recall_at_ks = eval_model(model, dataset.valid_iter(), max_seq_len, max_seq_len, args.gpu)
                print('Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
                      .format(loss.data[0], recall_at_ks[0], recall_at_ks[1], recall_at_ks[4]))

        save_model(model, 'ccn_lstm')
def solve(self):
    cnf = self.cnf
    print(cnf.initlambda)
    self.ff.run_batches(self.inputs, self.targets,
                        optimizer=HessianFree(CG_iter=250, init_damping=cnf.initlambda),
                        batch_size=7500, test=self.test,
                        max_epochs=cnf.max_epoch, plotting=True)
    self.logger.info("Optimization Done")
    util.save_model(self.cnf, self.ff)
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader = set_loader(opt)
    _, val_loader = set_val_loader(opt)

    # build model and criterion
    model, classifier, criterions = set_model(opt)

    # build optimizer
    parameters = list(model.parameters()) + list(classifier.parameters())
    optimizer = set_optimizer(opt, parameters)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # build memory banks
    memory_banks = {
        'labeled': ReserviorMemory(opt.labeled_memory_capacity),
        'unlabeled': ReserviorMemory(opt.unlabeled_memory_capacity)
    }

    # training routine
    for epoch in range(1, opt.epochs + 1):
        # train for one epoch
        time1 = time.time()
        loss = train(train_loader, memory_banks, model, classifier, criterions, optimizer, epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # eval for one epoch
        if epoch % opt.val_freq == 0 or epoch == opt.epochs:
            loss, val_acc = validate(val_loader, model, classifier,
                                     criterions['CrossEntropyLoss'], opt)
            if val_acc > best_acc:
                best_acc = val_acc

        # tensorboard logger
        writer.add_scalar('loss', loss, global_step=epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch)

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    print('best accuracy: {:.2f}'.format(best_acc))

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)
def compute_training_pairs_w_queries_variations(fold_idx, coll, n_iter_per_query, gt_file, dbn, qbn,
                                                ftt_model, iwi, wi):
    model_name = 'qn_rd_nrd_pairs_w2v_gk_' + str(fold_idx) + '_' + str(coll) + '_' + str(n_iter_per_query)
    if not os.path.isfile(model_name):
        rd_b_qry = {}
        nrd_by_qry = {}
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn.keys():
                continue
            rj = int(data[3].strip())
            if qname not in rd_b_qry.keys():
                rd_b_qry[qname] = []
                nrd_by_qry[qname] = []
            if rj > 0:
                rd_b_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)

        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)
        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_b_qry.keys():
                continue

            # add training examples with the original query
            encoded_q = qbn[qn]
            tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append((encoded_q, dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))
            print('original query: ' + ' '.join([iwi[w] for w in encoded_q]))

            # add extra training examples with query variations
            for i in range(len(encoded_q)):
                # copy the query so substitutions do not alter the original encoding
                encoded_q_variation = list(encoded_q)
                curr_q_word = iwi[encoded_q[i]]
                similar_words = get_synonyms(curr_q_word, ftt_model)
                for sw in similar_words:
                    sw = util.stem(sw)
                    if sw in wi.keys() and curr_q_word != sw:
                        print('word = ' + curr_q_word + ', substitute = ' + sw)
                        encoded_q_variation[i] = wi[sw]
                        print('alternative query: ' + ' '.join([iwi[w] for w in encoded_q_variation]))
                        tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
                        tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
                        for j in range(n_iter_per_query):
                            qn_rd_nrd_pairs.append((encoded_q_variation, dbn[tmp_rdocs[j]], dbn[tmp_nrdocs[j]]))

        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(qn_rd_nrd_pairs, model_name)
    else:
        qn_rd_nrd_pairs = util.load_model(model_name)
    return qn_rd_nrd_pairs
def main():
    best_val = 0.0
    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()
        train_iter = enumerate(udc.get_iter('train'))
        if not args.no_tqdm:
            train_iter = tqdm(train_iter)
            train_iter.set_description_str('Training')
            train_iter.total = udc.n_train // udc.batch_size

        for it, mb in train_iter:
            # context, response, y, cm, rm, ql = mb
            context, response, y, cm, rm, ql, key_r, key_mask_r = mb
            output = model(context, response, cm, rm, key_r, key_mask_r)
            # output = model(context, response, cm, rm)
            # output = model(context, response)
            loss = F.binary_cross_entropy_with_logits(output, y)
            # loss = F.mse_loss(F.sigmoid(output), y)

            loss.backward()
            # print(model.conv3.grad)
            # clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()
            del (context, response, y, output)

        # Validation
        recall_at_ks = eval_model_v2(model, udc, 'valid', gpu=args.gpu, no_tqdm=args.no_tqdm)
        print('Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
              .format(loss.data[0], recall_at_ks[0], recall_at_ks[1], recall_at_ks[4]))
        recall_1 = recall_at_ks[0]

        # if epoch > 10:
        #     eval_test()

        if best_val == 0.0:
            save_model(model, model_name)
            best_val = recall_1
        else:
            if recall_1 > best_val:
                best_val = recall_1
                print("Saving model for recall@1: " + str(recall_1))
                save_model(model, model_name)
            else:
                print("Not saving, best accuracy so far: " + str(best_val))
def encode_collection_with_stemming(text_by_name_p, word_dict_path, w2v_model_path, encoded_out_folder,
                                    wi=None, word_embeddings_matrix=None):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(open(fp, 'r').readlines())

    # initialize embeddings matrix
    if word_embeddings_matrix is None:
        # read and adapt word index
        if wi is None:
            wi = {}
        wids_to_merge = {}
        for line in tqdm(open(word_dict_path)):
            data = line.split()
            word_stemmed = util.stem(data[0].strip())
            wid = int(data[1].strip())
            if word_stemmed not in wi.keys():
                wi[word_stemmed] = len(wi)
                wids_to_merge[word_stemmed] = [wid]
            else:
                wids_to_merge[word_stemmed].append(wid)

        we_size = 50
        word_embeddings_matrix = np.float32(
            np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size]))
        padding_value = np.zeros(we_size)
        word_embeddings_matrix[word_embeddings_matrix.shape[0] - 1] = padding_value

        w2v_model = load_w2v_we(w2v_model_path)
        for k, v in wi.items():
            we = np.zeros(we_size)
            summed_something = False
            for wid in wids_to_merge[k]:
                if wid in w2v_model.keys():
                    we = np.sum((we, w2v_model[wid]), axis=0)
                    summed_something = True
            if summed_something:
                we = we / np.linalg.norm(we)  # normalize new word embedding
                word_embeddings_matrix[v] = we

    encoded_docs_by_name = {}
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=True, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, word_embeddings_matrix
def compute_train_test_q_names(q_names):
    np.random.seed(0)
    if not os.path.isfile('test_q_names'):
        training_q_names = np.random.choice(q_names, 200, replace=False)
        test_q_names = [qn for qn in q_names if qn not in training_q_names]
        util.save_model(test_q_names, 'test_q_names')
        util.save_model(training_q_names, 'train_q_names')
    else:
        training_q_names = util.load_model('train_q_names')
        test_q_names = util.load_model('test_q_names')
    return training_q_names, test_q_names
def main():
    # filter_seasons = set([10])
    filter_seasons = None
    min_size = 56
    max_size = 64
    training, testing, charset = southpark.load_generative_data(
        min_size=min_size, max_size=max_size,
        filter_seasons=filter_seasons, dataset_size=500000)

    print("Dataset")
    print("  Training Size : {}".format(len(training[1])))
    print("  Testing Size  : {}".format(len(testing[1])))
    print("  Charset Size  : {}".format(len(charset)))
    print("  Charset       : {}".format(charset))
    print()

    print("Creating Model...")
    model = create_model(charset, max_size)

    batch_size = 128
    use_gpu_multi_batching = False
    if use_gpu_multi_batching:
        model = gpu_multi_batch(model, training, testing, charset, batch_size,
                                epochs=5, num_gpu_batches=1000)
    else:
        # batch_gen = batch_sample_generator(dataset, charset, batch_size)
        batch_gen = batch_generator(training, charset, batch_size)
        print("Fitting Model...")
        filepath = "script_gen_best.hdf5"
        checkpoint = ModelCheckpoint(filepath, monitor='acc', verbose=1,
                                     save_best_only=True, mode='max')
        callbacks_list = [checkpoint]
        model.fit_generator(batch_gen, len(training[1]) // batch_size, epochs=20,
                            use_multiprocessing=False, callbacks=callbacks_list, verbose=1)

    save_model("script_gen_20_epoch", model, charset, overwrite=True)
def compute_kfolds_train_test(n_folds, q_names, coll):
    if not os.path.isfile('folds_' + coll):
        folds = []
        q_names = np.array(q_names)
        kf = KFold(n_splits=n_folds, random_state=0, shuffle=True)
        for train_index, test_index in kf.split(q_names):
            q_train, q_test = q_names[train_index], q_names[test_index]
            folds.append((q_train, q_test))
        util.save_model(folds, 'folds_' + coll)
    else:
        folds = util.load_model('folds_' + coll)
    return folds
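# Usage sketch (illustrative; the collection name and query dict below are assumptions).
# Each fold is a (train_query_names, test_query_names) pair, so a cross-validation run
# simply loops over the cached list.
#
#   folds = compute_kfolds_train_test(5, list(qbn.keys()), 'robust')
#   for fold_idx, (train_qs, test_qs) in enumerate(folds):
#       ...  # train on train_qs, evaluate on test_qs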
def main():
    best_acc = 0
    opt = parse_option()

    # build data loader
    train_loader, val_loader = set_loader(opt)

    # build model and criterion
    model, criterion = set_model(opt)

    # build optimizer
    optimizer = set_optimizer(opt, model)

    # tensorboard
    writer = SummaryWriter(log_dir=opt.tb_folder, flush_secs=2)

    # training routine
    for epoch in range(1, opt.epochs + 1):
        adjust_learning_rate(opt, optimizer, epoch)

        # train for one epoch
        time1 = time.time()
        loss, train_acc = train(train_loader, model, criterion, optimizer, epoch, opt)
        time2 = time.time()
        print('epoch {}, total time {:.2f}'.format(epoch, time2 - time1))

        # tensorboard logger
        writer.add_scalar('train_loss', loss, global_step=epoch)
        writer.add_scalar('train_acc', train_acc, global_step=epoch)
        writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], global_step=epoch)

        # evaluation
        loss, val_acc = validate(val_loader, model, criterion, opt)
        writer.add_scalar('val_loss', loss, global_step=epoch)
        writer.add_scalar('val_acc', val_acc, global_step=epoch)

        if val_acc > best_acc:
            best_acc = val_acc

        if epoch % opt.save_freq == 0:
            save_file = os.path.join(
                opt.save_folder, 'ckpt_epoch_{epoch}.pth'.format(epoch=epoch))
            save_model(model, optimizer, opt, epoch, save_file)

    # save the last model
    save_file = os.path.join(opt.save_folder, 'last.pth')
    save_model(model, optimizer, opt, opt.epochs, save_file)

    print('best accuracy: {:.2f}'.format(best_acc))
def compute_data():
    ftext_model_path = '../data/fasttext_models/wiki.en.bin'
    output_path_wi_model = '../data/fasttext_models/wi_robust'
    output_path_ii_model = '../data/fasttext_models/ii_robust'
    output_path_idf_model = '../data/fasttext_models/idf_robust'
    output_path_encoded_d_model = '../data/fasttext_models/encoded_dbn'
    output_path_encoded_q_model = '../data/fasttext_models/encoded_qbn'
    output_path_we_matrix_model = '../data/fasttext_models/word_embeddings_matrix_robust'
    coll_path = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/corpus'
    queries_main_folder = '/Users/albertopurpura/ExperimentalCollections/Robust04/processed/topics'
    output_model_path = 'data/robust/stemmed_coll_model'
    encoded_out_folder_docs = 'data/robust/stemmed_encoded_docs_ft'
    stemming = True

    if not os.path.isfile(output_path_ii_model):
        print('computing inverted index')
        ii = compute_inverted_index(coll_path, stemming, output_path_ii_model)
        util.save_model(ii, output_path_ii_model)
    else:
        print('loading inverted index')
        ii = util.load_model(output_path_ii_model)

    if not os.path.isfile(output_path_encoded_d_model):
        text_dbn = read_collection(coll_path, output_model_path, stemming=stemming,
                                   stoplist=util.load_indri_stopwords())
        encoded_dbn, wi, we_matrix = compute_input_data(
            text_dbn, ftext_model_path, encoded_out_folder_docs)
        util.save_model(encoded_dbn, output_path_encoded_d_model)
        util.save_model(wi, output_path_wi_model)
        util.save_model(we_matrix, output_path_we_matrix_model)
    else:
        encoded_dbn = util.load_model(output_path_encoded_d_model)
        wi = util.load_model(output_path_wi_model)
        we_matrix = util.load_model(output_path_we_matrix_model)

    if not os.path.isfile(output_path_encoded_q_model):
        encoded_qbn = encode_queries(queries_main_folder, wi, stemming)
        util.save_model(encoded_qbn, output_path_encoded_q_model)
    else:
        encoded_qbn = util.load_model(output_path_encoded_q_model)

    idf_scores = du.compute_idf(coll_path, stemming, output_path_ii_model, output_path_idf_model)
    return encoded_dbn, encoded_qbn, we_matrix, wi, ii, idf_scores
def precompute_data(docs_proc_folder, queries_proc_folder, word_dict_path, w2v_model_path,
                    encoded_out_folder_docs, encoded_out_folder_queries, output_path_wi_model,
                    output_path_we_matrix_model, output_path_encoded_q, output_path_encoded_d,
                    run_to_rerank, gt_file):
    # docs_proc_folder = '/media/alberto/DATA/ExperimentalCollections/ny/ny/nyt_proc_albe_2'
    # queries_proc_folder = '/media/alberto/DATA/ExperimentalCollections/ny/queries/queries_proc'
    # word_dict_path = 'data/word_dict.txt'
    # w2v_model_path = 'data/embed_wiki-pdc_d50_norm'
    # encoded_out_folder_docs = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/encoded_docs'
    # encoded_out_folder_queries = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/encoded_queries'
    # output_path_wi_model = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/word_index_stemmed'
    # output_path_we_matrix_model = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/word_embeddings_matrix'
    # output_path_encoded_q = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/q_by_name'
    # output_path_encoded_d = '/media/alberto/DATA/ExperimentalCollections/ny/encoded_corpus/d_by_name_filtered'

    dbn, wi, word_embeddings_matrix = encode_collection_with_stemming(
        docs_proc_folder, word_dict_path, w2v_model_path, encoded_out_folder_docs)
    util.save_model(wi, output_path_wi_model)
    util.save_model(word_embeddings_matrix, output_path_we_matrix_model)

    qbn, wi, word_embeddings_matrix = encode_collection_with_stemming(
        queries_proc_folder, word_dict_path, w2v_model_path, encoded_out_folder_queries,
        wi, word_embeddings_matrix)

    dbn_filtered = keep_only_used_docs(gt_file, run_to_rerank, encoded_out_folder_docs)
    util.save_model(qbn, output_path_encoded_q)
    util.save_model(dbn_filtered, output_path_encoded_d)
def main():
    args = get_arguments()
    SEED = args.seed
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(SEED)

    util.mkdir('runs')
    test_acc_file = 'runs/test' + str(args.cosine) + str(args.bna) + str(args.bnd) + '.txt'
    train_acc_file = 'runs/train' + str(args.cosine) + str(args.bna) + str(args.bnd) + '.txt'
    open(test_acc_file, 'w')
    open(train_acc_file, 'w')

    print('Building model, loading data...\n')
    if args.cuda:
        torch.cuda.manual_seed(SEED)

    model, optimizer, training_generator, test_generator = initialize(args)

    best_pred_loss = 1000.0
    print('\nCheckpoint folder:', args.save, '\n\nCosine:', args.cosine,
          '\t\tBna:', args.bna, '\t\tBnd:', args.bnd, '\t\tContrastive:', args.cont,
          '\n\nStart training...\n')

    for epoch in range(1, args.nEpochs + 1):
        train_metrics = train(args, model, training_generator, optimizer, epoch)
        test_metrics, confusion_matrix, ucsd_correct_total, sars_correct_total, ucsd_test_total, sars_test_total \
            = validation(args, model, test_generator, epoch, mode='test')

        best_pred_loss = util.save_model(model, optimizer, args, test_metrics, epoch,
                                         best_pred_loss, confusion_matrix)

        print('COVID-CT Accuracy: {0:.2f}%\tSARS-Cov-2 Accuracy: {1:.2f}%\n'.format(
            100. * ucsd_correct_total / ucsd_test_total,
            100. * sars_correct_total / sars_test_total))

        with open(test_acc_file, 'a+') as f:
            f.write(str(test_metrics.data['correct'] / test_metrics.data['total']) + ' ' +
                    str(optimizer.param_groups[0]['lr']) + ' ' +
                    str(test_metrics.data['loss'] /
                        (test_metrics.data['total'] // args.batch_size + 1)) + '\n')
        with open(train_acc_file, 'a+') as f:
            f.write(str(train_metrics.data['correct'] / train_metrics.data['total']) + ' ' +
                    str(optimizer.param_groups[0]['lr']) + ' ' +
                    str(train_metrics.data['loss'] /
                        (train_metrics.data['total'] // args.batch_size + 1)) + '\n')

        adjust_learning_rate(optimizer, epoch, args)
def compute_training_pairs(fold_idx, coll, n_iter_per_query, gt_file, dbn, qbn):
    # use a single cache file name for the existence check, the save and the load
    model_name = 'qn_rd_nrd_pairs_w2v_gk' + str(fold_idx) + '_' + str(coll)
    if not os.path.isfile(model_name):
        rd_b_qry = {}
        nrd_by_qry = {}
        for line in open(gt_file):
            data = line.split()
            qname = data[0].strip()
            dname = data[2].strip()
            if dname not in dbn.keys():
                continue
            rj = int(data[3].strip())
            if qname not in rd_b_qry.keys():
                rd_b_qry[qname] = []
                nrd_by_qry[qname] = []
            if rj > 0:
                rd_b_qry[qname].append(dname)
            else:
                nrd_by_qry[qname].append(dname)

        test_q_names = list(qbn.keys())
        np.random.shuffle(test_q_names)
        qn_rd_nrd_pairs = []
        for qn in test_q_names:
            if qn not in rd_b_qry.keys():
                continue
            tmp_rdocs = np.random.choice(rd_b_qry[qn], n_iter_per_query, replace=True)
            tmp_nrdocs = np.random.choice(nrd_by_qry[qn], n_iter_per_query, replace=True)
            for i in range(n_iter_per_query):
                qn_rd_nrd_pairs.append((qbn[qn], dbn[tmp_rdocs[i]], dbn[tmp_nrdocs[i]]))

        np.random.shuffle(qn_rd_nrd_pairs)
        util.save_model(qn_rd_nrd_pairs, model_name)
    else:
        qn_rd_nrd_pairs = util.load_model(model_name)
    return qn_rd_nrd_pairs
def main():
    # Freeze VAE, only optimize retrieval model
    solver = optim.Adam(model.retrieval_params, lr=args.lr)

    for epoch in range(args.n_epoch):
        print('\n\n-------------------------------------------')
        print('Epoch-{}'.format(epoch))
        print('-------------------------------------------')

        model.train()
        train_iter = enumerate(udc.get_iter('train'))
        if not args.no_tqdm:
            train_iter = tqdm(train_iter)
            train_iter.set_description_str('Training')
            train_iter.total = udc.n_train // udc.batch_size

        for it, mb in train_iter:
            context, response, y, cm, rm = mb
            output = model.forward(context, response, cm)
            loss = F.binary_cross_entropy_with_logits(output, y)
            # loss = F.mse_loss(F.sigmoid(output), y)

            loss.backward()
            # clip_gradient_threshold(model, -10, 10)
            solver.step()
            solver.zero_grad()

        # Validation
        recall_at_ks = eval_model_v1(model, udc, 'valid', gpu=args.gpu, no_tqdm=args.no_tqdm)
        print('Loss: {:.3f}; recall@1: {:.3f}; recall@2: {:.3f}; recall@5: {:.3f}'
              .format(loss.data[0], recall_at_ks[0], recall_at_ks[1], recall_at_ks[4]))

        if epoch > 4:
            eval_test()

        save_model(model, 'GRU_VAE_pretrained')
def compare_models():
    """Compares several classifiers by performing Bayesian optimization on
    each one and then ranking the results.
    """
    train, test = load_data()
    model_configs = get_model_configs()
    results = []
    for configs in model_configs:
        best_result = hyperparam_search(configs, train, test)
        print("top 2 accuracy:", get_top_k_accuracy(best_result["model"], test, k=2))
        print(generate_confusion_matrix(best_result["model"], test))
        save_model(best_result)
        results.append(best_result)
    rank_results(results)
def ge_cmd_learn():
    args = parse_arg_learn()

    # prepare input to GE_learn
    data = GE_data()
    data.dat = util.load_data(args.data)
    data.labeled_features = util.load_labeled_features(args.labeled_features)
    init_model = GE_model()
    param = GE_param()
    if args.l2:
        param.l2_regularization = args.l2
    final_model_path = args.model
    # print data

    final_model = GE_learn(data, init_model, param)
    util.save_model(final_model, final_model_path)
    return
def train():
    args = cli()
    device = torch.device("cuda" if args.gpu else "cpu")
    print(f'Device {device}')
    model = get_img_model(args.hidden_units, args.arch)
    optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)
    model.to(device)
    trainloader, _, validationloader, class_to_idx = load_data(args.data_dir)
    _train(optimizer, args.epochs, trainloader, validationloader, device, model)
    save_model(args.save_dir, model, class_to_idx, args.hidden_units, args.arch)
def encode_coll(docs_text_path, wi, output_encoded_coll_path):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(docs_text_path)):
        fp = os.path.join(docs_text_path, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(open(fp, 'r').readlines())

    stoplist = load_indri_stopwords()
    encoded_coll_by_name = {}
    print('encoding collection')
    for tn, tt in tqdm(text_by_name.items()):
        tokenized = util.tokenize(tt, stemming=False, stoplist=stoplist)
        encoded_text = [wi[t] for t in tokenized if t in wi.keys()]
        encoded_coll_by_name[tn] = encoded_text
        util.save_model(encoded_text, os.path.join(output_encoded_coll_path, tn))
    return encoded_coll_by_name
def main(): """ Train the TensorFlow models. """ # Get hyper-parameters if os.path.exists(FLAGS.checkpoint_path) == False: os.makedirs(FLAGS.checkpoint_path) checkpoint_file_path = FLAGS.checkpoint_path + "/checkpoint.ckpt" latest_checkpoint_file_path = tf.train.latest_checkpoint( FLAGS.checkpoint_path) if os.path.exists(FLAGS.output_path) == False: os.makedirs(FLAGS.output_path) # Step 1: Construct the dataset op epoch_number = FLAGS.epoch_number if epoch_number <= 0: epoch_number = -1 train_buffer_size = FLAGS.train_batch_size * 3 validation_buffer_size = FLAGS.train_batch_size * 3 train_filename_list = [filename for filename in FLAGS.train_files.split(",")] train_filename_placeholder = tf.placeholder(tf.string, shape=[None]) if FLAGS.file_format == "tfrecords": train_dataset = tf.data.TFRecordDataset(train_filename_placeholder) train_dataset = train_dataset.map(parse_tfrecords_function).repeat( epoch_number).batch(FLAGS.train_batch_size).shuffle( buffer_size=train_buffer_size) elif FLAGS.file_format == "csv": # Skip the header or not train_dataset = tf.data.TextLineDataset(train_filename_placeholder) train_dataset = train_dataset.map(parse_csv_function).repeat( epoch_number).batch(FLAGS.train_batch_size).shuffle( buffer_size=train_buffer_size) train_dataset_iterator = train_dataset.make_initializable_iterator() train_features_op, train_label_op = train_dataset_iterator.get_next() validation_filename_list = [ filename for filename in FLAGS.validation_files.split(",") ] validation_filename_placeholder = tf.placeholder(tf.string, shape=[None]) if FLAGS.file_format == "tfrecords": validation_dataset = tf.data.TFRecordDataset( validation_filename_placeholder) validation_dataset = validation_dataset.map( parse_tfrecords_function).repeat(epoch_number).batch( FLAGS.validation_batch_size).shuffle( buffer_size=validation_buffer_size) elif FLAGS.file_format == "csv": validation_dataset = tf.data.TextLineDataset( validation_filename_placeholder) validation_dataset = validation_dataset.map(parse_csv_function).repeat( epoch_number).batch(FLAGS.validation_batch_size).shuffle( buffer_size=validation_buffer_size) validation_dataset_iterator = validation_dataset.make_initializable_iterator( ) validation_features_op, validation_label_op = validation_dataset_iterator.get_next( ) # Step 2: Define the model input_units = FLAGS.feature_size output_units = FLAGS.label_size logits = inference(train_features_op, input_units, output_units, True) if FLAGS.loss == "sparse_cross_entropy": cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=train_label_op) loss = tf.reduce_mean(cross_entropy, name="loss") elif FLAGS.loss == "cross_entropy": cross_entropy = tf.nn.cross_entropy_with_logits( logits=logits, labels=train_label_op) loss = tf.reduce_mean(cross_entropy, name="loss") elif FLAGS.loss == "mean_square": msl = tf.square(logits - train_label_op, name="msl") loss = tf.reduce_mean(msl, name="loss") global_step = tf.Variable(0, name="global_step", trainable=False) learning_rate = FLAGS.learning_rate if FLAGS.enable_lr_decay: logging.info( "Enable learning rate decay rate: {}".format(FLAGS.lr_decay_rate)) starter_learning_rate = FLAGS.learning_rate learning_rate = tf.train.exponential_decay( starter_learning_rate, global_step, 100000, FLAGS.lr_decay_rate, staircase=True) optimizer = util.get_optimizer_by_name(FLAGS.optimizer, learning_rate) train_op = optimizer.minimize(loss, global_step=global_step) # Need to re-use the Variables for training and validation 
tf.get_variable_scope().reuse_variables() # Define accuracy op and auc op for train train_accuracy_logits = inference(train_features_op, input_units, output_units, False) train_softmax_op, train_accuracy_op = model.compute_softmax_and_accuracy( train_accuracy_logits, train_label_op) train_auc_op = model.compute_auc(train_softmax_op, train_label_op, FLAGS.label_size) # Define accuracy op and auc op for validation validation_accuracy_logits = inference(validation_features_op, input_units, output_units, False) validation_softmax_op, validation_accuracy_op = model.compute_softmax_and_accuracy( validation_accuracy_logits, validation_label_op) validation_auc_op = model.compute_auc(validation_softmax_op, validation_label_op, FLAGS.label_size) # Define inference op inference_features = tf.placeholder( "float", [None, FLAGS.feature_size], name="features") inference_logits = inference(inference_features, input_units, output_units, False) inference_softmax_op = tf.nn.softmax( inference_logits, name="inference_softmax") inference_prediction_op = tf.argmax( inference_softmax_op, 1, name="inference_prediction") keys_placeholder = tf.placeholder(tf.int32, shape=[None, 1], name="keys") keys_identity = tf.identity(keys_placeholder, name="inference_keys") signature_def_map = { signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature_def_utils.build_signature_def( inputs={ "keys": utils.build_tensor_info(keys_placeholder), "features": utils.build_tensor_info(inference_features) }, outputs={ "keys": utils.build_tensor_info(keys_identity), "prediction": utils.build_tensor_info(inference_prediction_op), }, method_name="tensorflow/serving/predictss"), "serving_detail": signature_def_utils.build_signature_def( inputs={ "keys": utils.build_tensor_info(keys_placeholder), "features": utils.build_tensor_info(inference_features) }, outputs={ "keys": utils.build_tensor_info(keys_identity), "prediction": utils.build_tensor_info(inference_prediction_op), "softmax": utils.build_tensor_info(inference_softmax_op), }, method_name="sdfas") } # Initialize saver and summary saver = tf.train.Saver() tf.summary.scalar("loss", loss) if FLAGS.scenario == "classification": tf.summary.scalar("train_accuracy", train_accuracy_op) tf.summary.scalar("train_auc", train_auc_op) tf.summary.scalar("validate_accuracy", validation_accuracy_op) tf.summary.scalar("validate_auc", validation_auc_op) summary_op = tf.summary.merge_all() init_op = [ tf.global_variables_initializer(), tf.local_variables_initializer() ] # Step 3: Create session to run with tf.Session() as sess: writer = tf.summary.FileWriter(FLAGS.output_path, sess.graph) sess.run(init_op) sess.run( [ train_dataset_iterator.initializer, validation_dataset_iterator.initializer ], feed_dict={ train_filename_placeholder: train_filename_list, validation_filename_placeholder: validation_filename_list }) if FLAGS.mode == "train": if FLAGS.resume_from_checkpoint: util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path) try: start_time = datetime.datetime.now() while True: if FLAGS.enable_benchmark: sess.run(train_op) else: _, global_step_value = sess.run([train_op, global_step]) # Step 4: Display training metrics after steps if global_step_value % FLAGS.steps_to_validate == 0: if FLAGS.scenario == "classification": loss_value, train_accuracy_value, train_auc_value, validate_accuracy_value, validate_auc_value, summary_value = sess.run( [ loss, train_accuracy_op, train_auc_op, validation_accuracy_op, validation_auc_op, summary_op ]) end_time = datetime.datetime.now() 
logging.info( "[{}] Step: {}, loss: {}, train_acc: {}, train_auc: {}, valid_acc: {}, valid_auc: {}". format(end_time - start_time, global_step_value, loss_value, train_accuracy_value, train_auc_value, validate_accuracy_value, validate_auc_value)) elif FLAGS.scenario == "regression": loss_value, summary_value = sess.run([loss, summary_op]) end_time = datetime.datetime.now() logging.info("[{}] Step: {}, loss: {}".format( end_time - start_time, global_step_value, loss_value)) writer.add_summary(summary_value, global_step_value) saver.save( sess, checkpoint_file_path, global_step=global_step_value) start_time = end_time except tf.errors.OutOfRangeError: if FLAGS.enable_benchmark: logging.info("Finish training for benchmark") else: # Step 5: Export the model after training util.save_model( FLAGS.model_path, FLAGS.model_version, sess, signature_def_map, is_save_graph=False) elif FLAGS.mode == "savedmodel": if util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path) == False: logging.error("No checkpoint for exporting model, exit now") return util.save_model( FLAGS.model_path, FLAGS.model_version, sess, signature_def_map, is_save_graph=False) elif FLAGS.mode == "inference": if util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path) == False: logging.error("No checkpoint for inference, exit now") return # Load test data inference_result_file_name = FLAGS.inference_result_file inference_test_file_name = FLAGS.inference_data_file inference_data = np.genfromtxt(inference_test_file_name, delimiter=",") inference_data_features = inference_data[:, 0:9] inference_data_labels = inference_data[:, 9] # Run inference start_time = datetime.datetime.now() prediction, prediction_softmax = sess.run( [inference_prediction_op, inference_softmax_op], feed_dict={inference_features: inference_data_features}) end_time = datetime.datetime.now() # Compute accuracy label_number = len(inference_data_labels) correct_label_number = 0 for i in range(label_number): if inference_data_labels[i] == prediction[i]: correct_label_number += 1 accuracy = float(correct_label_number) / label_number # Compute auc y_true = np.array(inference_data_labels) y_score = prediction_softmax[:, 1] fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score, pos_label=1) auc = metrics.auc(fpr, tpr) logging.info("[{}] Inference accuracy: {}, auc: {}".format( end_time - start_time, accuracy, auc)) # Save result into the file np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",") logging.info( "Save result to file: {}".format(inference_result_file_name))
def main():
    if os.path.exists(FLAGS.checkpoint_path) == False:
        os.makedirs(FLAGS.checkpoint_path)
    checkpoint_file_path = FLAGS.checkpoint_path + "/checkpoint.ckpt"
    latest_checkpoint_file_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    if os.path.exists(FLAGS.output_path) == False:
        os.makedirs(FLAGS.output_path)

    # Step 1: Construct the dataset op
    epoch_number = FLAGS.epoch_number
    if epoch_number <= 0:
        epoch_number = -1
    train_buffer_size = FLAGS.train_batch_size * 3
    validation_buffer_size = FLAGS.train_batch_size * 3

    train_filename_list = [filename for filename in FLAGS.train_files.split(",")]
    train_filename_placeholder = tf.placeholder(tf.string, shape=[None])
    train_dataset = tf.data.TFRecordDataset(train_filename_placeholder)
    train_dataset = train_dataset.map(parse_tfrecords_function).repeat(
        epoch_number).batch(FLAGS.train_batch_size).shuffle(
            buffer_size=train_buffer_size)
    train_dataset_iterator = train_dataset.make_initializable_iterator()
    batch_labels, batch_ids, batch_values = train_dataset_iterator.get_next()

    validation_filename_list = [
        filename for filename in FLAGS.validation_files.split(",")
    ]
    validation_filename_placeholder = tf.placeholder(tf.string, shape=[None])
    validation_dataset = tf.data.TFRecordDataset(validation_filename_placeholder)
    validation_dataset = validation_dataset.map(parse_tfrecords_function).repeat(
    ).batch(FLAGS.validation_batch_size).shuffle(
        buffer_size=validation_buffer_size)
    validation_dataset_iterator = validation_dataset.make_initializable_iterator()
    validation_labels, validation_ids, validation_values = validation_dataset_iterator.get_next()

    # Define the model
    logits = inference(batch_ids, batch_values, True)
    batch_labels = tf.to_int64(batch_labels)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=batch_labels)
    loss = tf.reduce_mean(cross_entropy, name="loss")
    global_step = tf.Variable(0, name="global_step", trainable=False)
    if FLAGS.enable_lr_decay:
        logging.info("Enable learning rate decay rate: {}".format(FLAGS.lr_decay_rate))
        starter_learning_rate = FLAGS.learning_rate
        learning_rate = tf.train.exponential_decay(
            starter_learning_rate,
            global_step,
            100000,
            FLAGS.lr_decay_rate,
            staircase=True)
    else:
        learning_rate = FLAGS.learning_rate
    optimizer = util.get_optimizer_by_name(FLAGS.optimizer, learning_rate)
    train_op = optimizer.minimize(loss, global_step=global_step)
    tf.get_variable_scope().reuse_variables()

    # Define accuracy op for train data
    train_accuracy_logits = inference(batch_ids, batch_values, False)
    train_softmax = tf.nn.softmax(train_accuracy_logits)
    train_correct_prediction = tf.equal(tf.argmax(train_softmax, 1), batch_labels)
    train_accuracy = tf.reduce_mean(tf.cast(train_correct_prediction, tf.float32))

    # Define auc op for train data
    batch_labels = tf.cast(batch_labels, tf.int32)
    sparse_labels = tf.reshape(batch_labels, [-1, 1])
    derived_size = tf.shape(batch_labels)[0]
    indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
    concated = tf.concat(axis=1, values=[indices, sparse_labels])
    outshape = tf.stack([derived_size, FLAGS.label_size])
    new_train_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
    _, train_auc = tf.contrib.metrics.streaming_auc(train_softmax, new_train_batch_labels)

    # Define accuracy op for validate data
    validate_accuracy_logits = inference(validation_ids, validation_values, False)
    validate_softmax = tf.nn.softmax(validate_accuracy_logits)
    validate_batch_labels = tf.to_int64(validation_labels)
    validate_correct_prediction = tf.equal(
        tf.argmax(validate_softmax, 1), validate_batch_labels)
    validate_accuracy = tf.reduce_mean(tf.cast(validate_correct_prediction, tf.float32))

    # Define auc op for validate data
    validate_batch_labels = tf.cast(validate_batch_labels, tf.int32)
    sparse_labels = tf.reshape(validate_batch_labels, [-1, 1])
    derived_size = tf.shape(validate_batch_labels)[0]
    indices = tf.reshape(tf.range(0, derived_size, 1), [-1, 1])
    concated = tf.concat(axis=1, values=[indices, sparse_labels])
    outshape = tf.stack([derived_size, FLAGS.label_size])
    new_validate_batch_labels = tf.sparse_to_dense(concated, outshape, 1.0, 0.0)
    _, validate_auc = tf.contrib.metrics.streaming_auc(validate_softmax, new_validate_batch_labels)

    # Define inference op
    sparse_index = tf.placeholder(tf.int64, [None, 2])
    sparse_ids = tf.placeholder(tf.int64, [None])
    sparse_values = tf.placeholder(tf.float32, [None])
    sparse_shape = tf.placeholder(tf.int64, [2])
    inference_ids = tf.SparseTensor(sparse_index, sparse_ids, sparse_shape)
    inference_values = tf.SparseTensor(sparse_index, sparse_values, sparse_shape)
    inference_logits = inference(inference_ids, inference_values, False)
    inference_softmax = tf.nn.softmax(inference_logits)
    inference_op = tf.argmax(inference_softmax, 1)
    keys_placeholder = tf.placeholder(tf.int32, shape=[None, 1])
    keys = tf.identity(keys_placeholder)

    signature_def_map = {
        signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            signature_def_utils.build_signature_def(
                inputs={
                    "keys": utils.build_tensor_info(keys_placeholder),
                    "indexs": utils.build_tensor_info(sparse_index),
                    "ids": utils.build_tensor_info(sparse_ids),
                    "values": utils.build_tensor_info(sparse_values),
                    "shape": utils.build_tensor_info(sparse_shape)
                },
                outputs={
                    "keys": utils.build_tensor_info(keys),
                    "softmax": utils.build_tensor_info(inference_softmax),
                    "prediction": utils.build_tensor_info(inference_op)
                },
                method_name=signature_constants.PREDICT_METHOD_NAME)
    }

    # Initialize saver and summary
    saver = tf.train.Saver()
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("train_accuracy", train_accuracy)
    tf.summary.scalar("train_auc", train_auc)
    tf.summary.scalar("validate_accuracy", validate_accuracy)
    tf.summary.scalar("validate_auc", validate_auc)
    summary_op = tf.summary.merge_all()
    init_op = [
        tf.global_variables_initializer(),
        tf.local_variables_initializer()
    ]

    # Create session to run
    with tf.Session() as sess:
        writer = tf.summary.FileWriter(FLAGS.output_path, sess.graph)
        sess.run(init_op)
        sess.run(
            train_dataset_iterator.initializer,
            feed_dict={train_filename_placeholder: train_filename_list})
        sess.run(
            validation_dataset_iterator.initializer,
            feed_dict={validation_filename_placeholder: validation_filename_list})

        if FLAGS.mode == "train":
            # Restore session and start queue runner
            util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path)
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord, sess=sess)
            start_time = datetime.datetime.now()

            try:
                while not coord.should_stop():
                    if FLAGS.benchmark_mode:
                        sess.run(train_op)
                    else:
                        _, step = sess.run([train_op, global_step])

                        # Print state while training
                        if step % FLAGS.steps_to_validate == 0:
                            loss_value, train_accuracy_value, train_auc_value, validate_accuracy_value, auc_value, summary_value = sess.run(
                                [
                                    loss, train_accuracy, train_auc,
                                    validate_accuracy, validate_auc, summary_op
                                ])
                            end_time = datetime.datetime.now()
                            logging.info(
                                "[{}] Step: {}, loss: {}, train_acc: {}, train_auc: {}, valid_acc: {}, valid_auc: {}".
                                format(end_time - start_time, step, loss_value,
                                       train_accuracy_value, train_auc_value,
                                       validate_accuracy_value, auc_value))

                            writer.add_summary(summary_value, step)
                            saver.save(sess, checkpoint_file_path, global_step=step)
                            start_time = end_time
            except tf.errors.OutOfRangeError:
                if FLAGS.benchmark_mode:
                    print("Finish training for benchmark")
                    exit(0)
                else:
                    # Export the model after training
                    util.save_model(
                        FLAGS.model_path,
                        FLAGS.model_version,
                        sess,
                        signature_def_map,
                        is_save_graph=False)
            finally:
                coord.request_stop()
                coord.join(threads)

        elif FLAGS.mode == "save_model":
            if not util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path):
                logging.error("No checkpoint found, exit now")
                exit(1)
            util.save_model(
                FLAGS.model_path,
                FLAGS.model_version,
                sess,
                signature_def_map,
                is_save_graph=False)

        elif FLAGS.mode == "inference":
            if not util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path):
                logging.error("No checkpoint found, exit now")
                exit(1)

            # Load inference test data
            inference_result_file_name = "./inference_result.txt"
            inference_test_file_name = "./data/a8a_test.libsvm"
            labels = []
            feature_ids = []
            feature_values = []
            feature_index = []
            ins_num = 0
            for line in open(inference_test_file_name, "r"):
                tokens = line.split(" ")
                labels.append(int(tokens[0]))
                feature_num = 0
                for feature in tokens[1:]:
                    feature_id, feature_value = feature.split(":")
                    feature_ids.append(int(feature_id))
                    feature_values.append(float(feature_value))
                    feature_index.append([ins_num, feature_num])
                    feature_num += 1
                ins_num += 1

            # Run inference
            start_time = datetime.datetime.now()
            prediction, prediction_softmax = sess.run(
                [inference_op, inference_softmax],
                feed_dict={
                    sparse_index: feature_index,
                    sparse_ids: feature_ids,
                    sparse_values: feature_values,
                    sparse_shape: [ins_num, FLAGS.feature_size]
                })
            end_time = datetime.datetime.now()

            # Compute accuracy
            label_number = len(labels)
            correct_label_number = 0
            for i in range(label_number):
                if labels[i] == prediction[i]:
                    correct_label_number += 1
            accuracy = float(correct_label_number) / label_number

            # Compute auc
            expected_labels = np.array(labels)
            predict_labels = prediction_softmax[:, 0]
            fpr, tpr, thresholds = metrics.roc_curve(
                expected_labels, predict_labels, pos_label=0)
            auc = metrics.auc(fpr, tpr)

            logging.info("[{}] Inference accuracy: {}, auc: {}".format(
                end_time - start_time, accuracy, auc))

            # Save result into the file
            np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",")
            logging.info("Save result to file: {}".format(inference_result_file_name))

        elif FLAGS.mode == "inference_with_tfrecords":
            if not util.restore_from_checkpoint(sess, saver, latest_checkpoint_file_path):
                logging.error("No checkpoint found, exit now")
                exit(1)

            # Load inference test data
            inference_result_file_name = "./inference_result.txt"
            inference_test_file_name = "./data/a8a/a8a_test.libsvm.tfrecords"
            batch_feature_index = []
            batch_labels = []
            batch_ids = []
            batch_values = []
            ins_num = 0

            # Read from TFRecords files
            for serialized_example in tf.python_io.tf_record_iterator(inference_test_file_name):
                # Get serialized example from file
                example = tf.train.Example()
                example.ParseFromString(serialized_example)
                label = example.features.feature["label"].float_list.value
                ids = example.features.feature["ids"].int64_list.value
                values = example.features.feature["values"].float_list.value
                # print("label: {}, features: {}".format(label, " ".join([str(id) + ":" + str(value) for id, value in zip(ids, values)])))
                batch_labels.append(label)
                # Notice that using extend() instead of append() to flatten the values
                batch_ids.extend(ids)
                batch_values.extend(values)
                for i in range(len(ids)):
                    batch_feature_index.append([ins_num, i])
                ins_num += 1

            # Run inference
            start_time = datetime.datetime.now()
            prediction, prediction_softmax = sess.run(
                [inference_op, inference_softmax],
                feed_dict={
                    sparse_index: batch_feature_index,
                    sparse_ids: batch_ids,
                    sparse_values: batch_values,
                    sparse_shape: [ins_num, FLAGS.feature_size]
                })
            end_time = datetime.datetime.now()

            # Compute accuracy
            label_number = len(batch_labels)
            correct_label_number = 0
            for i in range(label_number):
                if batch_labels[i] == prediction[i]:
                    correct_label_number += 1
            accuracy = float(correct_label_number) / label_number

            # Compute auc
            expected_labels = np.array(batch_labels)
            predict_labels = prediction_softmax[:, 0]
            fpr, tpr, thresholds = metrics.roc_curve(
                expected_labels, predict_labels, pos_label=0)
            auc = metrics.auc(fpr, tpr)

            logging.info("[{}] Inference accuracy: {}, auc: {}".format(
                end_time - start_time, accuracy, auc))

            # Save result into the file
            np.savetxt(inference_result_file_name, prediction_softmax, delimiter=",")
            logging.info("Save result to file: {}".format(inference_result_file_name))