def main(args):
    """Evaluate a model on the test file, log the metrics and save them.

    The model returned by ``get_model`` is run without gradients over the
    dataset loaded from ``args.test_file``. The focal loss is averaged over
    all samples and the raw model outputs are collected into one tensor,
    which is handed to ``util.metrics_compute``. The resulting metrics are
    logged, passed to ``ROC_AUC`` together with the path of ``ROC_curve.pdf``
    and stored in ``results.npz`` inside ``args.save_dir``.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Logging and device set-up.
    args.save_dir = util.get_save_dir(args.save_dir, args.name, type="evaluation")
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # Scale the batch size by the number of devices (never below 1x).
    args.batch_size *= max(1, len(args.gpu_ids))

    # Model in inference mode; the step returned by get_model is not used.
    log.info('Building model...')
    model, _ = get_model(log, args)
    model = model.to(device)
    model.eval()

    # Data loader over the test file, in a fixed order.
    log.info('Building dataset...')
    eval_dataset = util.load_dataset(
        args.test_file,
        args.PPI_dir,
        args.PPI_gene_feature_dir,
        args.PPI_gene_query_dict_dir,
        args.max_nodes,
        train=False,
    )
    eval_loader = data.DataLoader(
        eval_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=util.collate_fn,
    )

    log.info('Evaluating...')
    alpha = torch.tensor([args.alpha, 1]).to(device)
    criterion = FocalLoss(alpha=alpha, gamma=args.gamma)
    loss_meter = util.AverageMeter()
    ground_true = eval_loader.dataset.y_list.to(device)

    # One row of two model outputs per sample, filled batch by batch.
    num_samples = len(eval_loader.dataset)
    predict_list = torch.zeros([num_samples, 2], dtype=torch.float).to(device)
    offset = 0

    with torch.no_grad(), tqdm(total=num_samples) as progress_bar:
        for batch in eval_loader:
            # Every element of the batch is moved to the compute device.
            (batch_a, batch_bio_a, batch_A,
             batch_b, batch_bio_b, batch_B,
             batch_y) = (tensor.to(device) for tensor in batch)
            batch_y = batch_y.long()
            batch_size = batch_bio_a.size(0)

            # Forward pass and loss bookkeeping.
            output = model(batch_a, batch_bio_a, batch_A,
                           batch_b, batch_bio_b, batch_B)
            loss_meter.update(criterion(output, batch_y).item(), batch_size)

            predict_list[offset:offset + batch_size] = output
            offset += batch_size

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=loss_meter.avg)

    results = util.metrics_compute(predict_list, ground_true)

    log.info("Evaluation result of model:")
    log.info(f"Loss in test dataset is {loss_meter.avg}")
    log.info(f"Accuracy:{results['Accuracy']}, AUC:{results['AUC']}, Recall:{results['Recall']},Precision:{results['Precision']},Specificity:{results['Specificity']}")
    log.info(f"TP:{results['TP']},FN:{results['FN']}")
    log.info(f"FP:{results['FP']},TN:{results['TN']}")

    log.info("plot prediction curve...")
    roc_path = os.path.join(args.save_dir, "ROC_curve.pdf")
    ROC_AUC(results["fpr"], results["tpr"], results["AUC"], roc_path)

    log.info("Save evaluation result...")
    predictions = np.array(predict_list.cpu().tolist())
    np.savez(os.path.join(args.save_dir, "results.npz"),
             predict=predictions,
             result=results)
def main(args):
    """Run ResNet-50 over a saved image file and dump the raw model outputs.

    The model outputs for every image are stacked, in dataset order, into a
    single tensor that is written to the file ``resnet50_output`` in the
    current working directory.

    Args:
        args: Parsed command-line namespace. Reads ``save_dir``, ``name``,
            ``batch_size`` and ``num_workers``; ``save_dir`` and
            ``batch_size`` are overwritten in place.
    """
    # Width used only when the dataset is empty and the real output width
    # cannot be observed; keeps the original (0, 1000) result shape.
    fallback_output_size = 1000

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Fixed seed and deterministic cuDNN so repeated runs are comparable.
    seed = 42
    torch.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Get model
    # NOTE(review): no checkpoint is loaded here, so the weights are whatever
    # resnet.resnet50() initialises them to - confirm this is intended.
    model = resnet.resnet50()
    model = nn.DataParallel(model, gpu_ids)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('loading dataset...')
    input_data_file = '/home/mahbub/research/flat-resnet/data/dev_images.pt'
    dataset = ImageDataset(input_data_file)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=None)

    # Evaluate
    log.info('Running inference ...')
    output = None  # allocated once the model's output width is known
    out_idx = 0
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for images in data_loader:
            # Setup for forward
            images = images.to(device)
            batch_size = images.shape[0]

            # Forward
            batch_output = model(images)
            if output is None:
                # Size the buffer from the actual model output instead of
                # hard-coding the number of classes.
                output = torch.zeros(len(dataset), *batch_output.shape[1:])
            output[out_idx:out_idx + batch_size] = batch_output
            out_idx += batch_size

            # Log info
            progress_bar.update(batch_size)

    if output is None:
        # Empty dataset: nothing was inferred, keep the historical shape.
        output = torch.zeros(len(dataset), fallback_output_size)

    # Write output to a file
    torch.save(output, "resnet50_output")
def main(args):
    """Train a BiDAF model on SQuAD with periodic dev evaluation.

    Builds the model (optionally resuming from ``args.load_path``), trains it
    with Adadelta under a constant learning rate, keeps an exponential moving
    average of the parameters, and every ``args.eval_steps`` examples
    evaluates the EMA weights on the dev set, saves a checkpoint and logs the
    metrics to the console and TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples seen, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    # `<` rather than `!=`: a checkpoint that is already past num_epochs must
    # end the loop instead of spinning forever.
    while epoch < args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint, using the EMA weights.
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    # Restore the raw training weights before continuing.
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)

    # Flush any pending TensorBoard events before returning.
    tbx.close()
def _question_words(question):
    """Return the lower-cased words of *question* with punctuation removed.

    Apostrophes are turned into word separators; the other punctuation
    characters are deleted (which can leave empty strings in the result).
    """
    strip_table = {ord(char): "" for char in ",.<>!@£$%^&*()_-+=?"}
    raw_words = question.lower().translate({ord("'"): " "}).split()
    return [word.translate(strip_table) for word in raw_words]


def _question_type(words, pronouns):
    """Return the question type (leading pronoun or 'in'/'by' + pronoun) of *words*."""
    if not words:
        # Nothing to analyse: treat as a question without interrogative pronoun.
        return ""
    first_word = words[0]
    # One-word questions have no second word; "" is never a pronoun.
    second_word = words[1] if len(words) > 1 else ""

    if first_word in pronouns:
        return first_word
    # Composed question starting with "in" (e.g. "in what", "in which").
    if (first_word == "in" and second_word in pronouns
            and second_word != "whose"):
        return first_word + " " + second_word
    # Composed question starting with "by".
    if (first_word == "by" and second_word in pronouns
            and second_word not in ("whom", "which", "when", "how")):
        return first_word + " " + second_word
    return find_first_interrogative_pronoun(pronouns, words)


def _group_questions_by_type(gold_dict, pronouns):
    """Map every question type to the list of gold_dict keys of that type."""
    audit_trail = defaultdict(list)
    # Iterate over the keys themselves so that no question (in particular the
    # last one) is skipped.
    for key, example in gold_dict.items():
        words = _question_words(example['question'])
        audit_trail[_question_type(words, pronouns)].append(key)
    return audit_trail


def _split_by_pronoun_position(question_type, gold_subset, pred_subset):
    """Split one question type by where the pronoun occurs in the question.

    Returns two dicts keyed by ``"start"``, ``"middle"`` and ``"end"``, one
    with gold examples and one with predictions. Questions without pronoun
    (type ``""``) are kept in all three positions.
    """
    positions = ("start", "middle", "end")
    gold_by_position = {position: dict(gold_subset) for position in positions}
    pred_by_position = {position: dict(pred_subset) for position in positions}
    if question_type == "":
        return gold_by_position, pred_by_position

    for key, example in gold_subset.items():
        question = example["question"]
        found_at = question.lower().find(question_type)
        if found_at in (0, 1):
            position = "start"
        elif found_at >= len(question) - len(question_type) - 5:
            position = "end"
        else:
            position = "middle"
        # Keep the question only in the bucket it belongs to.
        for other in positions:
            if other != position:
                gold_by_position[other].pop(key, None)
                pred_by_position[other].pop(key, None)
    return gold_by_position, pred_by_position


def _plot_f1_by_question_type(gold_dict, pred_dict, audit_trail, use_squad_v2):
    """Show a heatmap of F1 per question type and pronoun position."""
    positions_in_question = ["beginning", "middle", "end"]
    types_of_questions = list(audit_trail.keys())

    gold_by_type = {}
    pred_by_type = {}
    for question_type in types_of_questions:
        # A set makes the membership tests below O(1).
        keys_of_type = set(audit_trail[question_type])
        gold_subset = {key: value for key, value in gold_dict.items()
                       if key in keys_of_type and key in pred_dict}
        pred_subset = {key: value for key, value in pred_dict.items()
                       if key in keys_of_type}
        gold_by_type[question_type], pred_by_type[question_type] = \
            _split_by_pronoun_position(question_type, gold_subset, pred_subset)

    def f1_row(position):
        return [
            util.eval_dicts(gold_by_type[question_type][position],
                            pred_by_type[question_type][position],
                            use_squad_v2)['F1']
            for question_type in types_of_questions
        ]

    F1 = np.array([f1_row("start"), f1_row("middle"), f1_row("end")])

    # Cells without a usable score are not annotated in the heatmap.
    m, n = F1.shape
    value_to_ignore = [(i, j) for i in range(m) for j in range(n)
                       if F1[i, j] == "NA" or F1[i, j] == 0]
    print("value to ignore: ", value_to_ignore)

    data_label = copy.deepcopy(F1)
    for row in data_label:
        for column_idx in range(len(row)):
            if row[column_idx] == "NA":
                row[column_idx] = ""

    # Print the questions without interrogative pronoun, required for the
    # second part of the analysis.
    implicit_keys = set(audit_trail.get("", ()))
    for key, value in gold_dict.items():
        if key in implicit_keys and key in pred_dict:
            print("question: ", value["question"])
            # NOTE(review): presumably the gold answers live under "answers"
            # - confirm against the eval file schema.
            print("golden answers: ", value.get("answers", ""))
            print("prediction: ", pred_dict[key])
            print()

    fig, ax = plt.subplots()
    if "" in types_of_questions:
        types_of_questions[types_of_questions.index("")] = \
            "Implicit question without interrogative pronoun"
    im, _ = heatmap(F1, positions_in_question, types_of_questions, ax=ax,
                    cmap="YlGn", cbarlabel="F1 scores")
    annotate_heatmap(im, data=data_label, valfmt="{x:.1f}",
                     ignore=value_to_ignore)
    fig.tight_layout()
    plt.show()


def main(args):
    """Evaluate a trained QA model on one split and write a submission file.

    Loads the checkpoint at ``args.load_path``, runs the model over the
    ``args.split`` records and writes a CSV submission. For every split but
    ``'test'`` it additionally computes the metrics, shows a heatmap of F1 by
    question type and pronoun position, and logs to the console and
    TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir`` and ``batch_size``
            are overwritten in place.

    Raises:
        ValueError: If ``args.model`` is not one of the supported names.
    """
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    if args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    elif args.model == 'bidafextra':
        model = BiDAFExtra(word_vectors=word_vectors, args=args)
    elif args.model == 'fusionnet':
        model = FusionNet(word_vectors=word_vectors, args=args)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(f'Unknown model: {args.model!r}')
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # Group the questions by interrogative pronoun for the later analysis.
    list_of_interrogative_pronouns = [
        "what", "whose", "why", "which", "where", "when", "how", "who", "whom"
    ]
    audit_trail_from_question_type = _group_questions_by_type(
        gold_dict, list_of_interrogative_pronouns)

    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for (cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq,
             cqw_extra, y1, y2, ids) in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # F1 per question type and pronoun position, shown as a heatmap.
        _plot_f1_by_question_type(gold_dict, pred_dict,
                                  audit_trail_from_question_type,
                                  args.use_squad_v2)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)
        # Flush pending events; the writer is not used after this point.
        tbx.close()

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def _evaluate_dev_chunks(args, model, device, log):
    """Evaluate *model* on every dev chunk and merge the per-chunk results.

    Returns:
        ``(all_results, all_pred_dicts)``: the metrics averaged over all
        chunks, weighted by chunk size, and the union of all predictions.

    Raises:
        ValueError: If no dev example was evaluated.
    """
    metric_sums = OrderedDict()
    all_pred_dicts = {}
    num_examples = 0
    for chunk_idx in range(args.num_dev_chunks):
        dev_rec_file = f"{args.dev_record_file_exp2}_{chunk_idx}.npz"
        log.info(f'Building evaluating dataset from {dev_rec_file} ...')
        dev_dataset = SQuAD(dev_rec_file, args.exp2_dev_topic_contexts,
                            use_v2=True)
        # Evaluation does not need shuffling.
        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn)
        results, pred_dict = evaluate(model, dev_loader, device,
                                      args.dev_eval_file,
                                      args.max_ans_len,
                                      use_squad_v2=True)

        # NOTE(review): assumes every metric is a per-example mean over the
        # chunk, so a size-weighted mean gives the overall value - confirm
        # against `evaluate`.
        chunk_size = len(dev_dataset)
        for metric, value in results.items():
            metric_sums[metric] = (metric_sums.get(metric, 0.0)
                                   + value * chunk_size)
        num_examples += chunk_size
        all_pred_dicts.update(pred_dict)

        # Release the chunk before loading the next one.
        del dev_dataset
        del dev_loader
        del results
        del pred_dict
        torch.cuda.empty_cache()

    if num_examples == 0:
        raise ValueError('No dev examples were evaluated; check '
                         'num_dev_chunks and the dev record files.')
    all_results = OrderedDict((metric, total / num_examples)
                              for metric, total in metric_sums.items())
    return all_results, all_pred_dicts


def main(args):
    """Train BiDAF on chunked SQuAD record files with periodic dev evaluation.

    For every epoch the training chunks are loaded one at a time. Every
    ``args.eval_steps`` examples the EMA weights are evaluated on all dev
    chunks, a checkpoint is saved and the merged metrics are logged to the
    console and TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir`` and ``batch_size``
            are overwritten in place.
    """
    # Set up logging and devices
    name = "train_exp2"
    args.save_dir = util.get_save_dir(args.logging_dir, name, training=True)
    log = get_logger(args.save_dir, name)
    tbx = SummaryWriter(args.save_dir)
    device, gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Get embeddings
    log.info(f"Loading embeddings from {args.word_emb_file}...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.learning_rate,
                               weight_decay=args.learning_rate_decay)
    scheduler = sched.ReduceLROnPlateau(optimizer=optimizer,
                                        mode="min",
                                        factor=0.1,
                                        patience=2,
                                        verbose=True,
                                        cooldown=0,
                                        min_lr=0.0005)

    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch}...")
        for chunk_idx in range(args.num_train_chunks):
            # Get data loader
            train_rec_file = f"{args.train_record_file_exp2}_{chunk_idx}.npz"
            log.info(f'Building dataset from {train_rec_file} ...')
            train_dataset = SQuAD(train_rec_file,
                                  args.exp2_train_topic_contexts,
                                  use_v2=True)
            train_loader = data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_fn)

            # Train
            log.info('Training...')
            steps_till_eval = args.eval_steps
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = qw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    # ReduceLROnPlateau expects the monitored quantity (to be
                    # minimised), not a step counter.
                    # NOTE(review): the per-batch training loss is noisy;
                    # consider stepping on a dev metric instead.
                    scheduler.step(loss_val)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint, using the EMA weights.
                        log.info(f"Evaluating at step {step}...")
                        ema.assign(model)
                        all_results, all_pred_dicts = _evaluate_dev_chunks(
                            args, model, device, log)
                        saver.save(step, model,
                                   all_results[args.metric_name], device)
                        # Restore the raw training weights before continuing.
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(
                            f'{k}: {v:05.2f}' for k, v in all_results.items())
                        log.info(f"Dev {results_str}")

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in all_results.items():
                            tbx.add_scalar(f"dev/{k}", v, step)
                        util.visualize(tbx,
                                       pred_dict=all_pred_dicts,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
                        torch.cuda.empty_cache()

            # Release the chunk before loading the next one.
            del train_dataset
            del train_loader
            torch.cuda.empty_cache()

    # Flush any pending TensorBoard events before returning.
    tbx.close()
def main():
    """Evaluate a saved TimeCNN checkpoint on the test split and log the results.

    Loads the checkpoint at ``path``, runs it over the test loader without
    gradients, and logs accuracy, the number of misclassified 1's and 0's,
    and how many samples were predicted as each class.
    """
    save_dir = util.get_save_dir('save', 'TimeCNN', training=False)
    log = util.get_logger(save_dir, 'TimeCNN')
    device, gpu_ids = util.get_available_devices()
    # Not written to below; kept so the run directory is set up as before.
    tbx = SummaryWriter(save_dir)

    # The original left this literal as a bare expression, so `path` was
    # never bound and the load below raised NameError.
    path = 'save/train/TimeCNN-wd0.01-epoch100-01/best.pth.tar'

    # Build model here
    log.info("Building model")
    model = TimeCNN()
    model = nn.DataParallel(model, gpu_ids)
    model = util.load_model(model, path, gpu_ids, return_step=False)
    model = model.to(device)
    model = model.double()
    model.eval()

    log.info("Building Dataset")
    test_dataset = Shots("videos/test.h5py", "labels/test.npy")
    test_loader = data.DataLoader(test_dataset,
                                  batch_size=BATCH_SIZE,
                                  shuffle=False,
                                  num_workers=4,
                                  collate_fn=collate_fn)

    num_correct = 0
    num_samples = 0
    missed_1, missed_0 = 0, 0
    num_1_predicted = 0
    num_0_predicted = 0

    with torch.no_grad():
        for frames, y in test_loader:
            frames = frames.to(device)
            y = y.to(device)

            scores = model(frames)
            preds = scores.argmax(dim=1)

            # `.item()` keeps the counters as plain ints so they log as
            # numbers rather than `tensor(...)`.
            num_correct += (preds == y).sum().item()
            # How many 1's and 0's were misclassified.
            missed_1 += ((y == 1) & (preds == 0)).sum().item()
            missed_0 += ((y == 0) & (preds == 1)).sum().item()
            num_samples += preds.shape[0]
            num_1_predicted += (preds == 1).sum().item()
            num_0_predicted += (preds == 0).sum().item()

    # Guard against an empty test set instead of dividing by zero.
    acc = float(num_correct) / num_samples if num_samples else 0.0

    log.info("Path: {}".format(path))
    log.info("Accuracy on test set is {}".format(acc))
    log.info("Missed 1's: {}, Missed 0's: {}".format(missed_1, missed_0))
    log.info("Number 1's predicted: {}".format(num_1_predicted))
    log.info("Number 0's predicted: {}".format(num_0_predicted))
    log.info('-----------------')

    # Only one checkpoint is evaluated, so it is trivially the best one.
    # NOTE(review): `best_accuracy` / `best_path` were referenced but never
    # defined - presumably left over from a loop over several checkpoints.
    best_accuracy, best_path = acc, path
    log.info("Best Accuracy on test set is {} and path was {}".format(
        best_accuracy, best_path))
def main(course_dir, text_embedding_size, audio_embedding_size,
         image_embedding_size, hidden_size, drop_prob, max_text_length,
         out_heatmaps_dir, args, batch_size=3, num_epochs=100):
    """Train the MMBiDAF model on the text/audio/image data of one course.

    Args:
        course_dir: Directory handed to the text, audio, image and target
            dataset constructors.
        text_embedding_size: Forwarded to ``MMBiDAF``.
        audio_embedding_size: Forwarded to ``MMBiDAF``.
        image_embedding_size: Forwarded to ``MMBiDAF``.
        hidden_size: Forwarded to ``MMBiDAF``.
        drop_prob: Forwarded to ``MMBiDAF``.
        max_text_length: Forwarded to ``TextDataset`` and ``MMBiDAF``.
        out_heatmaps_dir: Unused while heat-map generation is disabled.
        args: Parsed command-line namespace (save dir, seed, learning rate,
            ``num_epochs``, ``eval_steps``, checkpoint settings, ...).
        batch_size: Batch size used by every data loader.
        num_epochs: Unused; the loop runs until ``args.num_epochs``.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Create Dataset objects
    text_dataset = TextDataset(course_dir, max_text_length)
    audio_dataset = AudioDataset(course_dir)
    target_dataset = TargetDataset(course_dir)

    # Preprocess the image in prescribed format
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.RandomResizedCrop(256),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    image_dataset = ImageDataset(course_dir, transform)

    assert len(text_dataset) == len(audio_dataset) and len(
        audio_dataset) == len(image_dataset) and len(image_dataset) == len(
            target_dataset), "Unequal dataset lengths"

    # Creating data indices for training and validation splits:
    train_indices, val_indices = gen_train_val_indices(text_dataset)

    # `SequentialSampler(indices)` only uses the *length* of its argument
    # and yields 0..len-1, so the train and validation loaders overlapped.
    # A plain index list is itself a valid sampler: it visits exactly the
    # requested items, in the same order for every modality.
    train_sampler = [int(idx) for idx in train_indices]
    val_sampler = [int(idx) for idx in val_indices]

    def make_loader(dataset, sampler, collate):
        """Build a non-shuffling loader over `dataset` restricted to `sampler`."""
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=batch_size,
                                           shuffle=False,
                                           num_workers=2,
                                           collate_fn=collate,
                                           sampler=sampler)

    # The validation loaders are not consumed yet (evaluation is still a
    # TODO below) but are built as in the original.
    # Get sentence embeddings
    train_text_loader = make_loader(text_dataset, train_sampler, collator)
    val_text_loader = make_loader(text_dataset, val_sampler, collator)
    # Get Audio embeddings
    train_audio_loader = make_loader(audio_dataset, train_sampler, collator)
    val_audio_loader = make_loader(audio_dataset, val_sampler, collator)
    # Get images
    train_image_loader = make_loader(image_dataset, train_sampler, collator)
    val_image_loader = make_loader(image_dataset, val_sampler, collator)
    # Load Target text
    train_target_loader = make_loader(target_dataset, train_sampler,
                                      target_collator)
    val_target_loader = make_loader(target_dataset, val_sampler,
                                    target_collator)

    # Create model
    model = MMBiDAF(hidden_size, text_embedding_size, audio_embedding_size,
                    image_embedding_size, device, drop_prob, max_text_length)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)  # For exponential moving average

    # Get saver
    # TODO: the metric name still needs to be changed for this task.
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Let's do this!
    log.info("Training...")
    steps_till_eval = args.eval_steps
    # Reuse the dataset built above rather than constructing a second
    # TextDataset just to read its length.
    epoch = step // len(text_dataset)

    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")  # was missing the f-prefix
        count_item = 0
        loss_epoch = 0
        with torch.enable_grad(), tqdm(
                total=len(train_text_loader.dataset)) as progress_bar:
            batches = zip(train_text_loader, train_audio_loader,
                          train_image_loader, train_target_loader)
            for ((batch_text, original_text_lengths),
                 (batch_audio, original_audio_lengths),
                 (batch_images, original_img_lengths),
                 (batch_target_indices, batch_source_paths,
                  batch_target_paths, original_target_len)) in batches:
                # TODO check error : max decoder timesteps for each batch
                max_dec_len = torch.max(original_target_len)

                # Transfer tensors to GPU
                batch_text = batch_text.to(device)
                log.info("Loaded batch text")
                batch_audio = batch_audio.to(device)
                log.info("Loaded batch audio")
                batch_images = batch_images.to(device)
                log.info("Loaded batch image")
                batch_target_indices = batch_target_indices.to(device)
                log.info("Loaded batch targets")

                # Setup for forward
                cur_batch_size = batch_text.size(0)
                optimizer.zero_grad()

                log.info("Starting forward pass")
                # Forward
                batch_out_distributions, loss = model(
                    batch_text, original_text_lengths, batch_audio,
                    original_audio_lengths, batch_images,
                    original_img_lengths, batch_target_indices,
                    original_target_len, max_dec_len)
                loss_val = loss.item()  # numerical value of loss
                loss_epoch = loss_epoch + loss_val
                # Count batches so the epoch average below has a non-zero
                # denominator (the original never incremented this).
                count_item += 1

                log.info("Starting backward")
                # Backward
                loss.backward()
                # To tackle exploding gradients
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // cur_batch_size)
                ema(model, step // cur_batch_size)

                # Log info
                step += cur_batch_size
                progress_bar.update(cur_batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= cur_batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    # TODO: evaluate on the validation loaders here.
                    # NOTE(review): no metric value is passed to the saver -
                    # confirm this matches CheckpointSaver.save's signature.
                    saver.save(step, model, device)
                    ema.resume(model)

                # Generate summary
                print('Generated summary for iteration {}: '.format(epoch))
                summaries = get_generated_summaries(batch_out_distributions,
                                                    original_text_lengths,
                                                    batch_source_paths)
                print(summaries)

                # TODO: ROUGE evaluation and output heat-maps are disabled.

        # Average over the batches seen; guard the case of an empty loader.
        print("Epoch loss is : {}".format(loss_epoch / max(count_item, 1)))
def main(args):
    """Train a BiDAF/QANet variant on SQuAD with gradient accumulation.

    The model class is chosen by ``args.name``. The optimizer is chosen by
    ``args.opt``, except that ``args.name == 'qanet'`` always uses Adam with
    warm-up. Gradients are accumulated over ``args.acc_step`` batches before
    each optimizer step. Every ``args.eval_steps`` samples the EMA weights
    are evaluated on the dev set, checkpointed and logged to TensorBoard.

    Args:
        args: Parsed command-line namespace.

    Raises:
        ValueError: If ``args.name`` is not a known model name, or
            ``args.opt`` is not a supported optimizer.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vec = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    if args.name == 'baseline':
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.name in ('charembeddings', 'charembeddings2'):
        char_cls = BiDAFChar if args.name == 'charembeddings' else BiDAFChar2
        model = char_cls(word_vectors=word_vectors,
                         char_vec=char_vec,
                         word_len=16,
                         hidden_size=args.hidden_size,
                         drop_prob=args.drop_prob)
    elif args.name == 'qanet':
        model = QANet(word_vectors=word_vectors,
                      char_vec=char_vec,
                      word_len=16,
                      emb_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      enc_size=args.enc_size,
                      n_head=args.n_head,
                      LN_train=args.ln_train,
                      DP_residual=args.dp_res,
                      mask_pos=args.mask_pos,
                      two_pos=args.two_pos,
                      total_prob=args.total_drop,
                      final_prob=args.final_prob)
    elif args.name in ('qanet2', 'qanet3', 'qanet4'):
        # These three variants take exactly the same constructor arguments.
        qanet_cls = {'qanet2': QANet2,
                     'qanet3': QANet3,
                     'qanet4': QANet4}[args.name]
        model = qanet_cls(word_vectors=word_vectors,
                          char_vec=char_vec,
                          word_len=16,
                          emb_size=args.hidden_size,
                          drop_prob=args.drop_prob,
                          enc_size=args.enc_size,
                          n_head=args.n_head,
                          LN_train=args.ln_train,
                          DP_residual=args.dp_res,
                          mask_pos=args.mask_pos,
                          two_pos=args.two_pos,
                          rel=args.rel_att,
                          total_prob=args.total_drop,
                          final_prob=args.final_prob,
                          freeze=args.freeze_emb)
    else:
        raise ValueError('Wrong model name')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    if args.name == 'qanet':
        optimizer = optim.Adam(model.parameters(), args.lr,
                               betas=(0.8, 0.999),
                               weight_decay=3 * 1e-7,
                               eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adam':
        if args.grad_cent:
            optimizer = AdamWGC(model.parameters(), args.lr,
                                betas=(0.9, 0.999),
                                weight_decay=3 * 1e-7,
                                eps=1e-7,
                                use_gc=True)
        else:
            optimizer = AdamW(model.parameters(), args.lr,
                              betas=(0.8, 0.999),
                              weight_decay=3 * 1e-7,
                              eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(), args.lr,
                                   weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    elif args.opt == 'sgd':
        optimizer = optim.SGD(model.parameters(), args.lr,
                              weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    else:
        # Previously an unknown value fell through and only surfaced later
        # as a NameError on `optimizer`.
        raise ValueError(f'Unsupported optimizer: {args.opt!r}')

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    i = 0  # number of batches seen; drives gradient accumulation
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward. The character indices are moved too so
                # that every model input lives on the same device.
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()  # logged unscaled
                i += 1
                # Scale so the accumulated gradient is an average over the
                # `acc_step` batches.
                loss /= args.acc_step

                # Backward
                loss.backward()
                if i % args.acc_step == 0:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(i // args.acc_step)
                    ema(model, i // args.acc_step)
                    optimizer.zero_grad()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                # Only evaluate right after an optimizer step, when no
                # partially accumulated gradients are pending.
                if steps_till_eval <= 0 and i % args.acc_step == 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main(args):
    """Train the sketchy, intensive or retro reader on SQuAD.

    ``args.model_name`` selects both the model class and the loss:

    * ``sketchy``   - binary cross-entropy on ``y1 != 0``.
    * ``intensive`` - ``alpha_1`` * that BCE term plus ``alpha_2`` * the
      start/end NLL span loss.
    * ``retro``     - the start/end NLL span loss only.

    Every ``args.eval_steps`` samples the EMA weights are evaluated on the
    dev set, checkpointed and logged to TensorBoard.

    Args:
        args: Parsed command-line namespace.

    Raises:
        ValueError: If ``args.model_name`` is not one of the three names.
    """
    # Fail fast. Previously an unknown name crashed with a NameError on
    # `model` long before the ValueError inside the training loop.
    if args.model_name not in ('sketchy', 'intensive', 'retro'):
        raise ValueError(
            'invalid --model_name, sketchy, intensive or retro required')

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    if args.model_name == 'sketchy':
        model = SketchyReader(word_vectors=word_vectors,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              char_embed_drop_prob=args.char_embed_drop_prob,
                              num_heads=args.num_heads,
                              drop_prob=args.drop_prob)
    elif args.model_name == 'intensive':
        model = IntensiveReader(word_vectors=word_vectors,
                                char_vectors=char_vectors,
                                num_heads=args.num_heads,
                                char_embed_drop_prob=args.char_embed_drop_prob,
                                hidden_size=args.hidden_size,
                                drop_prob=args.drop_prob)
    else:  # 'retro': outer model built from the two saved readers
        model = RetroQANet(word_vectors=word_vectors,
                           char_vectors=char_vectors,
                           hidden_size=args.hidden_size,
                           num_heads=args.num_heads,
                           char_embed_drop_prob=args.char_embed_drop_prob,
                           intensive_path=args.load_path_i,
                           sketchy_path=args.load_path_s,
                           gpu_ids=args.gpu_ids,
                           drop_prob=args.drop_prob)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Setup losses
    bce_loss = nn.BCELoss()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    if args.optim == "adam":
        optimizer = optim.Adam(model.parameters(), 0.001,
                               betas=(0.8, 0.999),
                               eps=1e-7,
                               weight_decay=3e-7)
    else:
        optimizer = optim.Adadelta(model.parameters(), args.lr,
                                   weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                y1, y2 = y1.to(device), y2.to(device)
                # Binary target: 1 wherever y1 is non-zero. Built from `y1`
                # so it stays on the training device; the old
                # `.type(torch.FloatTensor)` cast moved it to the CPU.
                has_answer = (y1 != 0).float()
                if args.model_name == 'sketchy':
                    yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                    loss = bce_loss(yi, has_answer)
                elif args.model_name == 'intensive':
                    yi, log_p1, log_p2 = model(cw_idxs, qw_idxs,
                                               cc_idxs, qc_idxs)
                    span_loss = (F.nll_loss(log_p1, y1)
                                 + F.nll_loss(log_p2, y2))
                    loss = (args.alpha_1 * bce_loss(yi, has_answer)
                            + args.alpha_2 * span_loss)
                else:  # 'retro' (validated at the top of the function)
                    log_p1, log_p2 = model(cw_idxs, qw_idxs,
                                           cc_idxs, qc_idxs)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/' + args.model_name, loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  model_name=args.model_name,
                                                  a1=args.alpha_1,
                                                  a2=args.alpha_2)
                    saver.save(step, model, results[args.metric_name],
                               device, model_name=args.model_name)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(
                        f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main(args):
    """Train BiDAF on windowed SQuAD contexts (``SegmentSQuAD``).

    For each training example the model is run on every non-padded context
    window. The window with the highest ``max(log_p1) + max(log_p2)`` is
    then run once more with gradients, and its start/end predictions are
    compared with the gold positions after shifting by
    ``window_index * train_dataset.stride``. Every ``args.eval_steps``
    samples the EMA weights are evaluated on the dev set, checkpointed and
    logged to TensorBoard.

    Args:
        args: Parsed command-line namespace.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SegmentSQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()
                y1, y2 = y1.to(device), y2.to(device)

                # Forward
                loss = 0
                for i in range(batch_size):
                    question = qw_idxs[i].unsqueeze(0)

                    # Window scores are sums of log-probabilities (<= 0),
                    # so the running best must start at -inf. Starting at 0,
                    # as before, meant no window ever beat it and window 0
                    # was always selected.
                    best_score = float('-inf')
                    best_window = 0
                    # The scan only ranks windows; no gradients are needed.
                    with torch.no_grad():
                        for j in range(cw_idxs.size(1)):
                            # Skip windows in which every word is padding.
                            if cw_idxs[i, j].sum().item() == 0:
                                continue
                            log_p1_j, log_p2_j = model(
                                cw_idxs[i, j].unsqueeze(0), question)
                            score = (torch.max(log_p1_j)
                                     + torch.max(log_p2_j)).item()
                            if score > best_score:
                                best_score = score
                                best_window = j

                    # Re-run the chosen window with gradients enabled.
                    log_p1_max, log_p2_max = model(
                        cw_idxs[i, best_window].unsqueeze(0), question)

                    # Adjust label to the window case: a prediction counts
                    # as correct when, shifted by the window offset, it
                    # equals the gold position. The loss target is then the
                    # argmax if correct, otherwise the argmin.
                    offset = best_window * train_dataset.stride
                    for log_p, gold in ((log_p1_max, y1[i]),
                                        (log_p2_max, y2[i])):
                        predicted = torch.argmax(log_p)
                        if offset + predicted.item() == gold.item():
                            target = predicted
                        else:
                            target = torch.argmin(log_p)
                        loss += F.nll_loss(log_p, target.unsqueeze(0))
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main():
    """Train the class-conditional GAN, sampling, validating and checkpointing as it goes.

    Configuration is read from ``args.get_setup_args()``. Per batch, the
    discriminator is updated on a real batch and on a detached fake batch
    (with smoothed and occasionally flipped adversarial labels), then the
    generator is updated. Sample grids are written every
    ``opt.sample_interval`` batches; every ``opt.checkpoint_epochs`` epochs
    (except the final one) loss/accuracy plots, a validation FID and a model
    checkpoint are written. After training, final plots, the final FID and
    ``model.tar`` are saved under ``opt.output_path/opt.version``.
    """
    set_random_seed()

    # Arguments
    opt = args.get_setup_args()

    device, _ = util.get_available_devices()

    num_classes = opt.num_classes
    # The generator input is the latent vector concatenated with a one-hot class label.
    noise_dim = opt.latent_dim + opt.num_classes

    def weights_init(m):
        """Initialise Conv weights from N(0, 0.02) and BatchNorm weights from N(1, 0.02)."""
        classname = m.__class__.__name__
        if classname.find('Conv') != -1:
            nn.init.normal_(m.weight.data, 0.0, 0.02)
        elif classname.find('BatchNorm') != -1:
            nn.init.normal_(m.weight.data, 1.0, 0.02)
            nn.init.constant_(m.bias.data, 0)

    train_images_path = os.path.join(opt.data_path, "train")
    val_images_path = os.path.join(opt.data_path, "val")

    output_model_path = os.path.join(opt.output_path, opt.version)
    output_train_images_path = os.path.join(opt.output_path, opt.version,
                                            "train")
    output_sample_images_path = os.path.join(opt.output_path, opt.version,
                                             "sample")
    output_nn_images_path = os.path.join(opt.output_path, opt.version, "nn")
    output_const_images_path = os.path.join(opt.output_path, opt.version,
                                            "constant_sample")

    os.makedirs(output_train_images_path, exist_ok=True)
    os.makedirs(output_sample_images_path, exist_ok=True)
    os.makedirs(output_nn_images_path, exist_ok=True)
    os.makedirs(output_const_images_path, exist_ok=True)

    train_set = datasets.ImageFolder(
        root=train_images_path,
        transform=transforms.Compose([
            transforms.Resize((opt.img_size, opt.img_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]))

    dataloader = torch.utils.data.DataLoader(train_set,
                                             batch_size=opt.batch_size,
                                             shuffle=True,
                                             num_workers=opt.num_workers)
    # Single-image loader used only for the nearest-neighbour search.
    dataloader_nn = torch.utils.data.DataLoader(train_set,
                                                batch_size=1,
                                                num_workers=opt.num_workers)

    gen = fcgan.Generator(noise_dim).to(device)
    disc = fcgan.Discriminator(num_classes).to(device)

    gen.apply(weights_init)
    disc.apply(weights_init)

    optimG = optim.Adam(gen.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))
    optimD = optim.Adam(disc.parameters(), lr=opt.lr, betas=(opt.b1, opt.b2))

    adversarial_loss = torch.nn.BCELoss()
    auxiliary_loss = torch.nn.CrossEntropyLoss()

    real_label_val = 1
    # Real targets are smoothed by drawing them between these two bounds.
    real_label_low = 0.75
    real_label_high = 1.0
    fake_label_val = 0
    # Class index used as the auxiliary target for generated images.
    c_fake_label = opt.num_classes

    # Probability of adding label noise during discriminator training
    label_noise_prob = 0.05

    # Keep track of losses, accuracy, FID
    G_losses = []
    D_losses = []
    D_acc = []
    FIDs = []
    val_epochs = []

    # Define a fixed noise vector for consistent samples
    z_const = torch.randn(
        (num_classes * opt.num_sample_images, opt.latent_dim)).to(device)

    def print_labels():
        """Print the class-name to class-index mapping of the training set."""
        for class_name in train_set.classes:
            print("{} -> {}".format(class_name,
                                    train_set.class_to_idx[class_name]))

    def eval_fid(gen_images_path, eval_images_path):
        """Return the FID between the two image directories."""
        print("Calculating FID...")
        fid = fid_score.calculate_fid_given_paths(
            (gen_images_path, eval_images_path), opt.batch_size, device)
        return fid

    def validate(keep_images=True):
        """Generate one image per validation image and return the resulting FID.

        Resized validation images are written once to
        ``<val_images_path>_<img_size>`` and reused on later calls. Generated
        images are deleted afterwards when ``keep_images`` is false.
        """
        # Put G in eval mode
        gen.eval()

        val_set = datasets.ImageFolder(
            root=val_images_path,
            transform=transforms.Compose([
                transforms.Resize((opt.img_size, opt.img_size)),
                transforms.ToTensor()
            ]))
        val_loader = torch.utils.data.DataLoader(val_set,
                                                 batch_size=opt.batch_size,
                                                 shuffle=True,
                                                 num_workers=opt.num_workers)

        output_images_path = os.path.join(opt.output_path, opt.version, "val")
        os.makedirs(output_images_path, exist_ok=True)

        output_source_images_path = val_images_path + "_" + str(opt.img_size)

        source_images_available = True
        if not os.path.exists(output_source_images_path):
            os.makedirs(output_source_images_path)
            source_images_available = False

        images_done = 0
        # The dataset labels are ignored: class labels are sampled at random.
        for images, _ in val_loader:
            batch_size = images.size(0)
            noise = torch.randn((batch_size, opt.latent_dim)).to(device)
            labels = torch.randint(0, num_classes, (batch_size, )).to(device)
            labels_onehot = F.one_hot(labels, num_classes)
            noise = torch.cat((noise, labels_onehot.to(dtype=torch.float)), 1)
            gen_images = gen(noise)

            for offset in range(batch_size):
                image_index = images_done + offset
                vutils.save_image(gen_images[offset, :, :, :],
                                  "{}/{}.jpg".format(output_images_path,
                                                     image_index),
                                  normalize=True)
                if not source_images_available:
                    vutils.save_image(images[offset, :, :, :],
                                      "{}/{}.jpg".format(
                                          output_source_images_path,
                                          image_index),
                                      normalize=True)

            images_done += batch_size

        # Put G back in train mode
        gen.train()

        fid = eval_fid(output_images_path, output_source_images_path)
        if not keep_images:
            print("Deleting images generated for validation...")
            rmtree(output_images_path)
        return fid

    def get_dist(img1, img2):
        """Return the L1 distance between two image tensors."""
        return torch.dist(img1, img2, p=1)

    def get_nn(images, class_label):
        """Return, for each image, its L1-nearest training image of ``class_label``."""
        # Named `nearest` rather than `nn` so the torch.nn alias is not shadowed.
        nearest = [None] * len(images)
        best_dist = [np.inf] * len(images)

        for img, label in dataloader_nn:
            if label != class_label:
                continue
            img = img.to(device)
            for i in range(len(images)):
                d = get_dist(images[i], img)
                if d < best_dist[i]:
                    best_dist[i] = d
                    nearest[i] = img

        return torch.stack(nearest, dim=0).squeeze().to(device)

    def get_nearest_neighbour(samples, num_images):
        """Return one tensor per class holding that class's samples followed by their neighbours."""
        all_nn = []
        for i in range(num_classes):
            class_samples = samples[i * num_images:(i + 1) * num_images]
            nearest_n = get_nn(class_samples, i)
            class_nn = torch.stack([class_samples, nearest_n],
                                   dim=0).squeeze().view(
                                       -1, 3, opt.img_size,
                                       opt.img_size).to(device)
            all_nn.append(class_nn)
        return all_nn

    def get_onehot_labels(num_images):
        """Return one-hot labels: ``num_images`` rows for class 0, then class 1, and so on."""
        labels = torch.zeros(num_images, 1).to(device)
        for i in range(num_classes - 1):
            temp = torch.ones(num_images, 1).to(device) + i
            labels = torch.cat([labels, temp], 0)

        labels_onehot = torch.zeros(num_images * num_classes,
                                    num_classes).to(device)
        labels_onehot.scatter_(1, labels.to(torch.long), 1)
        return labels_onehot

    def sample_images(num_images, batches_done, isLast):
        """Save a fresh-noise grid and a fixed-noise grid; on the last call also save neighbours."""
        z = torch.randn((num_classes * num_images, opt.latent_dim)).to(device)

        labels_onehot = get_onehot_labels(num_images)

        z = torch.cat((z, labels_onehot.to(dtype=torch.float)), 1)
        sample_imgs = gen(z)

        # z_const is created once outside this function, so these samples are
        # comparable between calls.
        z_const_cat = torch.cat((z_const, labels_onehot.to(dtype=torch.float)),
                                1)
        const_sample_imgs = gen(z_const_cat)

        vutils.save_image(sample_imgs.data,
                          "{}/{}.png".format(output_sample_images_path,
                                             batches_done),
                          nrow=num_images,
                          padding=2,
                          normalize=True)
        vutils.save_image(const_sample_imgs.data,
                          "{}/{}.png".format(output_const_images_path,
                                             batches_done),
                          nrow=num_images,
                          padding=2,
                          normalize=True)

        if isLast:
            print(
                "Estimating nearest neighbors for the last samples, this takes a few minutes..."
            )
            nearest_neighbour_imgs_list = get_nearest_neighbour(
                sample_imgs, num_images)
            for label, nn_imgs in enumerate(nearest_neighbour_imgs_list):
                vutils.save_image(nn_imgs.data,
                                  "{}/{}_{}.png".format(
                                      output_nn_images_path, batches_done,
                                      label),
                                  nrow=num_images,
                                  padding=2,
                                  normalize=True)

            nearest_neighbour_imgs_list = get_nearest_neighbour(
                const_sample_imgs, num_images)
            for label, nn_imgs in enumerate(nearest_neighbour_imgs_list):
                vutils.save_image(nn_imgs.data,
                                  "{}/const_{}_{}.png".format(
                                      output_nn_images_path, batches_done,
                                      label),
                                  nrow=num_images,
                                  padding=2,
                                  normalize=True)
            print("Saved nearest neighbors.")

    def save_loss_plot(path):
        """Plot the recorded generator and discriminator losses to ``path``."""
        plt.figure(figsize=(10, 5))
        plt.title("Generator and Discriminator Loss During Training")
        plt.plot(G_losses, label="G")
        plt.plot(D_losses, label="D")
        plt.xlabel("iterations")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(path)
        plt.close()

    def save_acc_plot(path):
        """Plot the recorded discriminator accuracy to ``path``."""
        plt.figure(figsize=(10, 5))
        plt.title("Discriminator Accuracy")
        plt.plot(D_acc)
        plt.xlabel("iterations")
        plt.ylabel("accuracy")
        plt.savefig(path)
        plt.close()

    def save_fid_plot(FIDs, epochs, path):
        """Plot FID against epoch to ``path``."""
        plt.figure(figsize=(10, 5))
        plt.title("FID on Validation Set")
        plt.plot(epochs, FIDs)
        plt.xlabel("epochs")
        plt.ylabel("FID")
        plt.savefig(path)
        plt.close()

    def expectation_loss(real_feature, fake_feature):
        """Return the feature-difference norm divided by its own absolute value.

        NOTE(review): not called anywhere in this function; kept as-is.
        """
        norm = torch.norm(real_feature - fake_feature)
        total = torch.abs(norm).sum()
        return norm / total

    def checkpoint_state(epoch, g_loss, d_loss, d_acc, fid):
        """Build the dictionary stored in a checkpoint file."""
        return {
            'epoch': epoch,
            'g_state_dict': gen.state_dict(),
            'd_state_dict': disc.state_dict(),
            'g_optimizer_state_dict': optimG.state_dict(),
            'd_optimizer_state_dict': optimD.state_dict(),
            'g_loss': g_loss.item(),
            'd_loss': d_loss.item(),
            'd_accuracy': d_acc,
            'val_fid': fid
        }

    print("Label to class mapping:")
    print_labels()

    for epoch in range(1, opt.num_epochs + 1):
        for i, (images, class_labels) in enumerate(dataloader, 0):
            images = images.to(device)
            class_labels = class_labels.to(device)

            batch_size = images.size(0)

            # Smoothed real targets drawn between real_label_low and real_label_high.
            real_label_smooth = (
                real_label_low - real_label_high) * torch.rand(
                    (batch_size, ), device=device) + real_label_high
            # dtype is explicit: with an integer fill value torch.full may
            # produce an integer tensor, which BCELoss rejects as a target.
            real_label = torch.full((batch_size, ),
                                    real_label_val,
                                    dtype=torch.float,
                                    device=device)
            fake_label = torch.full((batch_size, ),
                                    fake_label_val,
                                    dtype=torch.float,
                                    device=device)

            ############################
            # Train Discriminator
            ###########################

            ## Train with all-real batch
            optimD.zero_grad()

            real_pred, real_aux = disc(images)

            # With probability label_noise_prob a real sample gets the fake target.
            mask = torch.rand(
                (batch_size, ), device=device) <= label_noise_prob
            mask = mask.type(torch.float)
            noisy_label = torch.mul(1 - mask, real_label_smooth) + torch.mul(
                mask, fake_label)

            d_real_loss = (adversarial_loss(real_pred, noisy_label) +
                           auxiliary_loss(real_aux, class_labels)) / 2

            # Train with fake batch
            noise = torch.randn((batch_size, opt.latent_dim)).to(device)
            gen_class_labels = torch.randint(0, num_classes,
                                             (batch_size, )).to(device)
            gen_class_labels_onehot = F.one_hot(gen_class_labels, num_classes)

            noise = torch.cat(
                (noise, gen_class_labels_onehot.to(dtype=torch.float)), 1)
            gen_images = gen(noise)

            # detach() keeps the discriminator update from back-propagating into G.
            fake_pred, fake_aux = disc(gen_images.detach())

            mask = torch.rand(
                (batch_size, ), device=device) <= label_noise_prob
            mask = mask.type(torch.float)
            noisy_label = torch.mul(1 - mask, fake_label) + torch.mul(
                mask, real_label_smooth)

            c_fake = c_fake_label * torch.ones_like(gen_class_labels).to(
                device)

            d_fake_loss = (adversarial_loss(fake_pred, noisy_label) +
                           auxiliary_loss(fake_aux, c_fake)) / 2

            # Total discriminator loss
            d_loss = (d_real_loss + d_fake_loss) / 2

            # Calculate discriminator accuracy
            pred = np.concatenate(
                [real_aux.data.cpu().numpy(),
                 fake_aux.data.cpu().numpy()],
                axis=0)
            gt = np.concatenate([
                class_labels.data.cpu().numpy(),
                gen_class_labels.data.cpu().numpy()
            ],
                                axis=0)
            d_acc = np.mean(np.argmax(pred, axis=1) == gt)

            d_loss.backward()
            optimD.step()

            ############################
            # Train Generator
            ###########################

            optimG.zero_grad()

            validity, aux_scores = disc(gen_images)

            g_loss = 0.5 * (adversarial_loss(validity, real_label) +
                            auxiliary_loss(aux_scores, gen_class_labels))

            g_loss.backward()
            optimG.step()

            # Save losses and accuracy for plotting
            G_losses.append(g_loss.item())
            D_losses.append(d_loss.item())
            D_acc.append(d_acc)

            # Output training stats
            if i % opt.print_every == 0:
                print(
                    "[Epoch %d/%d] [Batch %d/%d] [D loss: %.4f, acc: %d%%] [G loss: %.4f]"
                    % (epoch, opt.num_epochs, i, len(dataloader),
                       d_loss.item(), 100 * d_acc, g_loss.item()))

            batches_done = epoch * len(dataloader) + i

            # Generate and save sample images.
            # Epochs are numbered 1..opt.num_epochs, so the last batch of
            # training is in epoch == opt.num_epochs.
            isLast = ((epoch == opt.num_epochs)
                      and (i == len(dataloader) - 1))
            if (batches_done % opt.sample_interval == 0) or isLast:
                # Put G in eval mode
                gen.eval()

                with torch.no_grad():
                    sample_images(opt.num_sample_images, batches_done, isLast)
                    vutils.save_image(gen_images.data[:36],
                                      "{}/{}.png".format(
                                          output_train_images_path,
                                          batches_done),
                                      nrow=6,
                                      padding=2,
                                      normalize=True)

                # Put G back in train mode
                gen.train()

        # Save model checkpoint (the final epoch is handled after the loop)
        if (epoch != opt.num_epochs and epoch % opt.checkpoint_epochs == 0):
            print("Checkpoint at epoch {}".format(epoch))

            print("Saving G & D loss plot...")
            save_loss_plot(
                os.path.join(opt.output_path, opt.version,
                             "loss_plot_{}.png".format(epoch)))
            print("Saving D accuracy plot...")
            save_acc_plot(
                os.path.join(opt.output_path, opt.version,
                             "accuracy_plot_{}.png".format(epoch)))

            print("Validating model...")
            with torch.no_grad():
                fid = validate(keep_images=False)
            print("Validation FID: {}".format(fid))
            with open(os.path.join(opt.output_path, opt.version, "FIDs.txt"),
                      "a") as f:
                f.write("Epoch: {}, FID: {}\n".format(epoch, fid))
            FIDs.append(fid)
            val_epochs.append(epoch)
            print("Saving FID plot...")
            save_fid_plot(
                FIDs, val_epochs,
                os.path.join(opt.output_path, opt.version,
                             "fid_plot_{}.png".format(epoch)))

            print("Saving model checkpoint...")
            torch.save(
                checkpoint_state(epoch, g_loss, d_loss, d_acc, fid),
                os.path.join(output_model_path,
                             "model_checkpoint_{}.tar".format(epoch)))

    print("Saving final G & D loss plot...")
    save_loss_plot(os.path.join(opt.output_path, opt.version, "loss_plot.png"))
    print("Done!")

    print("Saving final D accuracy plot...")
    save_acc_plot(
        os.path.join(opt.output_path, opt.version, "accuracy_plot.png"))
    print("Done!")

    print("Validating final model...")
    gen.eval()
    with torch.no_grad():
        fid = validate()
    print("Final Validation FID: {}".format(fid))
    with open(os.path.join(opt.output_path, opt.version, "FIDs.txt"),
              "a") as f:
        f.write("Epoch: {}, FID: {}\n".format(epoch, fid))
    FIDs.append(fid)
    val_epochs.append(epoch)
    print("Saving final FID plot...")
    save_fid_plot(FIDs, val_epochs,
                  os.path.join(opt.output_path, opt.version, "fid_plot"))
    print("Done!")

    print("Saving final model...")
    torch.save(checkpoint_state(epoch, g_loss, d_loss, d_acc, fid),
               os.path.join(output_model_path, "model.tar"))
    print("Done!")
def train_QaNet(args):
    """Train a QaNet model with log-warm-up learning rate and early stopping.

    After every epoch the EMA weights are swapped in, the model is scored on
    the dev loader, the whole model object is saved to
    ``<args.save_dir>/model.pt`` and the training weights are restored.
    Training stops once both F1 and EM have been below their best values for
    more than ``args.early_stop`` consecutive epochs.

    Args:
        args: parsed options; this function reads the embedding/record/eval
            file paths, batch and worker sizes, model dimensions, ``lr``,
            ``lr_warm_up_num``, ``ema_decay``, ``num_epochs``, ``early_stop``
            and ``save_dir``, and sets ``args.gpu_ids``.
    """
    # Only the GPU id list is kept from this call; the device is chosen below.
    _, args.gpu_ids = util.get_available_devices()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    word_mat = util.torch_from_json(args.word_emb_file)
    char_mat = util.torch_from_json(args.char_emb_file)

    with open(args.dev_eval_file, 'r') as fh:
        dev_eval_file = json_load(fh)

    print("Building model...")

    train_loader = data.DataLoader(SQuAD(args.train_record_file,
                                         args.use_squad_v2),
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_loader = data.DataLoader(SQuAD(args.dev_record_file,
                                       args.use_squad_v2),
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    lr = args.lr
    # The optimizer's base LR is 1 so the LambdaLR factor below is the
    # effective learning rate.
    base_lr = 1
    lr_warm_up_num = args.lr_warm_up_num

    model = QaNet(word_mat, char_mat, args.connector_dim, args.glove_dim,
                  args.char_dim, args.drop_prob, args.dropout_char,
                  args.num_heads).to(device)
    ema = util.EMA(model, args.ema_decay)

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(0.9, 0.999),
                           eps=1e-7,
                           weight_decay=5e-8,
                           params=parameters)
    # Logarithmic warm-up that reaches `lr` at scheduler step lr_warm_up_num.
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < lr_warm_up_num else lr)

    best_f1 = 0
    best_em = 0
    patience = 0
    for epoch in range(args.num_epochs):
        train(model, optimizer, scheduler, train_loader, dev_loader,
              dev_eval_file, epoch, ema, device)

        # Evaluate and save with the EMA weights.
        ema.assign(model)
        metrics = test(model, dev_loader, dev_eval_file,
                       (epoch + 1) * len(train_loader))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > args.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)

        fn = os.path.join(args.save_dir, "model.pt")
        torch.save(model, fn)
        # Restore the raw training weights before the next epoch.
        ema.resume(model)
def main(args):
    """Score a saved BiDAF checkpoint on the test records and log the results.

    Loads word vectors, restores the checkpoint into a ``DataParallel``
    BiDAF, runs it over ``args.test_record_file`` without gradients, then
    logs NLL / F1 / EM / AvNA and writes visualisations to TensorBoard.

    NOTE(review): ``c`` is not defined inside this function. It must be a
    module-level object exposing ``load_path``, ``max_ans_len`` and
    ``save_dir``; confirm it is meant to differ from ``args``.
    """
    run_name = "exp1_training"
    args.save_dir = util.get_save_dir(args.save_dir, run_name, training=False)
    log = get_logger(args.logging_dir, run_name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    device, gpu_ids = util.get_available_devices()
    # Scale the batch by the number of GPUs (at least one).
    args.batch_size *= max(1, len(gpu_ids))

    # Embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Model
    log.info('Building model...')
    model = nn.DataParallel(
        BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size),
        gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, c.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Data
    log.info('Building dataset...')
    dataset = SQuAD(args.test_record_file, True)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluation
    log.info(f'Evaluating on {args.datasplit} split...')
    loss_meter = util.AverageMeter()
    pred_dict = {}  # index -> prediction, used for scoring and TensorBoard
    sub_dict = {}  # uuid -> prediction
    with open(args.test_eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        # The character-index tensors in positions 2 and 4 are not used.
        for cw_idxs, _, qw_idxs, _, y1, y2, ids in data_loader:
            context_words = cw_idxs.to(device)
            question_words = qw_idxs.to(device)
            n_examples = context_words.size(0)

            # Forward pass and NLL against the gold start/end positions
            log_p1, log_p2 = model(context_words, question_words)
            gold_start, gold_end = y1.to(device), y2.to(device)
            batch_nll = (F.nll_loss(log_p1, gold_start) +
                         F.nll_loss(log_p2, gold_end))
            loss_meter.update(batch_nll.item(), n_examples)

            # Turn log-probabilities into discrete start/end predictions
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           c.max_ans_len, True)

            progress_bar.update(n_examples)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(), True)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Collect the metrics in a fixed display order
    scores = util.eval_dicts(gold_dict, pred_dict, True)
    results = OrderedDict([
        ('NLL', loss_meter.avg),
        ('F1', scores['F1']),
        ('EM', scores['EM']),
        ('AvNA', scores['AvNA']),
    ])

    # Console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    log.info(f'{args.datasplit} {results_str}')

    # TensorBoard
    tbx = SummaryWriter(c.save_dir)
    util.visualize(tbx,
                   pred_dict=pred_dict,
                   eval_path=args.test_eval_file,
                   step=0,
                   split=args.datasplit,
                   num_visuals=args.num_visuals)
def create_training_function(args, experiment_save_dir, k_fold_spits=None):
    """Build the callable that trains and scores one grid-search experiment.

    Embeddings, the training/eval datasets and their gold dictionaries are
    loaded once here and shared by the returned closure.

    Args:
        args: parsed options; ``args.gpu_ids`` is set and ``args.batch_size``
            is multiplied by the number of GPUs (at least one).
        experiment_save_dir: root directory for the TensorBoard logs of each
            experiment.
        k_fold_spits: NOTE(review): this argument is overwritten with
            ``args.k_fold`` before it is ever read, so the passed value has
            no effect. Confirm whether callers expect it to be honoured.

    Returns:
        ``kfold_training_function`` when ``args.k_fold`` is not ``None``,
        otherwise ``training_function``. Both take ``(experiment, config)``
        and return a dict merging ``experiment`` with the metrics.
    """
    device, args.gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(args.gpu_ids))

    word_vectors, char_vectors = train.load_embeddings(args)

    training_dataset = util.SQuAD(
        util.preprocessed_path(args.train_record_file, args.data_dir,
                               args.dataset), args.use_squad_v2)
    eval_dataset = util.SQuAD(
        util.preprocessed_path(args.dev_record_file, args.data_dir,
                               args.dataset), args.use_squad_v2)

    train_gold_dict = util.load_eval_file(args, args.train_eval_file)
    eval_gold_dict = util.load_eval_file(args, args.dev_eval_file)

    k_fold_spits = args.k_fold
    min_nll_decrease = args.min_nll_decrease

    def process_sample(sample, model, gold_dict=None):
        """Run one batch through ``model``.

        Returns ``(loss, batch_size, preds)``. ``preds`` is ``None`` unless a
        non-empty ``gold_dict`` is given, in which case it is the
        index-to-prediction mapping from ``util.convert_tokens``.
        """
        cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = sample
        batch_size = cw_idxs.size(0)

        log_p1, log_p2 = model(cw_idxs.to(device), cc_idxs.to(device),
                               qw_idxs.to(device), qc_idxs.to(device))

        y1, y2 = y1.to(device), y2.to(device)
        nll_loss_1 = F.nll_loss(log_p1, y1)
        nll_loss_2 = F.nll_loss(log_p2, y2)
        loss = nll_loss_1 + nll_loss_2

        preds = None
        if gold_dict:
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           args.use_squad_v2)

        return loss, batch_size, preds

    def run_experiment(tbx, train_loader, train_size, eval_loader, eval_size,
                       gold_dict, config):
        """Train for up to ``args.num_epochs`` epochs, evaluating after each.

        Stops early when the epoch's mean training NLL has not dropped by at
        least ``min_nll_decrease`` relative to the previous epoch.

        Returns ``(model, step)`` where ``step`` is the running sample count.
        """
        from models import init_training
        max_grad_norm = args.max_grad_norm

        model, optimizer, scheduler, ema, step = init_training(
            args, word_vectors, char_vectors, device, config)

        prev_epoch_avg_nll = None
        # NOTE(review): `step` is advanced by batch_size below yet is also
        # used here as the first epoch index; verify what init_training
        # returns when resuming from a checkpoint.
        for epoch in range(step, args.num_epochs):
            model.train()
            epoch_avg_nll = util.AverageMeter()
            with torch.enable_grad(), tqdm(total=train_size) as progress_bar:
                for sample in train_loader:
                    # Clear gradients left from the previous batch; without
                    # this every backward() adds to the old gradients.
                    optimizer.zero_grad()

                    loss, batch_size, _ = process_sample(sample, model, None)
                    nll = loss.item()
                    epoch_avg_nll.update(nll)

                    tbx.add_scalar('train/NLL', loss.item(), step)
                    current_lr = optimizer.param_groups[0]['lr']
                    tbx.add_scalar('train/LR', current_lr, step)

                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    ema(model, step // batch_size)

                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             STEP=util.millify(step),
                                             LR=current_lr,
                                             NLL=nll)
                    step += batch_size

            # Evaluate with the EMA weights, then restore the training weights.
            model.eval()
            ema.assign(model)
            results, pred_dict = evaluate(model, eval_loader, eval_size,
                                          gold_dict)
            ema.resume(model)

            tbx.add_scalar('eval/NLL', results['NLL'], step)
            if 'AvNA' in results:
                tbx.add_scalar('eval/AvNA', results['AvNA'], step)
            tbx.add_scalar('eval/F1', results['F1'], step)
            tbx.add_scalar('eval/EM', results['EM'], step)

            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_dict=gold_dict,
                           step=step,
                           split='eval',
                           num_visuals=args.num_visuals)

            if ((min_nll_decrease is not None)
                    and (prev_epoch_avg_nll is not None)
                    and (epoch_avg_nll.avg >
                         prev_epoch_avg_nll - min_nll_decrease)):
                print(
                    f"Avg NLL {epoch_avg_nll.avg:.2f} > {prev_epoch_avg_nll:.2f} - {(min_nll_decrease):.2f}. Break"
                )
                break
            prev_epoch_avg_nll = epoch_avg_nll.avg

        return model, step

    def evaluate(model, eval_loader, eval_size, gold_dict):
        """Score ``model`` on ``eval_loader`` without gradients.

        Returns ``(results, pred_dict)``; ``results`` is the output of
        ``util.eval_dicts`` plus an ``'NLL'`` entry with the mean loss.
        """
        pred_dict = {}
        with torch.no_grad(), tqdm(total=eval_size) as progress_bar:
            nll_meter = util.AverageMeter()
            for sample in eval_loader:
                loss, batch_size, preds = process_sample(
                    sample, model, gold_dict)
                nll_meter.update(loss.item(), batch_size)
                pred_dict.update(preds)
                progress_bar.update(batch_size)
                progress_bar.set_postfix(NLL=nll_meter.avg)

        results = {
            **util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2),
            **{
                'NLL': nll_meter.avg
            }
        }
        return results, pred_dict

    def kfold_training_function(experiment, config):
        """Train once per fold of the training set and return the averaged metrics."""
        avg_meter = util.MultiAverageMeter(['F1', 'EM', 'AvNA', 'NLL'])
        # Folds are cut from the training set, so its gold dict is used.
        gold_dict = train_gold_dict

        for fold_index, train_loader, train_size, test_loader, test_size in kfold_generator(
                args, k_fold_spits, training_dataset):
            save_dir = os.path.join(experiment_save_dir,
                                    *GridSearch.experiment_path(experiment),
                                    f"fold={fold_index + 1}")
            tbx = SummaryWriter(save_dir)
            model, steps = run_experiment(tbx, train_loader, train_size,
                                          test_loader, test_size, gold_dict,
                                          config)
            results, _ = evaluate(model, test_loader, test_size, gold_dict)
            avg_meter.update(results, steps)

        return {**experiment, **avg_meter.avg}

    def training_function(experiment, config):
        """Train on the full training set and return the dev-set metrics."""
        import torch.utils.data as data

        train_loader = data.DataLoader(training_dataset,
                                       shuffle=True,
                                       batch_size=args.batch_size,
                                       num_workers=args.num_workers,
                                       collate_fn=None)
        eval_loader = data.DataLoader(eval_dataset,
                                      shuffle=False,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      collate_fn=None)

        save_dir = os.path.join(experiment_save_dir,
                                *GridSearch.experiment_path(experiment))
        tbx = SummaryWriter(save_dir)

        train_size = len(training_dataset)
        eval_size = len(eval_dataset)
        model, steps = run_experiment(tbx, train_loader, train_size,
                                      eval_loader, eval_size, eval_gold_dict,
                                      config)
        results, _ = evaluate(model, eval_loader, eval_size, eval_gold_dict)
        return {**experiment, **results}

    return kfold_training_function if k_fold_spits is not None else training_function
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') nbr_model = 0 if (args.load_path_baseline): model_baseline = Baseline(word_vectors=word_vectors, hidden_size=100) model_baseline = nn.DataParallel(model_baseline, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_baseline}...') model_baseline = util.load_model(model_baseline, args.load_path_baseline, gpu_ids, return_step=False) model_baseline = model_baseline.to(device) model_baseline.eval() nll_meter_baseline = util.AverageMeter() nbr_model += 1 save_prob_baseline_start = [] save_prob_baseline_end = [] if (args.load_path_bidaf): model_bidaf = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf = nn.DataParallel(model_bidaf, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf}...') model_bidaf = util.load_model(model_bidaf, args.load_path_bidaf, gpu_ids, return_step=False) model_bidaf = model_bidaf.to(device) model_bidaf.eval() nll_meter_bidaf = util.AverageMeter() nbr_model += 1 save_prob_bidaf_start = [] save_prob_bidaf_end = [] if (args.load_path_bidaf_fusion): model_bidaf_fu = BiDAF_fus(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf_fu = nn.DataParallel(model_bidaf_fu, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf_fusion}...') model_bidaf_fu = util.load_model(model_bidaf_fu, args.load_path_bidaf_fusion, gpu_ids, return_step=False) model_bidaf_fu = 
model_bidaf_fu.to(device) model_bidaf_fu.eval() nll_meter_bidaf_fu = util.AverageMeter() nbr_model += 1 save_prob_bidaf_fu_start = [] save_prob_bidaf_fu_end = [] if (args.load_path_qanet): model_qanet = QANet(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet = nn.DataParallel(model_qanet, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet}...') model_qanet = util.load_model(model_qanet, args.load_path_qanet, gpu_ids, return_step=False) model_qanet = model_qanet.to(device) model_qanet.eval() nll_meter_qanet = util.AverageMeter() nbr_model += 1 save_prob_qanet_start = [] save_prob_qanet_end = [] if (args.load_path_qanet_old): model_qanet_old = QANet_old(word_vectors=word_vectors, char_vectors=char_vectors, device=device, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks) model_qanet_old = nn.DataParallel(model_qanet_old, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_old}...') model_qanet_old = util.load_model(model_qanet_old, args.load_path_qanet_old, gpu_ids, return_step=False) model_qanet_old = model_qanet_old.to(device) model_qanet_old.eval() nll_meter_qanet_old = util.AverageMeter() nbr_model += 1 save_prob_qanet_old_start = [] save_prob_qanet_old_end = [] if (args.load_path_qanet_inde): model_qanet_inde = QANet_independant_encoder( word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, 
n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_inde = nn.DataParallel(model_qanet_inde, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_inde}...') model_qanet_inde = util.load_model(model_qanet_inde, args.load_path_qanet_inde, gpu_ids, return_step=False) model_qanet_inde = model_qanet_inde.to(device) model_qanet_inde.eval() nll_meter_qanet_inde = util.AverageMeter() nbr_model += 1 save_prob_qanet_inde_start = [] save_prob_qanet_inde_end = [] if (args.load_path_qanet_s_e): model_qanet_s_e = QANet_S_E(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_s_e = nn.DataParallel(model_qanet_s_e, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_s_e}...') model_qanet_s_e = util.load_model(model_qanet_s_e, args.load_path_qanet_s_e, gpu_ids, return_step=False) model_qanet_s_e = model_qanet_s_e.to(device) model_qanet_s_e.eval() nll_meter_qanet_s_e = util.AverageMeter() nbr_model += 1 save_prob_qanet_s_e_start = [] save_prob_qanet_s_e_end = [] # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = 
cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) y1, y2 = y1.to(device), y2.to(device) l_p1, l_p2 = [], [] # Forward if (args.load_path_baseline): log_p1_baseline, log_p2_baseline = model_baseline( cw_idxs, cc_idxs) loss_baseline = F.nll_loss(log_p1_baseline, y1) + F.nll_loss( log_p2_baseline, y2) nll_meter_baseline.update(loss_baseline.item(), batch_size) l_p1 += [log_p1_baseline.exp()] l_p2 += [log_p2_baseline.exp()] if (args.save_probabilities): save_prob_baseline_start += [ log_p1_baseline.exp().detach().cpu().numpy() ] save_prob_baseline_end += [ log_p2_baseline.exp().detach().cpu().numpy() ] if (args.load_path_qanet): log_p1_qanet, log_p2_qanet = model_qanet( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet = F.nll_loss(log_p1_qanet, y1) + F.nll_loss( log_p2_qanet, y2) nll_meter_qanet.update(loss_qanet.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet.exp()] l_p2 += [log_p2_qanet.exp()] if (args.save_probabilities): save_prob_qanet_start += [ log_p1_qanet.exp().detach().cpu().numpy() ] save_prob_qanet_end += [ log_p2_qanet.exp().detach().cpu().numpy() ] if (args.load_path_qanet_old): log_p1_qanet_old, log_p2_qanet_old = model_qanet_old( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_old = F.nll_loss(log_p1_qanet_old, y1) + F.nll_loss( log_p2_qanet_old, y2) nll_meter_qanet_old.update(loss_qanet_old.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_old.exp()] l_p2 += [log_p2_qanet_old.exp()] if (args.save_probabilities): save_prob_qanet_old_start += [ log_p1_qanet_old.exp().detach().cpu().numpy() ] save_prob_qanet_old_end += [ log_p2_qanet_old.exp().detach().cpu().numpy() ] if (args.load_path_qanet_inde): log_p1_qanet_inde, log_p2_qanet_inde = model_qanet_inde( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_inde = F.nll_loss( log_p1_qanet_inde, y1) + F.nll_loss(log_p2_qanet_inde, y2) nll_meter_qanet_inde.update(loss_qanet_inde.item(), batch_size) # 
Get F1 and EM scores l_p1 += [log_p1_qanet_inde.exp()] l_p2 += [log_p2_qanet_inde.exp()] if (args.save_probabilities): save_prob_qanet_inde_start += [ log_p1_qanet_inde.exp().detach().cpu().numpy() ] save_prob_qanet_inde_end += [ log_p2_qanet_inde.exp().detach().cpu().numpy() ] if (args.load_path_qanet_s_e): log_p1_qanet_s_e, log_p2_qanet_s_e = model_qanet_s_e( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_s_e = F.nll_loss(log_p1_qanet_s_e, y1) + F.nll_loss( log_p2_qanet_s_e, y2) nll_meter_qanet_s_e.update(loss_qanet_s_e.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_s_e.exp()] l_p2 += [log_p2_qanet_s_e.exp()] if (args.save_probabilities): save_prob_qanet_s_e_start += [ log_p1_qanet_s_e.exp().detach().cpu().numpy() ] save_prob_qanet_s_e_end += [ log_p2_qanet_s_e.exp().detach().cpu().numpy() ] if (args.load_path_bidaf): log_p1_bidaf, log_p2_bidaf = model_bidaf( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf = F.nll_loss(log_p1_bidaf, y1) + F.nll_loss( log_p2_bidaf, y2) nll_meter_bidaf.update(loss_bidaf.item(), batch_size) l_p1 += [log_p1_bidaf.exp()] l_p2 += [log_p2_bidaf.exp()] if (args.save_probabilities): save_prob_bidaf_start += [ log_p1_bidaf.exp().detach().cpu().numpy() ] save_prob_bidaf_end += [ log_p2_bidaf.exp().detach().cpu().numpy() ] if (args.load_path_bidaf_fusion): log_p1_bidaf_fu, log_p2_bidaf_fu = model_bidaf_fu( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf_fu = F.nll_loss(log_p1_bidaf_fu, y1) + F.nll_loss( log_p2_bidaf_fu, y2) nll_meter_bidaf_fu.update(loss_bidaf_fu.item(), batch_size) l_p1 += [log_p1_bidaf_fu.exp()] l_p2 += [log_p2_bidaf_fu.exp()] if (args.save_probabilities): save_prob_bidaf_fu_start += [ log_p1_bidaf_fu.exp().detach().cpu().numpy() ] save_prob_bidaf_fu_end += [ log_p2_bidaf_fu.exp().detach().cpu().numpy() ] p1, p2 = l_p1[0], l_p2[0] for i in range(1, nbr_model): p1 += l_p1[i] p2 += l_p2[i] p1 /= nbr_model p2 /= nbr_model starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info 
progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid if (args.load_path_qanet): progress_bar.set_postfix(NLL=nll_meter_qanet.avg) elif (args.load_path_bidaf): progress_bar.set_postfix(NLL=nll_meter_bidaf.avg) elif (args.load_path_bidaf_fusion): progress_bar.set_postfix(NLL=nll_meter_bidaf_fu.avg) elif (args.load_path_qanet_old): progress_bar.set_postfix(NLL=nll_meter_qanet_old.avg) elif (args.load_path_qanet_inde): progress_bar.set_postfix(NLL=nll_meter_qanet_inde.avg) elif (args.load_path_qanet_s_e): progress_bar.set_postfix(NLL=nll_meter_qanet_s_e.avg) else: progress_bar.set_postfix(NLL=nll_meter_baseline.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) if (args.save_probabilities): if (args.load_path_baseline): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_end, fp) if (args.load_path_bidaf): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_end, fp) if (args.load_path_bidaf_fusion): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_end, fp) if (args.load_path_qanet): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_end, fp) if (args.load_path_qanet_old): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_start, fp) with open(args.save_dir + 
"/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_end, fp) if (args.load_path_qanet_inde): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_end, fp) if (args.load_path_qanet_s_e): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_end, fp) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) if (args.load_path_qanet): meter_avg = nll_meter_qanet.avg elif (args.load_path_bidaf): meter_avg = nll_meter_bidaf.avg elif (args.load_path_bidaf_fusion): meter_avg = nll_meter_bidaf_fu.avg elif (args.load_path_qanet_inde): meter_avg = nll_meter_qanet_inde.avg elif (args.load_path_qanet_s_e): meter_avg = nll_meter_qanet_s_e.avg elif (args.load_path_qanet_old): meter_avg = nll_meter_qanet_old.avg else: meter_avg = nll_meter_baseline.avg results_list = [('NLL', meter_avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): 
csv_writer.writerow([uuid, sub_dict[uuid]])
def main(args):
    """Train a QANet model and periodically evaluate it on the dev set.

    The function prepares the save directory, logger, TensorBoard writer and
    devices, seeds the random number generators, builds the model (restoring
    a checkpoint when ``args.load_path`` is set) and then trains until the
    epoch counter equals ``args.num_epochs``. Each time ``args.eval_steps``
    examples have been consumed, the EMA weights are swapped in, the model
    is evaluated, a checkpoint is passed to the saver and the original
    weights are restored.

    Args:
        args: Namespace of run settings. It is mutated in place:
            ``save_dir`` is replaced by the value returned from
            ``util.get_save_dir``, ``gpu_ids`` is set from
            ``util.get_available_devices`` and ``batch_size`` is multiplied
            by ``max(1, len(args.gpu_ids))``.
    """
    # --- Logging and devices -------------------------------------------
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # --- Random seed ---------------------------------------------------
    log.info(f'Using random seed {args.seed}...')
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(args.seed)

    # --- Embeddings ----------------------------------------------------
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # --- Model ---------------------------------------------------------
    log.info('Building Model...')
    model = QANet(word_vectors,
                  char_vectors,
                  args.para_limit,
                  args.ques_limit,
                  args.f_model,
                  num_head=args.num_head,
                  train_cemb=(not args.pretrained_char))
    model = nn.DataParallel(model, args.gpu_ids)
    step = 0
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # --- Checkpoint saver ----------------------------------------------
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # --- Optimizer and scheduler ---------------------------------------
    # Only parameters that require gradients are handed to Adam.
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = optim.Adam(params=trainable_params,
                           lr=args.lr,
                           betas=(args.beta1, args.beta2),
                           eps=1e-8,
                           weight_decay=3e-7)
    warmup_scale = 1.0 / math.log(args.lr_warm_up_num)

    def lr_factor(num_updates):
        # Logarithmic warm-up until lr_warm_up_num, then a constant factor.
        if num_updates < args.lr_warm_up_num:
            return warmup_scale * math.log(num_updates + 1)
        return 1

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_factor)
    criterion = torch.nn.CrossEntropyLoss()

    # --- Data loaders --------------------------------------------------
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # --- Training loop -------------------------------------------------
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Move the inputs to the compute device
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward pass
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                start_loss = criterion(log_p1, y1)
                end_loss = criterion(log_p2, y2)
                loss = torch.mean(start_loss + end_loss)
                loss_val = loss.item()

                # Backward pass and parameter update
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                num_updates = step // batch_size
                scheduler.step(num_updates)
                ema(model, num_updates)

                # Progress reporting
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval > 0:
                    continue
                steps_till_eval = args.eval_steps

                # Evaluate with the EMA weights and save a checkpoint
                log.info(f'Evaluating at step {step}...')
                ema.assign(model)
                results, pred_dict = evaluate(model, dev_loader, device,
                                              args.dev_eval_file,
                                              args.max_ans_len,
                                              args.use_squad_v2)
                saver.save(step, model, results[args.metric_name], device)
                ema.resume(model)

                # Console summary
                results_str = ', '.join(f'{k}: {v:05.2f}'
                                        for k, v in results.items())
                log.info(f'Dev {results_str}')

                # TensorBoard summary
                log.info('Visualizing in TensorBoard...')
                for metric, value in results.items():
                    tbx.add_scalar(f'dev/{metric}', value, step)
                util.visualize(tbx,
                               pred_dict=pred_dict,
                               eval_path=args.dev_eval_file,
                               step=step,
                               split='dev',
                               num_visuals=args.num_visuals)
def train(args):
    """Train a summarization model and checkpoint it on periodic evaluation.

    Depending on ``args.task`` the model is ``SummarizerLinear`` (``'tag'``)
    or ``SummarizerAbstractive`` (any other value). The processed data is
    unpickled from ``PROCESSED_DATA_SUPER_TINY`` or ``PROCESSED_DATA``; when
    ``'tiny'`` occurs in ``args.split`` the ``'tiny'`` split is used for both
    training and evaluation.

    Args:
        args: Namespace of run settings. ``save_dir`` and ``gpu_ids`` are
            overwritten in place. ``gpu_ids == 'cpu'`` forces CPU execution.
    """
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)

    if args.gpu_ids == 'cpu':
        device, args.gpu_ids = torch.device('cpu'), []
    else:
        device, args.gpu_ids = util.get_available_devices()
    log.info('training on device {} with gpu_id {}'.format(str(device), str(args.gpu_ids)))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log.info('Building model...')
    if args.task == 'tag':
        model = SummarizerLinear()
    else:
        model = SummarizerAbstractive(128, 256, device)
    if len(args.gpu_ids) > 0:
        model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # Get a saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)
    optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.l2_wd)

    log.info('Building dataset...')
    data_path = PROCESSED_DATA_SUPER_TINY if args.split == 'super_tiny' else PROCESSED_DATA
    # NOTE: pickle.load can execute arbitrary code; only point this at data
    # files produced by this project's own preprocessing.
    with open(data_path, 'rb') as f:
        all_data = pickle.load(f)
    if 'tiny' in args.split:
        train_split = all_data['tiny']
        dev_split = all_data['tiny']
    else:
        train_split = all_data['train']
        dev_split = all_data['dev']
    train_dataset = SummarizationDataset(
        train_split['X'], train_split['y'], train_split['gold'])
    dev_dataset = SummarizationDataset(
        dev_split['X'], dev_split['y'], dev_split['gold'])
    collate_fn = tag_collate_fn if args.task == 'tag' else decode_collate_fn
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=collate_fn)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 num_workers=args.num_workers,
                                 shuffle=False,
                                 collate_fn=collate_fn)

    # Train!
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for X, y, _ in train_loader:
                # Actual size of this batch; the last one may be smaller
                # than args.batch_size.
                batch_size = X.size(0)
                X = X.to(device)
                y = y.float().to(device)  # (batch_size, max_len) for tag, (batch_size, 110) for decode
                optimizer.zero_grad()

                if args.task == 'tag':
                    logits = model(X)  # (batch_size, max_len)
                    # 1 for real data, 0 for pad, size of (batch_size, max_len)
                    mask = (X != PAD_VALUE).float()
                    loss = (F.binary_cross_entropy_with_logits(
                        logits, y, reduction='none') * mask).mean()
                else:
                    logits = model(X, y[:, :-1])  # (batch_size, 109, max_len)
                    # NOTE(review): y was cast to float above while
                    # ignore_index=-1 suggests class-index targets -- confirm
                    # the dtype cross_entropy is meant to receive here.
                    loss = sum(
                        F.cross_entropy(logits[i], y[i, 1:],
                                        ignore_index=-1, reduction='mean')
                        for i in range(batch_size)) / batch_size
                loss_val = loss.item()

                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()

                # Log info. Advance by the real batch size so that `step`,
                # the progress bar and `steps_till_eval` all count the same
                # number of examples (the bar's total is the dataset size).
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, Loss=loss_val)
                tbx.add_scalar('train/Loss', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    results, pred_dict = evaluate(args, model, dev_loader, device)
                    if results is None:
                        # Nothing to score: skip saving and logging this round.
                        log.info('Selected predicted no select for all in batch')
                        continue
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor

# Script entry point: read the test arguments, set up logging and devices,
# then obtain the model either from a checkpoint or from its class.
if __name__ == '__main__':
    args = get_test_args()

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, subdir='test')
    log = util.get_logger(args.save_dir, args.name)
    device, gpu_ids = util.get_available_devices()
    # Multiply the batch size by the number of GPU ids (at least 1).
    args.batch_size *= max(1, len(gpu_ids))
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # Load the checkpoint if given as parameter
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model = util.load_model(args.load_path)
    else:
        # Get model: look up the class for args.model and construct it
        # from the argument namespace.
        log.info('Building model...')
        model = util.get_model_class(args.model)(args)

    # Load the reward model
def main(data, flags):
    """Train a BiDAF model on the experiment-3 SQuAD records.

    ``flags[1]`` selects which record and eval files are used:
    ``"toy"``, ``"train"`` or ``"dev"``.

    Args:
        data: Configuration object; attributes such as ``logging_dir``,
            ``batch_size`` and ``random_seed`` are read from it, and
            ``gpu_ids`` / ``batch_size`` are updated in place.
        flags: Sequence whose second element names the data selection.

    Raises:
        ValueError: If ``flags[1]`` is not one of the supported selections.
    """
    # Set up logging and devices
    log_dir = data.logging_dir
    log = util.get_logger(log_dir, "toy")
    tbx = SummaryWriter(data.logging_dir)
    device, data.gpu_ids = util.get_available_devices()
    log.info('Config: {}'.format(dumps(vars(data), indent=4, sort_keys=True)))
    data.batch_size *= max(1, len(data.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(data.random_seed))
    random.seed(data.random_seed)
    np.random.seed(data.random_seed)
    torch.manual_seed(data.random_seed)
    torch.cuda.manual_seed_all(data.random_seed)

    # Pick the files for the requested selection
    if flags[1] == "toy":
        word_emb_file = data.toy_word_emb_file
        training_data = data.toy_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.toy_eval_exp3
    elif flags[1] == "train":
        word_emb_file = data.word_emb_file
        training_data = data.train_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.train_eval_exp3
    elif flags[1] == "dev":
        word_emb_file = data.word_emb_file
        training_data = data.dev_record_file_exp3
        test_data = data.toy_record_file_exp3
        eval_file = data.dev_eval_exp3
    else:
        # Fail here with a clear message instead of an UnboundLocalError
        # when the file variables are first used below.
        raise ValueError(
            'Unknown data selection {!r}; expected "toy", "train" or "dev"'
            .format(flags[1]))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=data.hidden_size,
                  drop_prob=data.drop_prob)
    model = nn.DataParallel(model, data.gpu_ids)
    if data.load_path:
        log.info('Loading checkpoint from {}...'.format(data.load_path))
        model, step = util.load_model(model, data.load_path, data.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, data.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(data.logging_dir,
                                 max_checkpoints=10,
                                 metric_name=data.metric_name,
                                 maximize_metric=data.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), data.learning_rate,
                               weight_decay=data.learning_weight_decay)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader (`data` is the config here, so the torch data module
    # is referenced through the `torchdata` name)
    log.info('Building dataset...')
    train_dataset = SQuAD3(training_data, use_v2=True)
    train_loader = torchdata.DataLoader(train_dataset,
                                        batch_size=data.batch_size,
                                        shuffle=True,
                                        num_workers=data.num_workers,
                                        collate_fn=collate_fn)
    test_dataset = SQuAD3(test_data, use_v2=True)
    test_loader = torchdata.DataLoader(test_dataset,
                                       batch_size=data.batch_size,
                                       shuffle=False,
                                       num_workers=data.num_workers,
                                       collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = data.eval_steps
    # `step` counts training examples, so the starting epoch of a resumed
    # run must be derived from the size of the training set.
    epoch = step // len(train_dataset)
    while epoch != data.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward (the input sizes are logged for every batch)
                log.info("cw_idxs length: {}".format(str(len(cw_idxs))))
                log.info("qw_idxs length: {}".format(str(len(qw_idxs))))
                log.info("cw_idxs size: {}".format(str(sys.getsizeof(cw_idxs))))
                log.info("qw_idxs size: {}".format(str(sys.getsizeof(qw_idxs))))
                log.info("cw_idxs shape: {}".format(str(cw_idxs.shape)))
                log.info("qw_idxs shape: {}".format(str(qw_idxs.shape)))
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), data.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('toy/NLL', loss_val, step)
                tbx.add_scalar('toy/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = data.eval_steps

                    # Evaluate with the EMA weights and save a checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, test_loader, device,
                                                  eval_path=eval_file,
                                                  max_len=sys.maxsize,
                                                  use_squad_v2=True)
                    saver.save(step, model, results[data.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=data.num_visuals)
def main(args):
    """Run a BiDAF checkpoint over ``./data/my_test.npz`` and print predictions.

    The record file and its gold dictionary (``./data/my_test_eval.json``)
    are fixed paths; ``args`` supplies the embedding file, checkpoint path,
    model size and loader settings. After all batches have been processed,
    every entry of the prediction dictionary and of the submission
    dictionary is printed.

    Args:
        args: Namespace of test settings. ``save_dir`` and ``batch_size``
            are updated in place.
    """
    # Set up the save directory and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Embeddings
    print('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Model, restored from the checkpoint and put in eval mode
    print('Building model...')
    model = nn.DataParallel(
        BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size),
        gpu_ids)
    print(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Data loader
    print('Building dataset...')
    dataset = SQuAD("./data/my_test.npz", args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    print(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    with open("./data/my_test_eval.json", 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            print("viewing the dataset")
            print(cw_idxs, cc_idxs, qw_idxs, qc_idxs)

            # Move the word indices to the compute device
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            num_examples = cw_idxs.size(0)

            # Forward pass and loss bookkeeping
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            start_nll = F.nll_loss(log_p1, y1)
            end_nll = F.nll_loss(log_p2, y2)
            loss = start_nll + end_nll
            nll_meter.update(loss.item(), num_examples)

            # Turn the log-probabilities into start/end indices
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           args.max_ans_len,
                                           args.use_squad_v2)

            progress_bar.update(num_examples)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    print("my evaluation ....")
    for key, prediction in pred_dict.items():
        print(key, prediction)
    for key, prediction in sub_dict.items():
        print(key, prediction)
def main(args):
    """Train an ALBERT-based question-answering model.

    The pretrained ALBERT variant is chosen from ``args.large`` /
    ``args.xxlarge`` (which also add a suffix to the train and dev file
    names). ``args.model_name`` selects one of the project's model wrappers;
    any other value is passed to
    ``AlbertForQuestionAnswering.from_pretrained``.

    Args:
        args: Namespace of run settings. ``train_record_file``,
            ``dev_eval_file``, ``save_dir``, ``gpu_ids`` and ``batch_size``
            are updated in place.

    Raises:
        ValueError: If a model that needs character vectors is requested
            while ``args.bidaf`` is not set.
    """
    if args.large:
        args.train_record_file += '_large'
        args.dev_eval_file += '_large'
        model_name = "albert-xlarge-v2"
    else:
        model_name = "albert-base-v2"
    if args.xxlarge:
        args.train_record_file += '_xxlarge'
        args.dev_eval_file += '_xxlarge'
        model_name = "albert-xxlarge-v2"

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    # Character vectors are only loaded when args.bidaf is set, but two of
    # the models below require them; reject that combination up front
    # instead of failing on an unbound local.
    if args.model_name in ('albert_bidaf', 'albert_bidaf2') and not args.bidaf:
        raise ValueError(
            f'Model {args.model_name!r} needs character vectors; '
            'enable the bidaf option')
    if args.bidaf:
        char_vectors = util.torch_from_json(args.char_emb_file)

    if args.model_name == 'albert_highway':
        model = models.albert_highway(model_name)
    elif args.model_name == 'albert_lstm_highway':
        model = models.LSTM_highway(model_name, hidden_size=args.hidden_size)
    elif args.model_name == 'albert_bidaf':
        model = models.BiDAF(char_vectors=char_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
    elif args.model_name == 'albert_bidaf2':
        model = models.BiDAF2(model_name=model_name,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    else:
        model = AlbertForQuestionAnswering.from_pretrained(args.model_name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2, args.bidaf)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)
    with open(args.dev_gold_file) as f:
        gold_dict = json.load(f)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch in train_loader:
                # Every tensor of the batch is moved to the device at once.
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'start_positions': batch[3],
                    'end_positions': batch[4],
                }
                if args.bidaf:
                    inputs['char_ids'] = batch[6]

                # Setup for forward
                batch_size = inputs["input_ids"].size(0)
                optimizer.zero_grad()

                # Forward: the first output is used as the loss. The mean
                # reduces it to a scalar (presumably one value per
                # DataParallel replica -- confirm).
                outputs = model(**inputs)
                loss = outputs[0].mean()
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate with the EMA weights and save a checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(args, model, dev_dataset,
                                                  dev_loader, gold_dict,
                                                  tokenizer, device,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
def test_model(questions,
               context,
               use_squad_v2=True,
               model_path="../save/training-02/best.pth.tar"):
    """Answer ``questions`` about ``context`` with a saved BiDAF model.

    The questions are wrapped into a SQuAD-style ``paragraphs`` structure
    with a placeholder answer, preprocessed with ``process_file`` and
    ``build_features``, and fed through the model restored from
    ``model_path``.

    Args:
        questions: Iterable of question strings; each one's position is
            used as its id.
        context: The passage the questions refer to.
        use_squad_v2: Forwarded to the dataset, ``util.discretize`` and
            ``util.convert_tokens``.
        model_path: Checkpoint to load.

    Returns:
        The prediction dictionary accumulated from the first value returned
        by ``util.convert_tokens`` for each batch.
    """
    device, gpu_ids = util.get_available_devices()
    batch_size = 64 * max(1, len(gpu_ids))

    # Embeddings and model (restored from the checkpoint, eval mode)
    word_vectors = util.torch_from_json('../data/word_emb.json')
    model = nn.DataParallel(BiDAF(word_vectors=word_vectors, hidden_size=100),
                            gpu_ids)
    model = util.load_model(model, model_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Wrap the raw questions in the structure process_file is given; the
    # answer is a placeholder because none is known for user questions.
    processed_questions = [
        {
            "question": question,
            "id": index,
            "answers": [{
                "answer_start": 0,
                "text": "never mind"
            }]
        }
        for index, question in enumerate(questions)
    ]
    source = {"paragraphs": [{"qas": processed_questions, "context": context}]}

    word_counter, char_counter = Counter(), Counter()
    with open("../data/word2idx.json", "r") as word_fh:
        word2idx_dict = json.load(word_fh)
    with open("../data/char2idx.json", "r") as char_fh:
        char2idx_dict = json.load(char_fh)
    my_test_examples, my_test_eval = process_file(source, "my_test",
                                                  word_counter, char_counter)
    npz = build_features(my_test_examples, "my_test",
                         word2idx_dict, char2idx_dict, is_test=True)

    dataset = SQuAD(npz, use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=4,
                                  collate_fn=collate_fn)

    # Evaluate
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    gold_dict = my_test_eval

    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Move the word indices to the compute device
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            num_examples = cw_idxs.size(0)

            # Forward pass and loss bookkeeping
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            start_nll = F.nll_loss(log_p1, y1)
            end_nll = F.nll_loss(log_p2, y2)
            loss = start_nll + end_nll
            nll_meter.update(loss.item(), num_examples)

            # Turn the log-probabilities into start/end indices
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           15, use_squad_v2)
            print("starts ", starts, " ends ", ends)

            progress_bar.update(num_examples)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    return pred_dict
def main(args):
    """Train a BiDAF, DCN or BERT-based model with periodic dev evaluation.

    ``args.model_type`` selects the model class. For model types containing
    ``"bert"``, embeddings are fetched per batch with ``get_embeddings`` and
    passed to the model; otherwise ``None`` is passed in their place. When
    ``args.skip_examples`` is set, only every fifth batch is trained on;
    skipped batches still advance the step counter, the progress bar and the
    evaluation countdown.

    Args:
        args: Namespace of run settings. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are updated in place.

    Raises:
        ValueError: If ``args.model_type`` is not a supported model type.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    # Comment out to only use 1 GPU on nv12
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if args.model_type in ("bidaf", "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.model_type in ("dcn", "bert-dcn"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=args.drop_prob)
    elif args.model_type == "bert-basic":
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob)
    else:
        # Adjacent literals keep source indentation out of the message.
        raise ValueError(
            'Model is unassigned. Please ensure --model_type '
            'chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic}')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    count_skip = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                count_skip += 1
                # With skipping enabled only every fifth batch is used; the
                # others are counted but not trained on.
                if args.skip_examples and count_skip % 5 != 0:
                    step += batch_size
                    progress_bar.update(batch_size)
                    steps_till_eval -= batch_size
                    continue

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                optimizer.zero_grad()

                # Additions for BERT
                if "bert" in args.model_type:
                    bert_train_embeddings = get_embeddings(
                        "train", ids, args.para_limit, args.ques_limit)
                else:
                    bert_train_embeddings = None

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings,
                                       max_context_len, max_question_len,
                                       device)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate with the EMA weights and save a checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def main(args):
    """Run the paraphraser over the SQuAD paraphrase training set.

    A BiDAF model is built (and, outside of ``args.short_test``, restored from
    ``args.load_path``) and put in eval mode. A ``Paraphraser`` is then applied
    to every training batch and its output is decoded through the
    ``idx2word`` dictionary and pretty-printed.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids``,
            ``batch_size`` and (in short-test mode) ``hidden_size`` are
            overwritten in place.

    Raises:
        SystemExit: With status 1 when neither ``args.short_test`` nor
            ``args.load_path`` is set.
    """
    # Set up logging and devices (unchanged from train.py)
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))  # scale with the GPU count

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Prepare the BiDAF model. No drop_prob is passed: it is only run in
    # eval mode in this function.
    log.info('Building BiDAF model (should be pretrained)')
    bidaf_model = BiDAF(word_vectors=word_vectors,
                        hidden_size=args.hidden_size)
    bidaf_model = nn.DataParallel(bidaf_model, args.gpu_ids)

    if args.short_test:
        # Smoke-test mode: skip the checkpoint and shrink the hidden size
        # used by the paraphraser constructed below.
        args.hidden_size = 5
    elif not args.load_path:
        log.info("Trying to trian paraphraser withou bidaf model. "
                 "First train BiDAF and then specify the load path. Exiting")
        # `exit()` is injected by the `site` module and is not always
        # available; raising SystemExit has the same effect.
        raise SystemExit(1)
    else:
        log.info(f'Loading checkpoint from {args.load_path}...')
        # The saved step is not needed because BiDAF is not trained here.
        bidaf_model = util.load_model(bidaf_model, args.load_path,
                                      args.gpu_ids, return_step=False)
    bidaf_model = bidaf_model.to(device)
    bidaf_model.eval()

    # todo: Setup the Paraphraser model
    paraphaser_model = Paraphraser(word_vectors=word_vectors,
                                   hidden_size=args.hidden_size,
                                   drop_prob=args.drop_prob)
    # The batches below are moved to `device`, so the module must live there
    # too or the forward pass fails with a device mismatch on GPU.
    paraphaser_model = paraphaser_model.to(device)

    # Get data loader. SQuAD_paraphrase carries extra phrase fields.
    log.info('Building dataset...')
    train_dataset = SQuAD_paraphrase(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn_para)
    dev_dataset = SQuAD_paraphrase(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn_para)

    # todo: this is just for looking at the paraphrases
    idx2word_dict = load(args.idx2word_file)

    # Saver / optimizer / scheduler setup is still disabled:
    # saver = util.CheckpointSaver(args.save_dir,
    #                              max_checkpoints=args.max_checkpoints,
    #                              metric_name=args.metric_name,
    #                              maximize_metric=args.maximize_metric,
    #                              log=log)
    # ema = util.EMA(paraphaser_model, args.ema_decay)
    # optimizer = optim.Adadelta(paraphaser_model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Train
    step = 0
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    tbx = SummaryWriter(args.save_dir)
    try:
        # `<` instead of `!=` so a non-positive or non-integer epoch budget
        # cannot make the loop run forever.
        while epoch < args.num_epochs:
            epoch += 1
            log.info(f'Starting epoch {epoch}...')
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for (cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2,
                     cphr_idxs, qphr_idxs, qphr_types, ids) in train_loader:
                    # Setup for forward. The character indices (cc_idxs,
                    # qc_idxs) are not used.
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    cphr_idxs = cphr_idxs.to(device)
                    qphr_idxs = qphr_idxs.to(device)
                    qphr_types = qphr_types.to(device)
                    batch_size = cw_idxs.size(0)

                    paraphrased = paraphaser_model(qphr_idxs, qphr_types,
                                                   cphr_idxs)
                    for p in paraphrased:  # one entry per example
                        # Boolean masking always yields a 1-D tensor. The
                        # former `p[p.nonzero()].squeeze()` collapsed to a
                        # 0-d tensor when exactly one token was non-zero,
                        # and a 0-d tensor cannot be iterated.
                        non_zeros = p[p != 0]
                        sentence_as_list = [idx2word_dict[str(w.item())]
                                            for w in non_zeros]
                        pp(" ".join(sentence_as_list))

                    # NOTE(review): the collapsed source does not show whether
                    # this check sat inside the per-example loop; it is placed
                    # after the whole batch has been printed - confirm.
                    if args.short_test:
                        return

                    # NOTE(review): the optimisation step below was carried
                    # over from train.py. `model`, `optimizer`, `scheduler`,
                    # `ema` and `saver` are not bound anywhere in this
                    # function (their setup is commented out above), so unless
                    # they exist at module scope this raises NameError on the
                    # first batch.
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info. `step` counts examples, not batches.
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, dev_loader,
                                                      device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
    finally:
        # Flush and release the event file on every exit path, including the
        # short-test early return.
        tbx.close()
def main(args):
    """Evaluate a BiDAF checkpoint on one SQuAD split and write a submission CSV.

    The checkpoint at ``args.load_path`` is run over the ``args.split`` record
    file. For every split except ``'test'`` the NLL/F1/EM (and AvNA with SQuAD
    v2) results are logged and sample predictions are written to TensorBoard.
    The predictions are always written to
    ``<save_dir>/<split>_<sub_file>`` as ``Id,Predicted`` rows.

    Args:
        args: Parsed command-line namespace. ``save_dir`` and ``batch_size``
            are overwritten in place. For the chosen ``split`` the namespace
            must hold ``<split>_record_file`` and ``<split>_eval_file``.
    """
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model. (A QANet constructor call used to sit here inside a bare
    # string literal; it was dead code and has been dropped.)
    log.info('Building model...')
    char_vectors = util.torch_from_json(args.char_emb_file)
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Turn the log-probabilities into discrete start/end indices
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard. The writer is closed explicitly so the events
        # are flushed before this short-lived script exits.
        tbx = SummaryWriter(args.save_dir)
        try:
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_path=eval_file,
                           step=0,
                           split=args.split,
                           num_visuals=args.num_visuals)
        finally:
            tbx.close()

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def main():
    """Fine-tune ``BertForNQ`` on pickled training features.

    The model is either restored from ``args.load_path`` or initialised via
    ``from_pretrained(args.bert_model)``. Weights and config are written to a
    numbered sub-directory of ``args.output_dir`` every 5000 optimisation
    steps and to ``args.output_dir`` itself when training finishes.

    Raises:
        ValueError: If ``args.output_dir`` already exists and is not empty.
        ImportError: If ``args.fp16`` is set but apex cannot be imported.
    """
    args = get_train_args()
    args.save_dir = util.get_save_dir(args.save_dir, args.bert_model,
                                      training=True)
    device, gpu_ids = util.get_available_devices()

    # Set random seed
    logger.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # The message previously contained an empty "()" because the
        # directory was never interpolated.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    logger.info("device: {} n_gpu: {}, 16-bits training: {}".format(
        device, len(gpu_ids), args.fp16))

    # From here on train_batch_size is the per-forward-pass (micro) batch.
    args.train_batch_size = (args.train_batch_size
                             // args.gradient_accumulation_steps)

    if args.load_path:
        output_model_file = os.path.join(args.load_path, WEIGHTS_NAME)
        output_config_file = os.path.join(args.load_path, CONFIG_NAME)
        config = BertConfig(output_config_file)
        model = BertForNQ(config)
        # Load onto the CPU first so a checkpoint saved on a GPU can be
        # restored on any machine; the model is moved to `device` below.
        state_dict = torch.load(output_model_file, map_location='cpu')
        model.load_state_dict(state_dict, strict=False)
    else:
        model = BertForNQ.from_pretrained(
            args.bert_model,
            cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                   'distributed_{}'.format(-1)))
    logger.info(model.config)

    if args.fp16:
        model.half()
    model.to(device)
    if len(gpu_ids) > 1:
        model = torch.nn.DataParallel(model)

    # NOTE: pickle executes arbitrary code on load - only point
    # args.train_file at feature files you produced yourself.
    with open(args.train_file, "rb") as reader:
        train_features = pickle.load(reader)

    num_train_optimization_steps = int(
        len(train_features) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError as err:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            ) from err

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    def save_checkpoint(out_dir):
        """Write the weights and JSON config of the bare model to out_dir."""
        # Only save the model itself, not a DataParallel wrapper.
        model_to_save = model.module if hasattr(model, 'module') else model
        os.makedirs(out_dir, exist_ok=True)
        torch.save(model_to_save.state_dict(),
                   os.path.join(out_dir, WEIGHTS_NAME))
        with open(os.path.join(out_dir, CONFIG_NAME), 'w') as f:
            f.write(model_to_save.config.to_json_string())

    global_step = 0

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_features))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor(
        [f.end_position for f in train_features], dtype=torch.long)
    all_ans_types = torch.tensor([f.ans_type for f in train_features],
                                 dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_start_positions, all_end_positions,
                               all_ans_types)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    model.train()
    tbx = SummaryWriter(args.save_dir)
    try:
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if len(gpu_ids) == 1:
                    # multi-gpu does scattering it-self
                    batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask, segment_ids, start_positions,
                 end_positions, ans_types) = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions, ans_types)
                if len(gpu_ids) > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                # Only every gradient_accumulation_steps-th micro-batch
                # triggers an optimiser update.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles
                        # this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    tbx.add_scalar('train/NLL', loss.item(), global_step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   global_step)
                    global_step += 1

                    # Periodic checkpoint, numbered 1, 2, 3, ...
                    if global_step % 5000 == 0:
                        save_checkpoint(
                            os.path.join(args.output_dir,
                                         str(global_step // 5000)))
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tbx.close()

    # Save a trained model and the associated configuration
    save_checkpoint(args.output_dir)
def main(args):
    """Train a BiDAF model on SQuAD with periodic dev-set evaluation.

    Every ``args.eval_steps`` training examples the EMA weights are swapped
    in, the model is evaluated on the dev set, a checkpoint is saved, and the
    results are logged to the console and TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    tbx = SummaryWriter(args.save_dir)
    try:
        # `<` instead of `!=`: a checkpoint whose step is already past the
        # epoch budget must end the loop rather than count up forever.
        while epoch < args.num_epochs:
            epoch += 1
            log.info(f'Starting epoch {epoch}...')
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for (cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                     y1, y2, ids) in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = cw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    # The example count is floored to a batch index for the
                    # scheduler and the EMA.
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint with the EMA weights,
                        # then restore the raw training weights.
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results, pred_dict = evaluate(model, dev_loader,
                                                      device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tbx.close()
def main(args):
    """Train the word+character BiDAF variant on SQuAD with Adam and LR warm-up.

    Every ``args.eval_steps`` training examples the EMA weights are swapped
    in, the model is evaluated on the dev set, a checkpoint is saved, and the
    results are logged to the console and TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(vectors=(word_vectors, char_vectors),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob,
                  p_sdd=args.p_sdd,
                  char_limit=args.char_limit,
                  use_transformer=args.use_transformer,
                  inter_size=args.inter_size,
                  heads=args.heads,
                  c2w_size=args.c2w_size,
                  enc_blocks=args.enc_blocks,
                  enc_convs=args.enc_convs,
                  mod_blocks=args.mod_blocks,
                  mod_convs=args.mod_convs,
                  use_GRU=args.use_GRU)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        # Resumes from the saved step number.
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler. The scheduler MULTIPLIES the base LR rather
    # than replacing it, so Adam's base LR is 1 and the lambda returns the
    # effective learning rate.
    optimizer = optim.Adam(model.parameters(), 1., betas=(.9, .98), eps=1e-9,
                           weight_decay=args.l2_wd)
    # Logarithmic warm-up towards 0.001 over the first 1000 scheduler steps,
    # constant afterwards. math.log is required here (else TypeError).
    # NOTE(review): the denominator log(1000 - 1) makes the factor peak at
    # ~0.0010001 for s == 999 before dropping back to 0.001; log(1000) would
    # join the plateau exactly. Left unchanged to keep runs reproducible.
    scheduler = sched.LambdaLR(
        optimizer,
        lambda s: 0.001 * math.log(s + 1) / math.log(1000 - 1)
        if s < 1000 else 0.001)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    tbx = SummaryWriter(args.save_dir)
    try:
        # `<` instead of `!=`: a checkpoint whose step is already past the
        # epoch budget must end the loop rather than count up forever.
        while epoch < args.num_epochs:
            epoch += 1
            log.info('Starting epoch {}...'.format(epoch))
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for (cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                     y1, y2, ids) in train_loader:
                    # Setup for forward
                    optimizer.zero_grad()
                    batch_size = cw_idxs.size(0)
                    cc_idxs = cc_idxs.to(device)  # (batch, c_limit, char_limit)
                    qc_idxs = qc_idxs.to(device)
                    cw_idxs = cw_idxs.to(device)  # (batch, c_limit)
                    qw_idxs = qw_idxs.to(device)
                    # The model takes (word, char) index pairs.
                    c_idxs, q_idxs = (cw_idxs, cc_idxs), (qw_idxs, qc_idxs)

                    # Forward
                    log_p1, log_p2 = model(c_idxs, q_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    # By default the scheduler advances per epoch; pass the
                    # batch index in as its "epoch" instead.
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info. `step` is the number of examples seen, not
                    # the number of (mini-)batches.
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint with the EMA weights,
                        # then restore the raw training weights.
                        log.info('Evaluating at step {}...'.format(step))
                        ema.assign(model)
                        results, pred_dict = evaluate(model, dev_loader,
                                                      device,
                                                      args.dev_eval_file,
                                                      args.max_ans_len,
                                                      args.use_squad_v2)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(
                            '{}: {:05.2f}'.format(k, v)
                            for k, v in results.items())
                        log.info('Dev {}'.format(results_str))

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar('dev/{}'.format(k), v, step)
                        util.visualize(tbx,
                                       pred_dict=pred_dict,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tbx.close()
def main(args):
    """Train ``QA_Model`` on SQuAD with periodic dev-set evaluation.

    Every ``args.eval_steps`` training examples the model is evaluated on the
    dev set, a checkpoint is saved and the results are logged to the console.
    This variant uses neither an EMA nor a learning-rate scheduler and writes
    nothing to TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Set up logging and devices. No SummaryWriter is opened: this function
    # never logged to it, so the one it used to create was only leaked.
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Load embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Build QA model
    log.info('Building model...')
    model = QA_Model(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob,
                     attention_type=args.attention_type,
                     train_embeddings=args.train_embeddings)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        # Load QA model from file
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` counts examples, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    # `<` instead of `!=`: a checkpoint whose step is already past the epoch
    # budget must end the loop rather than count up forever.
    while epoch < args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()

                # Log info. `step` counts examples, not batches.
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')
def main(args):
    """Train the pairwise classifier with focal loss and periodic dev evaluation.

    Every ``args.eval_steps`` training examples the EMA weights are swapped
    in, the model is evaluated on the dev set, a checkpoint is saved, and the
    results are logged to the console and TensorBoard.

    Args:
        args: Parsed command-line namespace. ``save_dir``, ``gpu_ids`` and
            ``batch_size`` are overwritten in place.
    """
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, type="train")
    log = util.get_logger(args.save_dir, args.name)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get your model
    log.info('Building model...')
    model, step = get_model(log, args)
    model = model.to(device)
    model.train()

    # Exponential moving average
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           betas=[0.8, 0.999],
                           eps=1e-7,
                           weight_decay=args.l2_wd)
    # Constant LR. The lambda argument is named `_` so it no longer shadows
    # the example counter `step`.
    scheduler = sched.LambdaLR(optimizer, lambda _: 1)

    # Get loss computer
    cri = FocalLoss(alpha=torch.tensor([args.alpha, 1]).to(device),
                    gamma=args.gamma)

    # Get data loader
    log.info('Building dataset...')
    dev_dataset = util.load_dataset(args.dev_file,
                                    args.PPI_dir,
                                    args.PPI_gene_feature_dir,
                                    args.PPI_gene_query_dict_dir,
                                    args.max_nodes,
                                    train=False)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=util.collate_fn)
    train_dataset = util.load_dataset(args.train_file,
                                      args.PPI_dir,
                                      args.PPI_gene_feature_dir,
                                      args.PPI_gene_query_dict_dir,
                                      args.max_nodes)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=util.collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = 0
    tbx = SummaryWriter(args.save_dir)
    try:
        # `<` instead of `!=` so a non-positive or non-integer epoch budget
        # cannot make the loop run forever.
        while epoch < args.num_epochs:
            epoch += 1
            log.info(f'Starting epoch {epoch}...')
            with torch.enable_grad(), \
                    tqdm(total=len(train_loader.dataset)) as progress_bar:
                for (batch_a, batch_bio_a, batch_A,
                     batch_b, batch_bio_b, batch_B, batch_y) in train_loader:
                    # Setup for forward
                    batch_a = batch_a.to(device)
                    batch_bio_a = batch_bio_a.to(device)
                    batch_A = batch_A.to(device)
                    batch_bio_b = batch_bio_b.to(device)
                    batch_b = batch_b.to(device)
                    batch_B = batch_B.to(device)
                    batch_y = batch_y.to(device)
                    batch_y = batch_y.long()
                    batch_size = batch_bio_a.size(0)
                    optimizer.zero_grad()

                    # Forward
                    output = model(batch_a, batch_bio_a, batch_A,
                                   batch_b, batch_bio_b, batch_B)
                    loss = cri(output, batch_y)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    # The example count is floored to a batch index for the
                    # EMA.
                    ema(model, step // batch_size)

                    # Log info. `step` counts examples, not batches.
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                    tbx.add_scalar('train/Loss', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint with the EMA weights,
                        # then restore the raw training weights.
                        log.info(f'Evaluating at step {step}...')
                        ema.assign(model)
                        results = evaluate(model, dev_loader, cri, device)
                        saver.save(step, model, results[args.metric_name],
                                   device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.5f}'
                                                for k, v in results.items())
                        log.info(f'Dev {results_str}')

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in results.items():
                            tbx.add_scalar(f'dev/{k}', v, step)
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        tbx.close()