# Fine-tune a pretrained BERT sequence classifier on `train_dataset`
# for three epochs, then switch the model to eval mode.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.train()
print('initialized bert model')

train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
print('starting training\n')

for epoch in range(3):
    print('EPOCH', epoch + 1)
    # enumerate(..., start=1) replaces the hand-rolled batch counter.
    for step, minibatch in enumerate(train_loader, start=1):
        print(' batch', step)
        optim.zero_grad()
        ids = minibatch['input_ids'].to(device)
        mask = minibatch['attention_mask'].to(device)
        gold = minibatch['labels'].to(device)
        # First element of the model output tuple is the loss.
        outputs = model(ids, attention_mask=mask, labels=gold)
        outputs[0].backward()
        optim.step()

print('finished training')
model.eval()
class BagRE(nn.Module):
    """Bag-level relation extraction harness.

    Wraps a bag-level RE model with train/val/test data loaders, a
    cross-entropy criterion, an optimizer, and train/eval loops.  The best
    checkpoint (by validation AUC) is saved to `ckpt`.
    """

    def __init__(self,
                 model,
                 train_path,
                 val_path,
                 test_path,
                 ckpt,
                 batch_size=32,
                 max_epoch=100,
                 lr=0.1,
                 weight_decay=1e-5,
                 opt='sgd',
                 bag_size=None,
                 loss_weight=False):
        """
        Args:
            model: bag-level RE model exposing `rel2id`, `id2rel`,
                `sentence_encoder` and `num_class`.
            train_path / val_path / test_path: dataset files; pass None to
                skip building the corresponding loader.
            ckpt: path where the best checkpoint is written.
            opt: one of 'sgd', 'adam', 'adamw'.
            bag_size: fixed bag size for training (None = variable).
            loss_weight: weight CE loss by per-relation dataset weights.
        """
        super().__init__()
        self.max_epoch = max_epoch
        self.bag_size = bag_size
        # Load data (loaders are only created when a path is given)
        if train_path is not None:
            self.train_loader = BagRELoader(train_path,
                                            model.rel2id,
                                            model.sentence_encoder.tokenize,
                                            batch_size,
                                            True,
                                            bag_size=bag_size,
                                            entpair_as_bag=False)
        if val_path is not None:
            self.val_loader = BagRELoader(val_path,
                                          model.rel2id,
                                          model.sentence_encoder.tokenize,
                                          batch_size,
                                          False,
                                          bag_size=None,
                                          entpair_as_bag=True)
        if test_path is not None:
            self.test_loader = BagRELoader(test_path,
                                           model.rel2id,
                                           model.sentence_encoder.tokenize,
                                           batch_size,
                                           False,
                                           bag_size=None,
                                           entpair_as_bag=True)
        # Model
        self.model = nn.DataParallel(model)
        # Criterion
        if loss_weight:
            self.criterion = nn.CrossEntropyLoss(
                weight=self.train_loader.dataset.weight)
        else:
            self.criterion = nn.CrossEntropyLoss()
        # Params and optimizer
        params = self.model.parameters()
        self.lr = lr
        if opt == 'sgd':
            self.optimizer = optim.SGD(params, lr, weight_decay=weight_decay)
        elif opt == 'adam':
            self.optimizer = optim.Adam(params, lr, weight_decay=weight_decay)
        elif opt == 'adamw':
            from transformers import AdamW
            # Use a distinct name instead of rebinding `params` (which held
            # the generator used by the sgd/adam branches).
            named_params = list(self.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            grouped_params = [{
                'params': [
                    p for n, p in named_params
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.01,
                'lr': lr,
                'ori_lr': lr
            }, {
                'params': [
                    p for n, p in named_params
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay': 0.0,
                'lr': lr,
                'ori_lr': lr
            }]
            self.optimizer = AdamW(grouped_params, correct_bias=False)
        else:
            # FIX: the old message advertised 'bert_adam', but the accepted
            # key above is 'adamw'.
            raise Exception(
                "Invalid optimizer. Must be 'sgd' or 'adam' or 'adamw'.")
        # Cuda
        if torch.cuda.is_available():
            self.cuda()
        # Ckpt
        self.ckpt = ckpt

    def infer(self, bag):
        """
        Args:
            bag: bag of sentences with the same entity pair
                [{
                  'text' or 'token': ...,
                  'h': {'pos': [start, end], ...},
                  't': {'pos': [start, end], ...}
                }]
        Return:
            (relation, score)
        """
        bag_seq = []
        # Collect tokenized fields column-wise, then cat each column so the
        # bag becomes one batch per encoder input.
        for item in bag:
            seq = list(self.model.module.sentence_encoder.tokenize(item))
            if len(bag_seq) == 0:
                for data in seq:
                    bag_seq.append([data])
            else:
                for i in range(len(seq)):
                    bag_seq[i].append(seq[i])
        for i in range(len(bag_seq)):
            bag_seq[i] = torch.cat(bag_seq[i], 0)
        logits = self.model(None, [[0, len(bag)]], *bag_seq, train=False)
        score, pred = logits.squeeze(0).max(-1)
        score = score.item()
        pred = pred.item()
        rel = self.model.module.id2rel[pred]
        return (rel, score)

    def train_model(self):
        """Run the training loop; keep the checkpoint with the best val AUC."""
        best_auc = 0
        for epoch in range(self.max_epoch):
            # Train
            self.train()
            print("=== Epoch %d train ===" % epoch)
            avg_loss = AverageMeter()
            avg_acc = AverageMeter()
            avg_pos_acc = AverageMeter()
            t = tqdm(self.train_loader)
            for step, data in enumerate(t):  # avoid shadowing builtin `iter`
                if torch.cuda.is_available():
                    for i in range(len(data)):
                        try:
                            data[i] = data[i].cuda()
                        except AttributeError:
                            # Non-tensor fields (e.g. bag names) stay on CPU.
                            pass
                label = data[0]
                bag_name = data[1]
                scope = data[2]
                args = data[3:]
                logits = self.model(label,
                                    scope,
                                    *args,
                                    bag_size=self.bag_size)
                loss = self.criterion(logits, label)
                score, pred = logits.max(-1)  # (B)
                acc = float((pred == label).long().sum()) / label.size(0)
                # Accuracy over positive (non-NA, label != 0) bags only.
                pos_total = (label != 0).long().sum()
                pos_correct = ((pred == label).long() *
                               (label != 0).long()).sum()
                if pos_total > 0:
                    pos_acc = float(pos_correct) / float(pos_total)
                else:
                    pos_acc = 0
                # Log
                avg_loss.update(loss.item(), 1)
                avg_acc.update(acc, 1)
                avg_pos_acc.update(pos_acc, 1)
                t.set_postfix(loss=avg_loss.avg,
                              acc=avg_acc.avg,
                              pos_acc=avg_pos_acc.avg)
                # Optimize
                loss.backward()
                self.optimizer.step()
                self.optimizer.zero_grad()
            # Val
            print("=== Epoch %d val ===" % epoch)
            result = self.eval_model(self.val_loader)
            print("auc: %.4f" % result['auc'])
            print("f1: %.4f" % (result['f1']))
            if result['auc'] > best_auc:
                print("Best ckpt and saved.")
                torch.save({'state_dict': self.model.module.state_dict()},
                           self.ckpt)
                best_auc = result['auc']
        print("Best auc on val set: %f" % (best_auc))

    def eval_model(self, eval_loader):
        """Score every (entity pair, relation != NA) candidate and let the
        dataset's own `eval` compute AUC/F1 over the ranked list."""
        self.model.eval()
        with torch.no_grad():
            t = tqdm(eval_loader)
            pred_result = []
            for step, data in enumerate(t):
                if torch.cuda.is_available():
                    for i in range(len(data)):
                        try:
                            data[i] = data[i].cuda()
                        except AttributeError:
                            pass
                label = data[0]
                bag_name = data[1]
                scope = data[2]
                args = data[3:]
                logits = self.model(None, scope, *args,
                                    train=False)  # results after softmax
                for i in range(logits.size(0)):
                    for relid in range(self.model.module.num_class):
                        if self.model.module.id2rel[relid] != 'NA':
                            pred_result.append({
                                'entpair':
                                bag_name[i][:2],
                                'relation':
                                self.model.module.id2rel[relid],
                                'score':
                                logits[i][relid].item()
                            })
            result = eval_loader.dataset.eval(pred_result)
        return result

    def load_state_dict(self, state_dict):
        """Load weights into the wrapped (non-DataParallel) model."""
        self.model.module.load_state_dict(state_dict)
class NewDST(nn.Module):
    '''
    Dialogue state tracking model: a BERT encoder that predicts per-domain
    focus and per-slot gate operations, plus a pointer Generator that decodes
    slot values.

    Constructor inputs: slot list, gate dict and vocab carried over from the
    preprocessing steps, the vocabulary `Vocab`, and hyperparameters in
    `args` (embedding dim 1024, hidden dim 768, lr 1e-3, dropout 0.1).
    `args["path"]` may point at previously trained DST weights to resume.
    '''

    def __init__(self, args, Vocab, slots, gating_dict, num_total_steps=0):
        super(NewDST, self).__init__()
        self.name = "NewDST"
        self.id2word = Vocab  # dict (index --> token)
        self.vocab_size = len(self.id2word)
        self.slots = slots
        self.gating_dict = gating_dict  # gate type --> id
        self.nb_gate = len(gating_dict)  # 4
        self.batch_size = args['batch_size']
        self.hidden_size = args['hidden_size']
        self.dropout = args['dropout']
        # Loss functions (mean reduction)
        self.loss_func_ptr = nn.CrossEntropyLoss(
            ignore_index=0)  # PAD_token_id: 0
        #self.loss_func_ptr = FocalLoss(class_num=self.vocab_size, ignore_index=0)
        self.loss_func_opr = FocalLoss(class_num=self.nb_gate)  # [1, 170, 29]
        self.loss_func_dom = FocalLoss(class_num=2)  #0.15/ 0.2
        self.loss_func_cfm = FocalLoss(class_num=len(sub_gating_dict))
        # Instantiate encoder and decoder; the decoder reuses BERT's word
        # embedding table (nn.Embedding with weight [30522, 768]).
        self.encoder = EncoderBERT(self.hidden_size, self.slots,
                                   self.gating_dict)
        Word_Embedding = self.encoder.transformer.embeddings.word_embeddings
        self.decoder = Generator(self.id2word, self.vocab_size,
                                 self.hidden_size, self.dropout, self.slots,
                                 self.gating_dict, Word_Embedding)
        # Optionally resume from a checkpoint written by save_model().
        # The map_location lambda keeps CPU-only loading working.
        path = args["path"]
        if path:
            model_file = path + '/checkpoint.pt'
            print("MODEL {} is Loaded...".format(model_file))
            if USE_CUDA:
                trained_state_dict = torch.load(model_file)
            else:
                trained_state_dict = torch.load(model_file,
                                                lambda storage, loc: storage)
            self.load_state_dict(trained_state_dict)
        else:
            print("Don't use Pretrained-MODEL.")
        self.lr = (
            1e-4,  # decoder lr
            4e-5,  # encoder lr
        )
        self.num_total_steps = num_total_steps
        # FIX: self.decoder.parameters() is a one-shot generator.  The old
        # code exhausted it inside `filter(... list(map(id, decoder_params)))`
        # while building encoder_params, so the decoder param group handed to
        # AdamW below was EMPTY and decoder weights were trained in the
        # encoder group at the encoder lr.  Materialize the list once, and
        # use an id-set for O(1) membership tests.
        decoder_params = list(self.decoder.parameters())
        #for value in decoder_params:
        #value.requires_grad = False
        decoder_param_ids = set(map(id, decoder_params))
        encoder_params = [
            p for p in self.parameters() if id(p) not in decoder_param_ids
        ]
        self.optimizer = AdamW([{
            'params': decoder_params,
            'lr': self.lr[0]
        }, {
            'params': encoder_params,
            'lr': self.lr[1]
        }], )
        warmup_prop = args['warmup_prop']  # default=0.1
        warmup_steps = warmup_prop * self.num_total_steps
        # Linear warmup followed by linear decay over all training steps.
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=self.num_total_steps)
        self.reset()
        if USE_CUDA:
            self.encoder.cuda()
            self.decoder.cuda()

    def optimize(self, clip):
        """Backprop the last batch loss, clip gradients, step optimizer and
        scheduler.  `clip` is max_grad_norm."""
        self.loss_grad.backward()
        clip_norm = torch.nn.utils.clip_grad_norm_(self.parameters(), clip)
        self.optimizer.step()
        self.scheduler.step()

    def save_model(self, directory):
        """Save the full state_dict to <directory>/checkpoint.pt."""
        if not os.path.exists(directory):
            os.makedirs(directory)
        torch.save(self.state_dict(), directory + '/checkpoint.pt')

    def reset(self):
        # Running loss accumulators; print_every counts batches since reset.
        self.loss, self.print_every, self.loss_ptr, self.loss_gate, self.loss_dom, self.loss_cfm = 0, 1, 0, 0, 0, 0

    def print_loss(self):
        """Return per-batch averages of the accumulated losses."""
        print_loss_avg = self.loss.item() / self.print_every
        print_loss_ptr = self.loss_ptr / self.print_every
        print_loss_cfm = self.loss_cfm / self.print_every
        print_loss_gate = self.loss_gate / self.print_every
        print_loss_domain = self.loss_dom / self.print_every
        self.print_every += 1
        return (
            print_loss_avg,
            print_loss_ptr,
            print_loss_gate,
            print_loss_domain,
            print_loss_cfm,
        )
        #return 'L:{:.4f},Lptr:{:.4f},Lopr:{:.4f},Ldom:{:.4f}'.format(print_loss_avg,print_loss_ptr,print_loss_gate,print_loss_domain)

    def encode_and_decode(self, data, use_teacher_forcing,
                          classifiers_pred=None):
        """Encode a batch, select gate predictions (gold in training, model
        output refined by domain focus in eval), then decode slot values.

        Returns (gate_outs, decoded_out, predict_gates, predict_dom,
        all_slots_refine, classifiers_pred)."""
        """input sources"""
        input_seq = data['input_seq_idx']
        input_len = data["input_len"]
        batch_size, max_input_seq = input_seq.size()
        previous_gold_y = data['previous_generate_y']
        """label"""
        gold_y_idx = data['generate_y_idx']
        gold_y = data['generate_y']
        GoldGates = data["gating_label"]
        GoldDoms = data["domain_focus"]
        # NOTE(review): this assumes every input sequence contains exactly
        # two SEP tokens — a third one would make the assignment fail.
        SEP_indices = torch.zeros(batch_size, 2)
        for bi in range(batch_size):
            SEP_indices[bi] = torch.tensor(
                [i for i, x in enumerate(input_seq[bi]) if x == SEP_token_id])
            #assert len(SEP_indices) == 2
        SEP_indices = SEP_indices.int()
        # encode
        encoded_out = self.encoder(input_seq, input_len, max_input_seq,
                                   SEP_indices)
        encoded_outputs, X_hdd, SLOT_hdd_all, predict_gates, predict_dom = encoded_out
        encoded_cfms = self.encoder.encode_confirm_seq()
        # prepare the slot gate result (predict_gates --> PredictGates)
        # and get the domain refines
        all_slots_refine = torch.zeros(batch_size,
                                       len(self.slots))  # as a func return
        # assert not args['testing_gate_mode'] < args['training_gate_mode']
        # if args['training_gate_mode']=0, args['testing_gate_mode']=0 or 2
        # if args['training_gate_mode']=1, args['testing_gate_mode'] can only =2
        PredictGates = torch.zeros(batch_size, len(self.slots))
        PredictInts = torch.zeros(batch_size, len(EXPERIMENT_DOMAINS))
        if self.decoder.training:
            # train stage: use groundtruth gates all the time
            PredictGates = GoldGates
            for bi in range(batch_size):
                gating_pred = PredictGates[bi, :]
                focus_dom_label_ = GoldDoms[bi]
                slots_refine = self.get_slots_refine(focus_dom_label_)
                all_slots_refine[bi, :] = slots_refine
        else:
            # eval stage: 4 types of 'testing_gate_mode'
            for bi in range(batch_size):
                gating_pred = torch.argmax(predict_gates.transpose(0, 1)[bi],
                                           dim=1)
                focus_dom_label_ = torch.argmax(predict_dom.transpose(0,
                                                                      1)[bi],
                                                dim=1)
                if args['testing_gate_mode'] == 3:
                    # use groundtruth dom-refined tensor
                    focus_dom_label_ = GoldDoms[bi]
                slots_refine = self.get_slots_refine(focus_dom_label_)
                all_slots_refine[bi, :] = slots_refine
                # zero out gates for slots outside the focused domains
                gating_pred_w_d = list(
                    map(lambda x: x[0] * x[1], zip(gating_pred,
                                                   slots_refine)))
                gating_pred_w_d = torch.tensor(gating_pred_w_d)
                #gating_pred_w_d = self._refine(predict_gates.transpose(0, 1)[bi], slots_refine)
                if args['training_gate_mode'] == 0 and args[
                        'testing_gate_mode'] == 0:
                    # Default
                    gates_opr = gating_pred
                elif args['testing_gate_mode'] == 1:
                    # groundtruth slot gate
                    gates_opr = data["gating_label"][bi, :]
                else:
                    # use dom-refined slot gate
                    gates_opr = gating_pred_w_d
                PredictGates[bi, :] = gates_opr
                PredictInts[bi, :] = focus_dom_label_
                # whether to do classification evaluation
                if classifiers_pred:
                    all_dom_prediction, all_slot_prediction, all_slot_prediction_w, all_cfm_prediction = classifiers_pred
                    domL_true = GoldDoms[bi].data.tolist()
                    domL_pred = focus_dom_label_.data.tolist()
                    all_dom_prediction['y_true'] += domL_true
                    all_dom_prediction['y_pred'] += domL_pred
                    all_slot_prediction['y_true'] += GoldGates[bi].data.tolist(
                    )
                    all_slot_prediction['y_pred'] += gating_pred.data.tolist()
                    all_slot_prediction_w['y_true'] += GoldGates[
                        bi].data.tolist()
                    all_slot_prediction_w[
                        'y_pred'] += gating_pred_w_d.data.tolist()
                    if args["genSample"]:
                        # dump mispredicted gates for later inspection
                        if not torch.equal(GoldDoms[bi], focus_dom_label_):
                            tosave_d = "Dialog:{0},turn:{1}\nTrue:{2}\nPred:{3}\n".format(
                                data["ID"][bi], data["turn_id"][bi],
                                data["domain_focus"][bi], focus_dom_label_)
                            with open('save/dom-gate-error.txt', 'a+') as f:
                                f.write(tosave_d + '\n')
                        if not torch.equal(GoldGates[bi], gating_pred):
                            tosave_s = "Dialog:{0},turn:{1}\nTrue:{2}\nPred:{3}\n".format(
                                data["ID"][bi], data["turn_id"][bi],
                                data["gating_label"][bi], gating_pred)
                            with open('save/slot-gate-error.txt', 'a+') as f:
                                f.write(tosave_s + '\n')
                        if not torch.equal(GoldGates[bi], gating_pred_w_d):
                            tosave_sr = "Dialog:{0},turn:{1}\nTrue:{2}\nPred:{3}\n".format(
                                data["ID"][bi], data["turn_id"][bi],
                                data["gating_label"][bi], gating_pred_w_d)
                            with open('save/slot-gate-REFINED-error.txt',
                                      'a+') as f:
                                f.write(tosave_sr + '\n')
        # decode
        decoded_out = self.decoder(previous_gold_y, gold_y, gold_y_idx, input_seq, input_len, \
            encoded_outputs, X_hdd, SLOT_hdd_all, PredictGates, \
            GoldGates, use_teacher_forcing, encoded_cfms)
        all_point_outputs, words_point_out, updateGoldValue, updatePredValue, confirmGoldValue, confirmPredValue, bilinear = decoded_out
        # eval-only: score the confirm classifier against gold confirm slots
        if not self.decoder.training and classifiers_pred:
            for bi in range(batch_size):
                for si, sg in enumerate(GoldGates[bi, :]):
                    if sg == self.gating_dict["confirm"]:
                        SLOT_hdd = SLOT_hdd_all[bi, si, :]
                        SLOT_hdd_ = SLOT_hdd.expand_as(encoded_cfms)
                        p_cfm = bilinear(encoded_cfms.detach(),
                                         SLOT_hdd_).transpose(0, 1)
                        pred_class = torch.argmax(p_cfm, dim=1).item()
                        st = sub_gating_dict_verse[pred_class]
                        gold = gold_y[bi][si]
                        all_cfm_prediction['y_true'] += [
                            sub_gating_dict.get(gold, len(sub_gating_dict))
                        ]
                        all_cfm_prediction['y_pred'] += [
                            sub_gating_dict.get(st, len(sub_gating_dict))
                        ]
        gate_outs = (PredictGates, PredictInts)
        return gate_outs, decoded_out, predict_gates, predict_dom, all_slots_refine, classifiers_pred

    # run training operation of one batch in one Epoch
    def Train(self, data, reset=0):
        self.encoder.train()
        self.decoder.train()
        if reset: self.reset()  # put loss to 0
        self.optimizer.zero_grad()
        # args["teacher_forcing_ratio"] = 0.5
        use_teacher_forcing = random.random() < args["teacher_forcing_ratio"]
        gate_outs, decoded_out, predict_gates, predict_dom, all_slots_refine, _ = self.encode_and_decode(
            data, use_teacher_forcing)
        #encoded_outputs, X_hdd, SLOT_hdd_all, predict_gates, predict_dom = encoded_out
        all_point_outputs, words_point_out, updateGoldValue, updatePredValue, confirmGoldValue, confirmPredValue, _ = decoded_out
        # Compute the losses
        # loss_ptr
        assert len(updatePredValue) == len(updateGoldValue) and len(
            confirmPredValue) == len(confirmGoldValue)
        if len(updateGoldValue) > 0:
            #print("len:", len(updateGoldValue))
            # list; len = b * |J'| * m
            updateGoldValue_ = torch.tensor(
                updateGoldValue).contiguous()  #torch.Size([len])
            updatePredValue_ = torch.stack(updatePredValue, dim=0).view(
                -1, self.vocab_size).contiguous()  # torch.Size([len, |V|])
            loss_ptr = self.loss_func_ptr(updatePredValue_, updateGoldValue_)
            #ptr_num_total = updateGoldValue_.ne(PAD_token_id).data.sum().item()
        else:
            loss_ptr = torch.tensor(0)
            #ptr_num_total = 0
        # loss_dom
        predict_dom = predict_dom.transpose(0, 1).contiguous().view(
            -1, predict_dom.size(-1))
        target_dom = data["domain_focus"].contiguous().view(-1)
        loss_dom = self.loss_func_dom(predict_dom, target_dom)
        #dom_num_total = predict_gates.size(1)
        # loss_gate
        predict_gates = predict_gates.transpose(0, 1).contiguous().view(
            -1, predict_gates.size(-1))
        target_gates = data["gating_label"].contiguous().view(-1)
        if args["training_gate_mode"] == 1:
            # training restricted to the subset of slots in focused domains
            all_slots_refine = all_slots_refine.contiguous().view(-1)
            mask = (all_slots_refine != 0)
            #print(torch.equal(mask, all_slots_refine.bool()))
            predict_gates = predict_gates.masked_select(mask.unsqueeze(1))
            target_gates = target_gates.masked_select(mask)
        if len(target_gates) > 0:
            loss_gate = self.loss_func_opr(predict_gates, target_gates)
            #opr_num_total = target_gates.size(-1)
        else:
            loss_gate = torch.tensor(0)
            #opr_num_total = 0
        # loss_cfm
        if len(confirmGoldValue) > 0:
            true_confirm = torch.tensor(confirmGoldValue).contiguous()
            predict_confirm = torch.stack(
                confirmPredValue,
                dim=0).contiguous().view(-1,
                                         len(sub_gating_dict)).contiguous()
            loss_cfm = self.loss_func_cfm(predict_confirm, true_confirm)
            #cfm_num_total = true_confirm.size(0)
        else:
            loss_cfm = torch.tensor(0)
            #cfm_num_total = 0
        # final loss (domain loss weighted x2)
        loss = loss_ptr + loss_cfm + loss_gate + 2 * loss_dom
        # (6) backward happens later in optimize(); stash the graph here
        self.loss_grad = loss
        self.loss += loss.data
        self.loss_ptr += loss_ptr.item()
        self.loss_cfm += loss_cfm.item()
        self.loss_gate += loss_gate.item()
        self.loss_dom += loss_dom.item()

    def evaluate(self, dev, matric_best, all_slots, early_stop=None):
        """Run the full dev set, print gate/joint metrics, save the model if
        it beats `matric_best`, and return the selection metric."""
        self.encoder.eval()
        self.decoder.eval()
        print("Start Evaluation...")
        print("training_gate_mode:",
              int(args['training_gate_mode']))  # default=0
        print("testing_gate_mode:",
              int(args['testing_gate_mode']))  # default=0
        all_prediction = {}
        all_dom_prediction = {'y_true': [], 'y_pred': []}
        all_slot_prediction = {'y_true': [], 'y_pred': []}
        all_slot_prediction_w = {'y_true': [], 'y_pred': []}
        all_cfm_prediction = {'y_true': [], 'y_pred': []}
        classifiers_pred = all_dom_prediction, all_slot_prediction, all_slot_prediction_w, all_cfm_prediction
        pbar = tqdm(enumerate(dev), total=len(dev))
        # outside loop: for each batch in DataLoader
        for j, data_dev in pbar:
            # step 0: basic sizes
            dev_batch_size, max_input_seq = data_dev['input_seq_idx'].size()
            # 1. encode and decode
            use_teacher_forcing = False
            gate_outs, decoded_out, predict_gates, predict_dom, all_slots_refine, classifiers_pred = self.encode_and_decode(
                data_dev, use_teacher_forcing, classifiers_pred)
            PredictGates, PredictInts = gate_outs
            all_point_outputs, words_point_out, updateGoldValue, updatePredValue, confirmGoldValue, confirmPredValue, _ = decoded_out
            # inner loop: for each sample in the batch
            for bi in range(dev_batch_size):
                if data_dev["ID"][bi] not in all_prediction.keys():
                    all_prediction[data_dev["ID"][bi]] = {}
                all_prediction[data_dev["ID"][bi]][data_dev["turn_id"][bi]] = {
                    "turn_belief": data_dev["turn_belief"][bi]
                }
                predict_belief_bsz_ptr = []
                for si in range(len(all_slots)):
                    st = words_point_out[bi][si]
                    if st == "[NULL]":
                        continue
                    else:
                        predict_belief_bsz_ptr.append(all_slots[si] + "-" +
                                                      str(st))
                all_prediction[data_dev["ID"][bi]][data_dev["turn_id"][bi]][
                    "pred_bs_ptr"] = predict_belief_bsz_ptr
                if set(data_dev["turn_belief"][bi]) != set(
                        predict_belief_bsz_ptr) and args["genSample"]:
                    # log only the differing slot-values for easier lookup
                    Trues, Preds = sorted(
                        list((data_dev["turn_belief"][bi]
                              ))), sorted(predict_belief_bsz_ptr)
                    a = [x for x in Preds if x in Trues]
                    Trues_, Preds_ = [x for x in Trues if x not in a
                                      ], [x for x in Preds if x not in a]
                    tolookup = "Dialog:{0},turn:{1}\nTrue:{2}\nPred:{3}\n".format(
                        data_dev["ID"][bi], data_dev["turn_id"][bi], Trues_,
                        Preds_)
                    with open('save/predict_errors.txt', 'a+') as f:
                        f.write(tolookup + '\n')
        print(
            "The whole set is traversed. Results saved in dict [all_prediction]"
        )
        if args["genSample"]:
            with open('save/all_cfm_prediction.pkl', 'wb') as f:
                pickle.dump(all_cfm_prediction, f, pickle.HIGHEST_PROTOCOL)
        # evaluate performance
        # classifier
        all_dom_prediction, all_slot_prediction, all_slot_prediction_w, all_cfm_prediction = classifiers_pred
        dom_cm, dom_joint_acc, dom_acc = self.compute_gate(all_dom_prediction,
                                                           turn_l=5,
                                                           class_n=2)
        slot_cm, slot_joint_acc, slot_acc = self.compute_gate(
            all_slot_prediction, turn_l=30, class_n=3)
        slot_cmR, slot_joint_accR, slot_accR = self.compute_gate(
            all_slot_prediction_w, turn_l=30, class_n=3)
        cfm_cm, cfm_joint_acc, cfm_acc = self.compute_gate(all_cfm_prediction)
        # joint accuracy
        joint_acc_score_ptr, prf_score_ptr, turn_acc_score_ptr = self.evaluate_metrics(
            all_prediction, "pred_bs_ptr", all_slots)
        F1_score_ptr, r_score, p_score = prf_score_ptr
        print("Slot Opr Acc:{0}\n{1}".format(slot_joint_acc, slot_cm))
        print("Refined by Dom Opr:{0}\n{1}".format(slot_joint_accR, slot_cmR))
        print("Dom Opr Acc:{0}\n{1}".format(dom_joint_acc, dom_cm))
        print("Cfm Acc:{0}\n{1}".format(cfm_joint_acc, cfm_cm))
        # NOTE(review): argument order pairs "Turn Acc" with F1_score_ptr and
        # "Joint F1c" with turn_acc_score_ptr — looks swapped; confirm intent
        # before relying on this printout.
        print("Joint Acc:{:.4f}; Turn Acc:{:.4f}; Joint F1c:{:.4f}".format(
            joint_acc_score_ptr, F1_score_ptr, turn_acc_score_ptr))
        print("Precision:{:.4f}; Recall :{:.4f}".format(p_score, r_score))
        joint_acc_score = joint_acc_score_ptr  # (joint_acc_score_ptr + joint_acc_score_class)/2
        F1_score = F1_score_ptr
        self.encoder.train(True)
        self.decoder.train(True)
        if (early_stop == 'F1'):
            if (F1_score >= matric_best):
                self.save_model(directory='save')
                print("Model Saved...")
            return F1_score
        else:
            if (joint_acc_score >= matric_best):
                self.save_model(directory='save')
                print("Model Saved...")
            return joint_acc_score

    def get_slots_refine(self, focus_dom_label_):
        """Expand a per-domain 0/1 focus vector into a per-slot 0/1 vector."""
        slots_refine = []
        for di, dg in enumerate(focus_dom_label_):
            slots_refine += [int(dg)] * ALL_ds_nb[EXPERIMENT_DOMAINS[di]]
        assert len(slots_refine) == len(self.slots)
        return torch.tensor(slots_refine)

    def evaluate_metrics(self, all_prediction, from_which, slot_temp):
        """Compute joint accuracy, (F1, recall, precision) and turn accuracy
        over every turn in `all_prediction`."""
        total, turn_acc, joint_acc, F1_pred, p_pred, r_pred, Count = 0, 0, 0, 0, 0, 0, 0
        for d, v in all_prediction.items(
        ):  # d: dialog ID, e.g. PMUL1635.json; v: dict of turns
            #print("dialog:", d, "turns num:", len(v),v)
            #assert list(v.keys()) == list(range(len(v)))
            for t in range(len(v)):
                cv = v[t]
                if set(cv["turn_belief"]) == set(cv[from_which]):
                    joint_acc += 1
                total += 1
                # Compute prediction slot accuracy
                temp_acc = self.compute_acc(set(cv["turn_belief"]),
                                            set(cv[from_which]), slot_temp)
                turn_acc += temp_acc
                # Compute prediction joint F1 score
                temp_f1, temp_r, temp_p, count = self.compute_prf(
                    set(cv["turn_belief"]), set(cv[from_which]))
                F1_pred += temp_f1
                r_pred += temp_r
                p_pred += temp_p
                Count += count
        joint_acc_score = joint_acc / float(total) if total != 0 else 0
        turn_acc_score = turn_acc / float(total) if total != 0 else 0
        F1_score = F1_pred / float(Count) if Count != 0 else 0
        r_score = r_pred / float(Count) if Count != 0 else 0
        p_score = p_pred / float(Count) if Count != 0 else 0
        return joint_acc_score, (
            F1_score,
            r_score,
            p_score,
        ), turn_acc_score

    def compute_acc(self, gold, pred, slot_temp):
        """Per-turn slot accuracy: 1 - (missed + wrongly added) / |slots|.
        (gold means groundtruth.)"""
        miss_gold = 0
        miss_slot = []
        for g in gold:
            if g not in pred:
                miss_gold += 1
                miss_slot.append(g.rsplit("-", 1)[0])
        wrong_pred = 0
        for p in pred:
            if p not in gold and p.rsplit("-", 1)[0] not in miss_slot:
                wrong_pred += 1
        ACC_TOTAL = len(slot_temp)
        # 1 - error_num / 30 = slot accuracy = ACC
        ACC = len(slot_temp) - miss_gold - wrong_pred
        ACC = ACC / float(ACC_TOTAL)
        return ACC

    def compute_prf(self, gold, pred):
        """Per-turn precision/recall/F1 over slot-value strings.  Returns
        (F1, recall, precision, count) where count flags a scored turn."""
        TP, FP, FN = 0, 0, 0
        if len(gold) != 0:
            count = 1
            for g in gold:
                if g in pred:
                    TP += 1
                else:
                    FN += 1  # FN = miss_gold
            for p in pred:
                if p not in gold:
                    FP += 1
            precision = TP / float(TP + FP) if (TP + FP) != 0 else 0
            recall = TP / float(TP + FN) if (TP + FN) != 0 else 0
            F1 = 2 * precision * recall / float(precision + recall) if (
                precision + recall) != 0 else 0
        else:
            if len(pred) == 0:
                precision, recall, F1, count = 1, 1, 1, 1
            else:
                precision, recall, F1, count = 0, 0, 0, 1
        return F1, recall, precision, count

    def compute_gate(self, prediction, turn_l=1, class_n=None):
        """Confusion matrix, per-turn joint accuracy (chunks of `turn_l`
        labels must match exactly) and flat accuracy for a classifier."""
        y_true, y_pred = prediction["y_true"], prediction["y_pred"]
        labels = range(class_n) if class_n else None
        total, k = len(y_true), 0
        if total > 0:
            cm = confusion_matrix(y_true, y_pred, labels=labels)
            #acc = sum([cm[i][i] for i in labels])/(sum([sum(cm[i]) for i in labels]))
            acc = accuracy_score(y_true, y_pred)
        else:
            cm = []
            acc = 0
        y_true = [y_true[i:i + turn_l] for i in range(0, len(y_true), turn_l)]
        y_pred = [y_pred[i:i + turn_l] for i in range(0, len(y_pred), turn_l)]
        for a, b in zip(y_true, y_pred):
            if a == b:
                k += 1
        joint_acc = k / total if total != 0 else 0
        return cm, joint_acc, acc

    '''
    def _refine(self, _predict_gates, slots_refine):
        #_predict_gates = predict_gates.transpose(0, 1)[bi] # [|s|, 3]
        gating_pred_w_d = torch.zeros(_predict_gates.size(0))
        for si in range(_predict_gates.size(-1)):
            k = slots_refine[si]
            if k==0:
                gating_pred_w_d[si] = 0
            else:
                #gating_pred_w_d[si] = torch.argmax(_predict_gates[si, k:], dim=-1)
                gating_pred_w_d[si] = torch.argmax(_predict_gates[si], dim=-1)
        return gating_pred_w_d.long()
    '''

    def demo(self, sample):
        """
        sample is a dict consists of :
        sample['previous_utterances'] = previous_utterances
        sample['current_utterances'] = current_utterances
        sample['previous_dict'] = previous_dict
        sample['input_seq'] = get_input_seq(previous_utterances, current_utterances, previous_dict)
        sample['previous_generate_y'] = get_generate_y(sample['previous_dict'])
        """
        print("start inference..")
        sample_data_set = LoadData([sample], word2id, 1)
        result = {}
        for i, data in enumerate(sample_data_set):
            batch_size, _ = data['input_seq_idx'].size()
            gate_outs, decoded_out, _, _, _, _ = self.encode_and_decode(
                data, False)
            PredictGates, PredictInts = gate_outs
            words_point_out = decoded_out[1]
            for bi in range(batch_size):
                gating_pred = PredictGates[bi]
                interest_dom_label_ = PredictInts[bi]
                D = [
                    d for di, d in enumerate(EXPERIMENT_DOMAINS)
                    if interest_dom_label_[di] == 1
                ]
                S = [
                    s for si, s in enumerate(ALL_SLOTS)
                    if gating_pred[si].item() != 0
                ]
                V = [
                    s + '-' + words_point_out[bi][si]
                    for si, s in enumerate(ALL_SLOTS)
                    if words_point_out[bi][si] != "[NULL]"
                ]
                result["domain"] = set(D)
                result["slot"] = set(S)
                result["value"] = set(V)
        return result
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    # Effective batch size scales with the number of visible GPUs.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # Random sampling on a single process, distributed sampling otherwise.
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)
    # One optimizer step per batch (modulo accumulation); save/log once per
    # epoch since save_steps/logging_steps equal the dataloader length.
    args.max_steps = args.epoch * len(train_dataloader)
    args.save_steps = len(train_dataloader)
    args.warmup_steps = len(train_dataloader)
    args.logging_steps = len(train_dataloader)
    args.num_train_epochs = args.epoch
    model.to(args.device)
    # Prepare optimizer and schedule (linear warmup and decay);
    # no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    # Warm up over the first 10% of all optimization steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.max_steps * 0.1,
        num_training_steps=args.max_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    # Resume optimizer/scheduler state from a previous run, if present.
    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(torch.load(scheduler_last))
    if os.path.exists(optimizer_last):
        optimizer.load_state_dict(torch.load(optimizer_last))
    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", args.max_steps)
    global_step = args.start_step
    tr_loss, logging_loss, avg_loss, tr_nb, tr_num, train_loss = 0.0, 0.0, 0.0, 0, 0, 0
    best_acc = 0.0
    # model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()
    for idx in range(args.start_epoch, int(args.num_train_epochs)):
        bar = train_dataloader
        tr_num = 0
        train_loss = 0
        for step, batch in enumerate(bar):
            # Batch layout: anchor, positive, negative inputs plus labels.
            inputs = batch[0].to(args.device)
            p_inputs = batch[1].to(args.device)
            n_inputs = batch[2].to(args.device)
            labels = batch[3].to(args.device)
            model.train()
            loss, vec = model(inputs, p_inputs, n_inputs, labels)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            tr_loss += loss.item()
            tr_num += 1
            train_loss += loss.item()
            # NOTE(review): the branch below is immediately overwritten by
            # the unconditional assignment that follows it; it has no effect.
            if avg_loss == 0:
                avg_loss = tr_loss
            avg_loss = round(train_loss / tr_num, 5)
            if (step + 1) % 100 == 0:
                logger.info("epoch {} step {} loss {}".format(
                    idx, step + 1, avg_loss))
            #bar.set_description("epoch {} loss {}".format(idx,avg_loss))
            # Only step the optimizer every gradient_accumulation_steps batches.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                output_flag = True  # NOTE(review): never read afterwards
                # Perplexity-style running loss since the last logging point.
                avg_loss = round(
                    np.exp((tr_loss - logging_loss) / (global_step - tr_nb)),
                    4)
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logging_loss = tr_loss
                    tr_nb = global_step
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           eval_when_training=True)
                        for key, value in results.items():
                            logger.info(" %s = %s", key, round(value, 4))
                        # Save model checkpoint
                    tr_num = 0
                    train_loss = 0
                    # NOTE(review): `results` is only bound when
                    # evaluate_during_training is set; with it disabled this
                    # line raises NameError on the first save step — confirm
                    # the intended flag combination.
                    if results['eval_map'] > best_acc:
                        best_acc = results['eval_map']
                        logger.info(" " + "*" * 20)
                        logger.info(" Best map:%s", round(best_acc, 4))
                        logger.info(" " + "*" * 20)
                        checkpoint_prefix = 'checkpoint-best-map'
                        output_dir = os.path.join(
                            args.output_dir, '{}'.format(checkpoint_prefix))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_dir = os.path.join(output_dir,
                                                  '{}'.format('model.bin'))
                        torch.save(model_to_save.state_dict(), output_dir)
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)
class AGEM:
    """Averaged Gradient Episodic Memory (A-GEM) trainer for text classification.

    Wraps a TransformerClsModel and an episodic ReplayMemory.  On replay
    steps, the current gradient is projected so it does not increase the
    loss on a reference batch drawn from memory (see ``compute_grad``).
    """

    def __init__(self, device, n_classes, **kwargs):
        """Build model, replay memory, loss and optimizer.

        :param device: torch device the model and batches are moved to.
        :param n_classes: number of output classes for the classifier head.
        :param kwargs: lr, write_prob, replay_rate, replay_every, model,
            max_length (forwarded to TransformerClsModel / ReplayMemory).
        """
        self.lr = kwargs.get('lr', 3e-5)
        self.write_prob = kwargs.get('write_prob')
        self.replay_rate = kwargs.get('replay_rate')
        self.replay_every = kwargs.get('replay_every')
        self.device = device
        self.model = TransformerClsModel(model_name=kwargs.get('model'),
                                         n_classes=n_classes,
                                         max_length=kwargs.get('max_length'),
                                         device=device)
        # tuple_size=2: memory stores (text, label) pairs.
        self.memory = ReplayMemory(write_prob=self.write_prob, tuple_size=2)
        logger.info('Loaded {} as model'.format(self.model.__class__.__name__))
        self.loss_fn = nn.CrossEntropyLoss()
        # Only optimize trainable parameters (frozen ones are excluded).
        self.optimizer = AdamW(
            [p for p in self.model.parameters() if p.requires_grad],
            lr=self.lr)

    def save_model(self, model_path):
        """Serialize the model's state dict to ``model_path``."""
        checkpoint = self.model.state_dict()
        torch.save(checkpoint, model_path)

    def load_model(self, model_path):
        """Load a state dict previously written by ``save_model``."""
        checkpoint = torch.load(model_path)
        self.model.load_state_dict(checkpoint)

    def compute_grad(self, orig_grad, ref_grad):
        """Project ``orig_grad`` per A-GEM.

        If the current gradient and the reference (memory) gradient agree
        (non-negative dot product), the gradient is kept as-is; otherwise
        the component along the reference gradient is subtracted so the
        update does not increase the reference loss.

        :param orig_grad: list of per-parameter gradient tensors.
        :param ref_grad: list of per-parameter reference gradient tensors.
        :return: list of (possibly projected) gradient tensors.
        """
        with torch.no_grad():
            flat_orig_grad = torch.cat([torch.flatten(x) for x in orig_grad])
            flat_ref_grad = torch.cat([torch.flatten(x) for x in ref_grad])
            dot_product = torch.dot(flat_orig_grad, flat_ref_grad)
            if dot_product >= 0:
                return orig_grad
            proj_component = dot_product / torch.dot(flat_ref_grad,
                                                     flat_ref_grad)
            modified_grad = [
                o - proj_component * r for (o, r) in zip(orig_grad, ref_grad)
            ]
            return modified_grad

    def train(self, dataloader, n_epochs, log_freq):
        """Train with A-GEM replay.

        Every ``replay_freq`` batches, ``replay_steps`` reference batches
        are read from memory and their summed gradient is used to project
        the current gradient via ``compute_grad``.

        :param dataloader: yields (text, labels) batches.
        :param n_epochs: number of passes over ``dataloader``.
        :param log_freq: log running metrics every this many batches.
        """
        self.model.train()
        for epoch in range(n_epochs):
            all_losses, all_predictions, all_labels = [], [], []
            # NOTE: ``iter`` shadows the builtin; kept as-is for fidelity.
            iter = 0
            for text, labels in dataloader:
                labels = torch.tensor(labels).to(self.device)
                input_dict = self.model.encode_text(text)
                output = self.model(input_dict)
                loss = self.loss_fn(output, labels)
                self.optimizer.zero_grad()
                params = [
                    p for p in self.model.parameters() if p.requires_grad
                ]
                # Gradients computed explicitly (not via .backward()) so
                # they can be projected before being assigned to .grad.
                orig_grad = torch.autograd.grad(loss, params)
                mini_batch_size = len(labels)
                replay_freq = self.replay_every // mini_batch_size
                replay_steps = int(self.replay_every * self.replay_rate /
                                   mini_batch_size)
                if self.replay_rate != 0 and (iter + 1) % replay_freq == 0:
                    ref_grad_sum = None
                    for _ in range(replay_steps):
                        ref_text, ref_labels = self.memory.read_batch(
                            batch_size=mini_batch_size)
                        ref_labels = torch.tensor(ref_labels).to(self.device)
                        ref_input_dict = self.model.encode_text(ref_text)
                        ref_output = self.model(ref_input_dict)
                        ref_loss = self.loss_fn(ref_output, ref_labels)
                        ref_grad = torch.autograd.grad(ref_loss, params)
                        if ref_grad_sum is None:
                            ref_grad_sum = ref_grad
                        else:
                            ref_grad_sum = [
                                x + y
                                for (x, y) in zip(ref_grad, ref_grad_sum)
                            ]
                    final_grad = self.compute_grad(orig_grad, ref_grad_sum)
                else:
                    final_grad = orig_grad
                # Manually install the (projected) gradients, then step.
                for param, grad in zip(params, final_grad):
                    param.grad = grad.data
                self.optimizer.step()
                loss = loss.item()
                pred = models.utils.make_prediction(output.detach())
                all_losses.append(loss)
                all_predictions.extend(pred.tolist())
                all_labels.extend(labels.tolist())
                iter += 1
                # Write the current batch into episodic memory (subject to
                # the memory's write_prob).
                self.memory.write_batch(text, labels)
                if iter % log_freq == 0:
                    acc, prec, rec, f1 = models.utils.calculate_metrics(
                        all_predictions, all_labels)
                    logger.info(
                        'Epoch {} metrics: Loss = {:.4f}, accuracy = {:.4f}, precision = {:.4f}, recall = {:.4f}, '
                        'F1 score = {:.4f}'.format(epoch + 1,
                                                   np.mean(all_losses), acc,
                                                   prec, rec, f1))
                    all_losses, all_predictions, all_labels = [], [], []

    def evaluate(self, dataloader):
        """Evaluate on ``dataloader``; returns (acc, prec, rec, f1)."""
        all_losses, all_predictions, all_labels = [], [], []
        self.model.eval()
        for text, labels in dataloader:
            labels = torch.tensor(labels).to(self.device)
            input_dict = self.model.encode_text(text)
            with torch.no_grad():
                output = self.model(input_dict)
                loss = self.loss_fn(output, labels)
            loss = loss.item()
            pred = models.utils.make_prediction(output.detach())
            all_losses.append(loss)
            all_predictions.extend(pred.tolist())
            all_labels.extend(labels.tolist())
        acc, prec, rec, f1 = models.utils.calculate_metrics(
            all_predictions, all_labels)
        logger.info(
            'Test metrics: Loss = {:.4f}, accuracy = {:.4f}, precision = {:.4f}, recall = {:.4f}, '
            'F1 score = {:.4f}'.format(np.mean(all_losses), acc, prec, rec,
                                       f1))
        return acc, prec, rec, f1

    def training(self, train_datasets, **kwargs):
        """Concatenate ``train_datasets`` and run ``train`` on the result.

        :param kwargs: n_epochs (default 1), log_freq (default 50),
            mini_batch_size (required).
        """
        n_epochs = kwargs.get('n_epochs', 1)
        log_freq = kwargs.get('log_freq', 50)
        mini_batch_size = kwargs.get('mini_batch_size')
        train_dataset = data.ConcatDataset(train_datasets)
        # shuffle=False: datasets are presented sequentially (continual
        # learning setting), not mixed.
        train_dataloader = data.DataLoader(
            train_dataset,
            batch_size=mini_batch_size,
            shuffle=False,
            collate_fn=datasets.utils.batch_encode)
        self.train(dataloader=train_dataloader,
                   n_epochs=n_epochs,
                   log_freq=log_freq)

    def testing(self, test_datasets, **kwargs):
        """Evaluate on each test dataset and log per-set and mean metrics."""
        mini_batch_size = kwargs.get('mini_batch_size')
        accuracies, precisions, recalls, f1s = [], [], [], []
        for test_dataset in test_datasets:
            logger.info('Testing on {}'.format(
                test_dataset.__class__.__name__))
            test_dataloader = data.DataLoader(
                test_dataset,
                batch_size=mini_batch_size,
                shuffle=False,
                collate_fn=datasets.utils.batch_encode)
            acc, prec, rec, f1 = self.evaluate(dataloader=test_dataloader)
            accuracies.append(acc)
            precisions.append(prec)
            recalls.append(rec)
            f1s.append(f1)
        logger.info(
            'Overall test metrics: Accuracy = {:.4f}, precision = {:.4f}, recall = {:.4f}, '
            'F1 score = {:.4f}'.format(np.mean(accuracies),
                                       np.mean(precisions), np.mean(recalls),
                                       np.mean(f1s)))
def train_ensemble(H_Bert, H_XLM, dataloader):
    """Train a BERT + XLM ensemble for one pass over ``dataloader``.

    Both encoders' outputs are concatenated along the hidden dimension,
    max-pooled over the sequence, and fed through a 6-way linear head
    trained with BCE-with-logits (multi-label, matching ``batch[2]``).

    NOTE(review): relies on module-level globals ``device`` and ``epochs``.

    :param H_Bert: BERT-style model; assumed to return hidden states of
        shape (batch, seq, hidden) when called with labels — TODO confirm.
    :param H_XLM: XLM-style model with the same calling convention.
    :param dataloader: yields (input_ids, attention_mask, labels) batches.
    :return: mean training loss over the epoch (float).
    """
    train_loss = 0.0
    H_Bert.to(device)
    H_XLM.to(device)
    from transformers import get_linear_schedule_with_warmup
    optimizer_Bert = AdamW(H_Bert.parameters(), lr=5e-5, eps=1e-8)
    optimizer_XLM = AdamW(H_XLM.parameters(), lr=5e-5, eps=1e-8)
    total_steps = len(dataloader) * epochs
    scheduler_Bert = get_linear_schedule_with_warmup(
        optimizer_Bert,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    scheduler_XLM = get_linear_schedule_with_warmup(
        optimizer_XLM,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)
    H_Bert.train()
    H_XLM.train()

    # FIX: build the dropout / classifier head / loss ONCE instead of
    # re-instantiating them on every batch.  The original created a fresh,
    # randomly initialized nn.Linear each step, so the head could never
    # learn.  The head is created lazily on the first batch because its
    # input width depends on the concatenated hidden size.
    # NOTE(review): the head's parameters are still not registered with
    # either optimizer; add them to one if the head itself should train.
    dropout = nn.Dropout(0.1)
    classifier = None
    loss_fct = nn.BCEWithLogitsLoss().to(device)

    for step, batch in enumerate(dataloader):
        if step % 2000 == 0 and not step == 0:
            # Report progress.
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(dataloader)))
        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].float().to(device)
        optimizer_Bert.zero_grad()
        optimizer_XLM.zero_grad()
        Hidden_Bert = H_Bert(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        Hidden_XLM = H_XLM(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        # Concatenate encoder outputs on the hidden axis, then max-pool
        # over the sequence axis.
        Hidden = torch.cat((Hidden_Bert, Hidden_XLM), dim=2).to(device)
        Hidden = dropout(Hidden).permute(0, 2, 1).to(device)
        pooled = F.max_pool1d(Hidden, Hidden.shape[2]).squeeze(2).to(device)
        if classifier is None:
            classifier = nn.Linear(pooled.shape[1], 6).to(device)
        logits = classifier(pooled)
        loss = loss_fct(logits, b_labels)
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(H_Bert.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(H_XLM.parameters(), 1.0)
        optimizer_XLM.step()
        optimizer_Bert.step()
        scheduler_XLM.step()
        scheduler_Bert.step()
        # FIX: accumulate the Python float, not the tensor — keeping the
        # tensor retained every batch's autograd graph and grew memory.
        train_loss += loss.item()
    return train_loss / len(dataloader)
def train_bert(args):
    """Train a ConcatenatedClassifier with periodic validation/checkpoints.

    Streams JSONL train/val files, builds CustomDataset loaders, trains
    with AdamW + linear warmup schedule, and every ``args.valid_interval``
    steps evaluates on the validation set, appending loss and macro /
    class-wise f1/precision/recall to text files under ``args.output_dir``.
    Model state dicts are saved every ``args.checkpoint_interval`` steps.

    :param args: namespace with train_data_file, val_data_file,
        classifier_config_dir, n_clf_layers, use_* flags, max_samples,
        batch_size, output_dir, eps, max_grad_norm, n_epochs,
        num_warmup_steps, lr, valid_interval, checkpoint_interval.
    """
    from transformers import AdamW
    import gc
    from transformers import get_linear_schedule_with_warmup
    from sklearn.metrics import f1_score, precision_score, recall_score
    assert os.path.exists(
        str(args.train_data_file
            )), "The argument --train_data_file should be a valid file"
    assert os.path.exists(
        str(args.val_data_file
            )), "The argument --val_train_data_file should be a valid file"
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # load model
    cc = ConcatenatedClassifier(
        classifier_config_dir=args.classifier_config_dir,
        device=device,
        task_type='train',
        n_clf_layers=args.n_clf_layers,
        use_dm=args.use_dm,
        use_pm=args.use_pm,
        use_rt=args.use_rt,
        use_bio=args.use_bio,
        use_name=args.use_name,
        use_network=args.use_network,
        use_count=args.use_count)
    cc.to(device)
    # load data (JSONL: one JSON object per line)
    train_data = []
    with open(args.train_data_file) as f:
        for line in f:
            train_data.append(json.loads(line))
    val_data = []
    with open(args.val_data_file) as f:
        for line in f:
            val_data.append(json.loads(line))
    train_dataset = CustomDataset(data_list=train_data,
                                  task_type='train',
                                  use_pm=args.use_pm,
                                  use_dm=args.use_dm,
                                  use_rt=args.use_rt,
                                  use_bio=args.use_bio,
                                  use_network=args.use_network,
                                  use_count=args.use_count,
                                  use_name=args.use_name,
                                  max_samples=args.max_samples)
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=train_dataset.collate_fn)
    val_dataset = CustomDataset(data_list=val_data,
                                task_type='test',
                                use_pm=args.use_pm,
                                use_dm=args.use_dm,
                                use_rt=args.use_rt,
                                use_bio=args.use_bio,
                                use_network=args.use_network,
                                use_count=args.use_count,
                                use_name=args.use_name,
                                max_samples=args.max_samples)
    # NOTE(review): uses train_dataset.collate_fn for the validation
    # loader — confirm this is intentional (val_dataset has its own).
    val_data_loader = DataLoader(val_dataset,
                                 batch_size=args.batch_size * 4,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=train_dataset.collate_fn)
    print("Starting evaluation")
    save_dir = args.output_dir
    # create save directory to store models and training loss
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    # hyperparameters
    eps = args.eps
    max_grad_norm = args.max_grad_norm
    num_training_steps = len(train_data_loader) * args.n_epochs
    num_warmup_steps = args.num_warmup_steps
    optimizer = AdamW(
        cc.parameters(), lr=args.lr, eps=eps
    )  # To reproduce BertAdam specific behavior set correct_bias=False
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)  # PyTorch scheduler
    # training progress
    total_steps = 0
    for epoch in range(args.n_epochs):
        # Reset the total loss for this epoch.
        total_loss = 0
        # For each batch of training data...
        pbar = tqdm(train_data_loader)
        for step, batch in enumerate(pbar):
            # validate
            if total_steps % args.valid_interval == 0 and not total_steps == 0:
                gc.collect()
                torch.cuda.empty_cache()
                y_true, y_pred = [], []
                with torch.no_grad():
                    cc.eval()
                    valid_loss = 0
                    total_samples = 0
                    # FIX: the inner loop variable used to be named
                    # ``batch``, shadowing the outer training batch — the
                    # training step after each validation then ran on the
                    # stale last validation batch.  Renamed to val_batch.
                    for val_batch in val_data_loader:
                        y_true.extend(val_batch['text'][0].tolist())
                        loss, logits = cc(val_batch)
                        y_pred.extend(logits.argmax(1).tolist())
                        valid_loss += loss.item() * len(logits)
                        total_samples += len(logits)
                    valid_loss /= total_samples
                    # save valid loss
                    with open(os.path.join(save_dir, 'valid-loss.txt'),
                              'a') as f:
                        f.write('\t'.join([
                            str(x) for x in [total_steps,
                                             round(valid_loss, 4)]
                        ]) + '\n')
                    # compute metrics and save them separately
                    for name, fun in [('f1', f1_score),
                                      ('precision', precision_score),
                                      ('recall', recall_score)]:
                        with open(
                                os.path.join(save_dir,
                                             'valid-%s.txt' % name),
                                'a') as f:
                            f.write('Step %d\n' % total_steps)
                            macro = fun(y_true=y_true,
                                        y_pred=y_pred,
                                        average='macro')
                            classwise = fun(y_true=y_true,
                                            y_pred=y_pred,
                                            average=None)
                            f.write('%1.3f\n' % macro)
                            f.write('\t'.join(
                                [str(round(x, 3))
                                 for x in classwise]) + '\n')
                # clear any memory and gpu
                gc.collect()
                torch.cuda.empty_cache()
                cc.train()
            if total_steps % args.checkpoint_interval == 0 and not total_steps == 0:
                # save models
                torch.save(obj=cc.state_dict(),
                           f=os.path.join(save_dir,
                                          '%d-steps-cc.pth' % (total_steps)))
                # collect garbage
                torch.cuda.empty_cache()
                gc.collect()
            optimizer.zero_grad()
            # compute for one step
            loss, logits = cc(batch)
            # gradient descent and update
            loss.backward()
            torch.nn.utils.clip_grad_norm_(cc.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            pbar.set_description("[%d] Loss at %d/%dth batch: %1.3f" % (int(
                os.getpid()), step + 1, len(train_data_loader), loss.item()))
            with open(os.path.join(save_dir, 'training-loss.txt'), 'a') as f:
                f.write('\t'.join(
                    [str(x)
                     for x in [step, round(loss.item(), 4)]]) + '\n')
            total_steps += 1
        print("Epoch %d complete! %d steps" % (epoch, total_steps))
    return
def train(data_path, model_path=None, train_size=0.9, model_params=None):
    """Train a CNNText classifier on pre-trained word embeddings.

    Loads embeddings and a tab-separated dataset, trains with AdamW +
    linear warmup schedule, evaluates after every epoch, prints final
    test accuracy, and optionally saves the model.

    :param data_path: path to the tab-separated training file.
    :param model_path: if not None, save the trained model artifacts here.
    :param train_size: fraction of data used for training (rest is test).
    :param model_params: finetune, label_weights, dropout,
        pre_trained_model_path, batch_size, lr, eps, epochs
    """
    set_seed(2020)
    # updates params: any missing key falls back to a default below.
    if not isinstance(model_params, dict):
        model_params = dict()
    finetune = model_params.get("finetune", True)
    label_weights = model_params.get("label_weights", None)
    dropout = model_params.get("dropout", 0.3)
    embedding_dim = model_params.get("embedding_dim", 200)
    vocab_path = model_params.get("vocab_path", VOCAB_PATH)
    emb_path = model_params.get("emb_path", EMB_PATH)
    emb_sep = model_params.get("emb_sep", " ")
    kernel_size = model_params.get("kernel_size", [3, 4, 5])
    kernel_num = model_params.get("kernel_num", 100)
    batch_size = model_params.get("batch_size", 64)
    lr = model_params.get("lr", 1e-4)
    eps = model_params.get("eps", 1e-8)
    epochs = model_params.get("epochs", 5)
    n_jobs = model_params.get("n_jobs", 1)
    # load training data
    emb_datasets = EmbeddingDatasets(embedding_path=emb_path,
                                     vocab_path=vocab_path,
                                     embedding_dim=embedding_dim,
                                     sep=emb_sep)
    cnn_datasets = CNNTextDatasets(path=data_path,
                                   sep="\t",
                                   vocab2idx=emb_datasets.vocab2index)
    train_data, test_data = cnn_datasets.load(n_jobs=n_jobs,
                                              train_size=train_size,
                                              batch_size=batch_size)
    total_steps = len(train_data) * epochs
    # init the albert model
    model = CNNText(num_labels=cnn_datasets.num_labels,
                    finetune=finetune,
                    label_weights=label_weights,
                    dropout=dropout,
                    embeddings=emb_datasets.embeddings,
                    kernel_size=kernel_size,
                    kernel_num=kernel_num)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr, eps=eps)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    # Print all parameter shapes/devices — useful to verify placement.
    for name, param in model.named_parameters():
        print(name, param.shape, param.device, param.requires_grad)
    for epoch in range(epochs):
        start_time = time.time()
        # training
        model.train()
        train_loss = 0
        avg_train_loss = 0
        # NOTE: loop variable ``train`` shadows this function's name.
        for i, train in enumerate(train_data):
            train_input_ids = train[0].to(device)
            train_labels = train[1].to(device)
            logits, loss = model(sent_ids=train_input_ids,
                                 labels=train_labels)
            train_loss += loss.item()
            avg_train_loss = train_loss / (i + 1)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            batch_print("[Epoch] \033[34m{:0>3d}\033[0m".format(epoch),
                        "[Batch] \033[34m{:0>5d}\033[0m".format(i),
                        "[lr] \033[34m{:0>.6f}\033[0m".format(scheduler.get_lr()[0]),
                        "[avg train loss] \033[34m{:0>.4f}\033[0m".format(avg_train_loss),
                        "[time] \033[34m{:<.0f}s\033[0m".format(time.time() - start_time),
                        flag="batch")
        # on-time evaluate model
        model.eval()
        test_loss = 0
        avg_test_loss = 0
        pred_labels, test_labels = [], []
        for i, test in enumerate(test_data):
            test_input_ids = test[0].to(device)
            test_label = test[1].to(device)
            with torch.no_grad():
                pred_label, loss = model(sent_ids=test_input_ids,
                                         labels=test_label)
            # Collect argmax class indices on CPU for the final accuracy.
            pred_labels.append(torch.argmax(pred_label.cpu(), -1).float())
            test_labels.append(torch.argmax(test_label.cpu(), -1))
            test_loss += loss
            avg_test_loss = test_loss / (i + 1)
        batch_print("[Epoch] \033[34m{:0>3d}\033[0m".format(epoch),
                    "[lr] \033[34m{:0>.6f}\033[0m".format(scheduler.get_lr()[0]),
                    "[avg train loss] \033[34m{:0>.4f}\033[0m".format(avg_train_loss),
                    "[avg test lost] \033[34m{:>0.4f}\033[0m".format(avg_test_loss),
                    "[time] \033[34m{:<.0f}s\033[0m".format(time.time() - start_time),
                    flag="epoch")
        # Report accuracy only after the final epoch.
        if epoch == epochs - 1:
            acc = accuracy_score(torch.cat(pred_labels, dim=-1).numpy(),
                                 torch.cat(test_labels, dim=-1).numpy())
            print("The model test accuracy is: \033[34m{:.5}\033[0m".format(acc))
    # save model
    if model_path is not None:
        _save_model(path=model_path,
                    model=model,
                    vocab2index=emb_datasets.vocab2index,
                    max_length=cnn_datasets.max_length,
                    label2index=cnn_datasets.label2idx)
class SeqPrototypicalNetwork(nn.Module):
    """Prototypical network for sequence labeling over few-shot episodes.

    A learner encoder (RNN / MLP / BERT) maps tokens to representations;
    class prototypes are built from each episode's support set and query
    tokens are classified by (negative squared) distance to prototypes.
    """

    def __init__(self, config):
        """Build learner, vectorizer (elmo/glove/bert), losses, optimizer.

        :param config: dict with base_path, early_stopping, meta_lr,
            meta_weight_decay, learner_model, learner_params, vectors,
            trained_learner, device.
        """
        super(SeqPrototypicalNetwork, self).__init__()
        self.base_path = config['base_path']
        self.early_stopping = config['early_stopping']
        self.lr = config.get('meta_lr', 1e-3)
        self.weight_decay = config.get('meta_weight_decay', 0.0)
        # Select the learner encoder by substring of the model name.
        if 'seq' in config['learner_model']:
            self.learner = RNNSequenceModel(config['learner_params'])
        elif 'mlp' in config['learner_model']:
            self.learner = MLPModel(config['learner_params'])
        elif 'bert' in config['learner_model']:
            self.learner = BERTSequenceModel(config['learner_params'])
        self.num_outputs = config['learner_params']['num_outputs']
        self.vectors = config.get('vectors', 'glove')
        # Input featurizer: frozen ELMo, GloVe lookup, or BERT token ids.
        if self.vectors == 'elmo':
            self.elmo = Elmo(
                options_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
                weight_file=
                "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
                num_output_representations=1,
                dropout=0,
                requires_grad=False)
        elif self.vectors == 'glove':
            self.glove = torchtext.vocab.GloVe(name='840B', dim=300)
        elif self.vectors == 'bert':
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                'bert-base-cased')
        # One loss per task; -1 labels (padding) are ignored.
        self.loss_fn = {}
        for task in config['learner_params']['num_outputs']:
            self.loss_fn[task] = nn.CrossEntropyLoss(ignore_index=-1)
        if config.get('trained_learner', False):
            self.learner.load_state_dict(
                torch.load(
                    os.path.join(self.base_path, 'saved_models',
                                 config['trained_learner'])))
            logger.info('Loaded trained learner model {}'.format(
                config['trained_learner']))
        self.device = torch.device(config.get('device', 'cpu'))
        self.to(self.device)
        if self.vectors == 'elmo':
            self.elmo.to(self.device)
        self.initialize_optimizer_scheduler()

    def initialize_optimizer_scheduler(self):
        """Create optimizer + LR scheduler for trainable learner params.

        BERT learners get AdamW with constant-after-warmup schedule;
        other learners get Adam with a StepLR decay.
        """
        learner_params = [
            p for p in self.learner.parameters() if p.requires_grad
        ]
        if isinstance(self.learner, BERTSequenceModel):
            self.optimizer = AdamW(learner_params,
                                   lr=self.lr,
                                   weight_decay=self.weight_decay)
            self.lr_scheduler = get_constant_schedule_with_warmup(
                self.optimizer, num_warmup_steps=100)
        else:
            self.optimizer = optim.Adam(learner_params,
                                        lr=self.lr,
                                        weight_decay=self.weight_decay)
            self.lr_scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                          step_size=500,
                                                          gamma=0.5)

    def vectorize(self, batch_x, batch_len, batch_y):
        """Turn raw token batches into tensors per the configured vectors.

        :param batch_x: list of token sequences.
        :param batch_len: list of sequence lengths.
        :param batch_y: label sequences.
        :return: (batch_x tensor, batch_len tensor, batch_y tensor) on
            self.device.  For 'bert', batch_x holds token ids (padded with
            0; +2 for [CLS]/[SEP]); otherwise dense embeddings.
        """
        with torch.no_grad():
            if self.vectors == 'elmo':
                char_ids = batch_to_ids(batch_x)
                char_ids = char_ids.to(self.device)
                batch_x = self.elmo(char_ids)['elmo_representations'][0]
            elif self.vectors == 'glove':
                max_batch_len = max(batch_len)
                # Padding positions keep the initial value of ones.
                vec_batch_x = torch.ones((len(batch_x), max_batch_len, 300))
                for i, sent in enumerate(batch_x):
                    sent_emb = self.glove.get_vecs_by_tokens(
                        sent, lower_case_backup=True)
                    vec_batch_x[i, :len(sent_emb)] = sent_emb
                batch_x = vec_batch_x.to(self.device)
            elif self.vectors == 'bert':
                max_batch_len = max(batch_len) + 2
                input_ids = torch.zeros((len(batch_x), max_batch_len)).long()
                for i, sent in enumerate(batch_x):
                    sent_token_ids = self.bert_tokenizer.encode(
                        sent, add_special_tokens=True)
                    input_ids[i, :len(sent_token_ids)] = torch.tensor(
                        sent_token_ids)
                batch_x = input_ids.to(self.device)
            batch_len = torch.tensor(batch_len).to(self.device)
            batch_y = torch.tensor(batch_y).to(self.device)
        return batch_x, batch_len, batch_y

    def forward(self, episodes, updates=1, testing=False):
        """Run all episodes: build prototypes from support, score query.

        When ``testing`` is False, also backprops the query loss through
        the learner (optimizer + scheduler step per query batch).

        :param episodes: iterable of episode objects with support_loader,
            query_loader, n_classes, base_task, task_id.
        :param updates: unused here; kept for interface compatibility.
        :param testing: if True, no parameter updates are performed.
        :return: tuple of per-episode lists (losses, accuracies,
            precisions, recalls, f1s).
        """
        query_losses, query_accuracies, query_precisions, query_recalls, query_f1s = [], [], [], [], []
        n_episodes = len(episodes)
        for episode_id, episode in enumerate(episodes):
            # One support batch per episode builds the prototypes.
            batch_x, batch_len, batch_y = next(iter(episode.support_loader))
            batch_x, batch_len, batch_y = self.vectorize(
                batch_x, batch_len, batch_y)
            self.train()
            support_repr, support_label = [], []
            batch_x_repr = self.learner(batch_x, batch_len)
            support_repr.append(batch_x_repr)
            support_label.append(batch_y)
            prototypes = self._build_prototypes(support_repr, support_label,
                                                episode.n_classes)
            # Run on query
            query_loss = 0.0
            all_predictions, all_labels = [], []
            # Disable dropout only (rest of the model stays in train mode).
            for module in self.learner.modules():
                if isinstance(module, nn.Dropout):
                    module.eval()
            for n_batch, (batch_x, batch_len,
                          batch_y) in enumerate(episode.query_loader):
                batch_x, batch_len, batch_y = self.vectorize(
                    batch_x, batch_len, batch_y)
                batch_x_repr = self.learner(batch_x, batch_len)
                output = self._normalized_distances(prototypes, batch_x_repr)
                # Flatten (batch, seq, classes) -> (batch*seq, classes).
                output = output.view(output.size()[0] * output.size()[1], -1)
                batch_y = batch_y.view(-1)
                loss = self.loss_fn[episode.base_task](output, batch_y)
                query_loss += loss.item()
                if not testing:
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    self.lr_scheduler.step()
                # Ignore padding positions (label -1) in the metrics.
                relevant_indices = torch.nonzero(
                    batch_y != -1).view(-1).detach()
                all_predictions.extend(
                    make_prediction(output[relevant_indices]).cpu())
                all_labels.extend(batch_y[relevant_indices].cpu())
            query_loss /= n_batch + 1
            # Calculate metrics
            accuracy, precision, recall, f1_score = utils.calculate_metrics(
                all_predictions, all_labels, binary=False)
            logger.info(
                'Episode {}/{}, task {} [query set]: Loss = {:.5f}, accuracy = {:.5f}, precision = {:.5f}, '
                'recall = {:.5f}, F1 score = {:.5f}'.format(
                    episode_id + 1, n_episodes, episode.task_id, query_loss,
                    accuracy, precision, recall, f1_score))
            query_losses.append(query_loss)
            query_accuracies.append(accuracy)
            query_precisions.append(precision)
            query_recalls.append(recall)
            query_f1s.append(f1_score)
        return query_losses, query_accuracies, query_precisions, query_recalls, query_f1s

    def _build_prototypes(self, data_repr, data_label, num_outputs):
        """Mean representation per class over the support set.

        :param data_repr: list of (batch, seq, dim) representation tensors.
        :param data_label: list of matching label tensors.
        :param num_outputs: number of classes (rows of the result).
        :return: (num_outputs, dim) tensor; rows stay zero for classes
            absent from the support set.
        """
        n_dim = data_repr[0].shape[2]
        data_repr = torch.cat(tuple([x.view(-1, n_dim) for x in data_repr]),
                              dim=0)
        data_label = torch.cat(tuple([y.view(-1) for y in data_label]), dim=0)
        prototypes = torch.zeros((num_outputs, n_dim), device=self.device)
        for c in range(num_outputs):
            idx = torch.nonzero(data_label == c).view(-1)
            if idx.nelement() != 0:
                prototypes[c] = torch.mean(data_repr[idx], dim=0)
        return prototypes

    def _normalized_distances(self, prototypes, q):
        """Negative squared Euclidean distance of each q to each prototype.

        Larger (less negative) scores mean closer prototypes, so the
        result can be used directly as classification logits.
        """
        d = torch.stack(tuple(
            [q.sub(p).pow(2).sum(dim=-1) for p in prototypes]),
                        dim=-1)
        return d.neg()
class DialoGPT:
    """Fine-tuning and generation wrapper around Microsoft's DialoGPT.

    Handles tokenization, masked LM-loss over alternating speaker turns,
    epoch training with periodic dev evaluation, best-model checkpointing,
    and beam-search response generation.  All tensors go to CUDA.
    """

    def __init__(self, model_size):
        """Load tokenizer and model for the given size.

        :param model_size: e.g. 'small'/'medium'/'large'; medium/large are
            sharded across 2/4 GPUs via GPT2LMHeadModelMultiDevicesWrapper.
        """
        self._tokenizer = AutoTokenizer.from_pretrained(
            f"microsoft/DialoGPT-{model_size}")
        if model_size in ['medium', 'large']:
            devices = [
                f'cuda:{i % torch.cuda.device_count()}'
                for i in range(2 if model_size == 'medium' else 4)
            ]
            self._model = GPT2LMHeadModelMultiDevicesWrapper(model_size,
                                                             devices=devices)
        else:
            self._model = AutoModelWithLMHead.from_pretrained(
                f"microsoft/DialoGPT-{model_size}").to('cuda')
        # Training state; populated by get_optimizer / creat_log_dir.
        self._optimizer = None
        self._lr_scheduler = None
        self._global_step = 0
        self._dataset = {}
        self._eval_steps = None
        self._log_dir = None
        self._log_file = None
        self._best_dev_loss = None

    def creat_log_dir(self, eval_steps, label):
        """Create log/model/generation directories and open the log file.

        NOTE: method name 'creat_log_dir' (sic) kept for compatibility.
        """
        self._log_dir = f'{label}_training_logs'
        self._eval_steps = eval_steps
        self._best_dev_loss = float('inf')
        os.makedirs(os.path.join(self._log_dir, 'models'), exist_ok=True)
        os.makedirs(os.path.join(self._log_dir, 'generations'), exist_ok=True)
        self._log_file = open(os.path.join(self._log_dir, 'log.txt'), 'w')

    def save_model(self, path):
        """Write the model state dict to ``path``."""
        torch.save(self._model.state_dict(), path)
        print(f'Model saved in {path}.')

    def load_model(self, path):
        """Load a state dict saved by ``save_model`` (mapped to CUDA)."""
        self._model.load_state_dict(torch.load(path, map_location='cuda'))
        print(f'Model {path} loaded.')

    def get_optimizer(self, lr, train_steps, warmup_steps, weight_decay,
                      adam_epsilon):
        """Build AdamW (no weight decay on bias/LayerNorm) + warmup schedule."""
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in self._model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            weight_decay
        }, {
            "params": [
                p for n, p in self._model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        }]
        self._optimizer = AdamW(optimizer_grouped_parameters,
                                lr=lr,
                                eps=adam_epsilon)
        self._lr_scheduler = get_linear_schedule_with_warmup(
            self._optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=train_steps)

    def load_data(self, split, dialogues, max_length=1024):
        """Tokenize dialogues into ``self._dataset[split]``.

        Each utterance is terminated with EOS; the mask marks every other
        speaker's tokens (i % 2) so the loss only covers those turns.
        Sequences are truncated to ``max_length`` tokens.
        """
        self._dataset[split] = []
        for dialogue in dialogues:
            tokens, mask = [], []
            for i, utterance in enumerate(dialogue):
                token_ids = self._tokenizer.encode(utterance +
                                                   self._tokenizer.eos_token)
                tokens.extend(token_ids)
                mask.extend([i % 2] * len(token_ids))
            tokens = tokens[:max_length]
            mask = mask[:max_length]
            self._dataset[split].append(
                TextDataExample(dialogue=dialogue, tokens=tokens, mask=mask))

    def train_epoch(self, batch_size):
        """One shuffled pass over the train split.

        Gradients are accumulated example-by-example (loss / batch_size)
        before a single optimizer step per batch; evaluates + logs every
        ``self._eval_steps`` global steps.
        """
        assert 'train' in self._dataset
        self._model.train()
        random.shuffle(self._dataset['train'])
        for i in trange(0, len(self._dataset['train']), batch_size,
                        desc='Training Epoch'):
            batch = self._dataset['train'][i:i + batch_size]
            self._optimizer.zero_grad()
            for example in batch:
                loss = self._get_loss(example) / batch_size
                loss.backward()
            self._optimizer.step()
            self._lr_scheduler.step()
            self._global_step += 1
            if self._global_step % self._eval_steps == 0:
                self.gen_log()

    def evaluate(self):
        """Mean masked LM loss over the dev split (no gradients)."""
        assert 'dev' in self._dataset
        self._model.eval()
        loss_list = []
        for example in self._dataset['dev']:
            with torch.no_grad():
                loss = self._get_loss(example)
            loss_list.append(loss.item())
        return sum(loss_list) / len(loss_list)

    def _get_loss(self, example):
        """Masked next-token cross-entropy for one tokenized example.

        Logits/labels are shifted by one so position t predicts token t+1;
        per-token losses are averaged over positions where the speaker
        mask is 1.
        """
        inputs = torch.tensor([example.tokens]).to(device='cuda')
        logits = self._model(inputs)[0]
        # Shift so that tokens < n predict n
        shift_logits = logits[:, :-1].contiguous()
        shift_labels = inputs[:, 1:].contiguous()
        shift_masks = torch.tensor(example.mask[1:],
                                   dtype=torch.float).to('cuda')
        criterion = nn.CrossEntropyLoss(reduction='none')
        loss = criterion(input=shift_logits.view(-1, shift_logits.size(-1)),
                         target=shift_labels.view(-1))
        return torch.sum(loss * shift_masks) / torch.sum(shift_masks)

    def generate(self, chat_history):
        """Beam-search a response continuing ``chat_history``.

        :param chat_history: list of utterance strings (EOS appended to
            each before encoding).
        :return: decoded generated text (history prefix stripped).
        """
        history_ids = []
        for sent in chat_history:
            history_ids.extend(
                self._tokenizer.encode(sent + self._tokenizer.eos_token))
        history_ids = torch.tensor([history_ids]).to('cuda')
        gen_ids = self._model.generate(
            history_ids,
            num_beams=10,
            length_penalty=2.0,
            max_length=min(1024, history_ids.shape[-1] + 200),
            min_length=history_ids.shape[-1] + 50,
            no_repeat_ngram_size=3,
            pad_token_id=self._tokenizer.eos_token_id,
        )
        return self._tokenizer.decode(gen_ids[0, history_ids.shape[-1]:],
                                      skip_special_tokens=True)

    def gen_log(self):
        """Evaluate, checkpoint on dev-loss improvement, dump 20 samples.

        Writes eval loss to the log file, saves the best model, and writes
        generations vs. ground truth for the first 20 dev dialogues into a
        per-step file under ``generations/``.
        """
        eval_loss = self.evaluate()
        print(f'Global Step: {self._global_step}, Eval Loss: {eval_loss}',
              file=self._log_file)
        if eval_loss < self._best_dev_loss:
            self._best_dev_loss = eval_loss
            self.save_model(f'{self._log_dir}/best_model.pt')
            print('Best Model Updated.', file=self._log_file)
        self._log_file.flush()
        generation_file = open(
            f'{self._log_dir}/generations/step{self._global_step}.txt', 'w')
        for i in range(20):
            # Last utterance is held out as the ground truth.
            chat_history = self._dataset['dev'][i].dialogue[:-1]
            truth_text = self._dataset['dev'][i].dialogue[-1]
            with torch.no_grad():
                gen_text = self.generate(chat_history)
            print('CHAT_HISTORY:\n', file=generation_file)
            for u in chat_history:
                print('>>>', u, file=generation_file)
            print('-' * 50,
                  f'\nGENERATION: {gen_text}\n',
                  '-' * 50,
                  file=generation_file)
            print('-' * 50,
                  f'\nTRUTH: {truth_text}\n',
                  '=' * 50,
                  '\n\n',
                  file=generation_file)
            generation_file.flush()

    @property
    def datasets(self):
        """Dict of loaded splits (e.g. 'train', 'dev')."""
        return self._dataset

    @property
    def get_lr(self):
        """Current learning rate(s) from the scheduler."""
        return self._lr_scheduler.get_lr()
def main():
    """Fine-tune a seq2seq translation model with Hugging Face Accelerate.

    Parses CLI args, loads the dataset/config/tokenizer/model, tokenizes the
    corpus, trains with gradient accumulation, evaluates with sacreBLEU after
    every epoch, and optionally pushes checkpoints to the Hub.
    """
    # Parse the arguments
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: either a hub dataset name or local CSV/JSON/TXT files.
    # In distributed training, load_dataset guarantees only one local process
    # downloads the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)

    # Load pretrained config/tokenizer/model. In distributed training the
    # .from_pretrained methods guarantee only one local process downloads.
    if args.config_name:
        # BUG FIX: previously loaded from args.model_name_or_path here,
        # silently ignoring an explicit --config_name.
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForSeq2SeqLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id (mBART requires an explicit target language).
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        assert (
            args.target_lang is not None and args.source_lang is not None
        ), "mBart requires --target_lang and --source_lang"
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(args.target_lang)

    if model.config.decoder_start_token_id is None:
        raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

    prefix = args.source_prefix if args.source_prefix is not None else ""

    # Preprocessing the datasets: first we tokenize all the texts.
    column_names = raw_datasets["train"].column_names

    # For translation we set the codes of our source and target languages
    # (only useful for mBART, the others will ignore those attributes).
    if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        if args.source_lang is not None:
            tokenizer.src_lang = args.source_lang
        if args.target_lang is not None:
            tokenizer.tgt_lang = args.target_lang

    # Get the language codes for input/target.
    source_lang = args.source_lang.split("_")[0]
    target_lang = args.target_lang.split("_")[0]

    # Temporarily set max_target_length for training.
    max_target_length = args.max_target_length
    # (a duplicate assignment of `padding` was removed — computed once here)
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the
        # labels by -100 when we want to ignore padding in the loss.
        if padding == "max_length" and args.ignore_pad_token_for_loss:
            labels["input_ids"] = [
                [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
            ]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data
        # collator that will just convert everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForSeq2Seq` will apply dynamic padding for
        # us. Under mixed precision, pad to multiples of 8 to enable Tensor
        # Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if accelerator.use_fp16 else None,
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab its
    # length below (cause its length will be shorter in multiprocess).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    metric = load_metric("sacrebleu")

    def postprocess_text(preds, labels):
        # sacreBLEU expects a list of references per prediction.
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]
        return preds, labels

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # Only step on accumulation boundaries (or the last batch).
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()

        if args.val_max_target_length is None:
            args.val_max_target_length = args.max_target_length

        gen_kwargs = {
            # val_max_target_length is guaranteed non-None by the assignment
            # above (the previous `args is not None` guard was always True).
            "max_length": args.val_max_target_length,
            "num_beams": args.num_beams,
        }
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    **gen_kwargs,
                )

                generated_tokens = accelerator.pad_across_processes(
                    generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
                )
                labels = batch["labels"]
                if not args.pad_to_max_length:
                    # If we did not pad to max length, we need to pad the labels too
                    labels = accelerator.pad_across_processes(batch["labels"], dim=1, pad_index=tokenizer.pad_token_id)

                generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
                labels = accelerator.gather(labels).cpu().numpy()

                if args.ignore_pad_token_for_loss:
                    # Replace -100 in the labels as we can't decode them.
                    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

                decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

                decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

                metric.add_batch(predictions=decoded_preds, references=decoded_labels)
        eval_metric = metric.compute()
        logger.info({"bleu": eval_metric["score"]})

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
def bert_train_lm(model, device, train_dataloader, output_dir, num_epochs,
                  warmup_proportion, weight_decay, learning_rate, adam_epsilon,
                  save_best=False, eval_dataloader=None):
    """Training loop for bert fine-tuning. Save best works with F1 only currently.

    Runs MLM + next-sentence fine-tuning with a linear warmup schedule.
    When save_best is True, evaluates perplexity after every epoch (requires
    eval_dataloader), checkpoints on improvement, and early-stops after two
    consecutive non-improving epochs.

    Returns:
        (tr_loss_track, num_iterations): per-epoch mean training losses and
        the number of completed epochs.
    """
    t_total = len(train_dataloader) * num_epochs
    warmup_steps = len(train_dataloader) * warmup_proportion

    # No weight decay for biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    train_iterator = trange(int(num_epochs), desc="Epoch")
    model.to(device)
    tr_loss_track = []
    num_iterations = 0
    early_stopping = 0
    output_filename = os.path.join(output_dir, 'pytorch_model.bin')
    perplexity_history = float('inf')

    for _ in train_iterator:
        model.train()
        model.zero_grad()
        tr_loss = 0
        nr_batches = 0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            input_ids, input_mask, labels, mlm_labels = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            labels = labels.to(device)
            mlm_labels = mlm_labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=input_mask,
                            masked_lm_labels=mlm_labels,
                            next_sentence_label=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            # BUG FIX: this accumulation was commented out, so the returned
            # per-epoch loss was always 0 / nr_batches == 0.
            tr_loss += loss.detach().item()
            nr_batches += 1
            model.zero_grad()
            # Free per-batch tensors early to reduce GPU memory pressure.
            del loss
            del input_ids
            del input_mask
            del labels
            del mlm_labels
        if save_best:
            if eval_dataloader is None:
                print("Please provide evaluation data.")
                sys.exit()
            perplexity = bert_evaluate_lm(model, eval_dataloader, device)
            # (removed leftover debug print of type(perplexity))
            # BUG FIX: compare early stopping against the PRE-update best.
            # Previously perplexity_history was overwritten before these
            # checks, which made the early-stopping reset branch unreachable.
            prev_best = perplexity_history
            if perplexity_history > perplexity:
                model.save_pretrained(output_dir)
                torch.save(model.state_dict(), output_filename)
                print("The new value of perplexity of " + str(perplexity) +
                      " is lower then the old value of " +
                      str(perplexity_history) + ".")
                print("Saving the new model...")
                perplexity_history = perplexity
            else:
                print("The new value of perplexity of " + str(perplexity) +
                      " is not lower then the old value of " +
                      str(perplexity_history) + ".")
            if (prev_best < perplexity) and early_stopping == 1:
                # Second consecutive non-improving epoch: stop training.
                break
            elif (prev_best < perplexity) and early_stopping == 0:
                early_stopping = 1
            elif (prev_best > perplexity) and early_stopping == 1:
                early_stopping = 0
        tr_loss = tr_loss / nr_batches
        tr_loss_track.append(tr_loss)
        num_iterations += 1

    if not save_best:
        model.save_pretrained(output_dir)
        # tokenizer.save_pretrained(output_dir)
        torch.save(model.state_dict(), output_filename)
    return tr_loss_track, num_iterations
def bert_train(model, device, train_dataloader, eval_dataloader, output_dir,
               num_epochs, warmup_proportion, weight_decay, learning_rate,
               adam_epsilon, save_best=False):
    """Training loop for bert fine-tuning. Save best works with F1 only currently.

    Fine-tunes a sequence-classification model with a linear warmup schedule,
    evaluating after every epoch. When save_best is True, checkpoints only
    when the evaluation F1 improves; otherwise saves once at the end.

    Returns:
        (tr_loss_track, eval_metric_track): per-epoch mean training losses
        and per-epoch evaluation metric dicts.
    """
    t_total = len(train_dataloader) * num_epochs
    warmup_steps = len(train_dataloader) * warmup_proportion

    # No weight decay for biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

    train_iterator = trange(int(num_epochs), desc="Epoch")
    model.to(device)
    tr_loss_track = []
    eval_metric_track = []
    output_filename = os.path.join(output_dir, 'pytorch_model.bin')
    f1 = float('-inf')

    for _ in train_iterator:
        model.train()
        model.zero_grad()
        tr_loss = 0
        nr_batches = 0
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            # BUG FIX: `tr_loss = 0` used to be re-executed here for every
            # batch, so the tracked "epoch loss" was only
            # last_batch_loss / nr_batches. The accumulator is now reset
            # once per epoch (above) as intended.
            input_ids, input_mask, labels = batch
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=input_mask,
                            labels=labels)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            tr_loss += loss.item()
            nr_batches += 1
            model.zero_grad()
        print("Evaluating the model on the evaluation split...")
        metrics = bert_evaluate(model, eval_dataloader, device)
        eval_metric_track.append(metrics)
        if save_best:
            if f1 < metrics['f1']:
                model.save_pretrained(output_dir)
                torch.save(model.state_dict(), output_filename)
                print("The new value of f1 score of " + str(metrics['f1']) +
                      " is higher then the old value of " + str(f1) + ".")
                print("Saving the new model...")
                f1 = metrics['f1']
            else:
                print("The new value of f1 score of " + str(metrics['f1']) +
                      " is not higher then the old value of " + str(f1) + ".")
        tr_loss = tr_loss / nr_batches
        tr_loss_track.append(tr_loss)

    if not save_best:
        model.save_pretrained(output_dir)
        torch.save(model.state_dict(), output_filename)
    return tr_loss_track, eval_metric_track
def main():
    """Train a dual-encoder XMC model (ICT / self-train / finetune modes)
    with Accelerate, AMP, and periodic evaluation + clustering."""
    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    args = parse_args()
    distributed_args = accelerate.DistributedDataParallelKwargs(
        find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[distributed_args])
    device = accelerator.device
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        filename=f'xmc_{args.dataset}_{args.mode}_{args.log}.log',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)
    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    ch = logging.StreamHandler(sys.stdout)
    logger.addHandler(ch)
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
    logger.info(sent_trans.__file__)
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
    # Load pretrained model and tokenizer
    if args.model_name_or_path == 'bert-base-uncased' or args.model_name_or_path == 'sentence-transformers/paraphrase-mpnet-base-v2':
        query_encoder = build_encoder(
            args.model_name_or_path,
            args.max_label_length,
            args.pooling_mode,
            args.proj_emb_dim,
        )
    else:
        query_encoder = sent_trans.SentenceTransformer(args.model_name_or_path)
    tokenizer = query_encoder._first_module().tokenizer
    # Query and block encoders share weights (same object).
    block_encoder = query_encoder
    model = DualEncoderModel(query_encoder, block_encoder, args.mode)
    model = model.to(device)
    # the whole label set
    data_path = os.path.join(os.path.abspath(os.getcwd()), 'dataset',
                             args.dataset)
    all_labels = pd.read_json(os.path.join(data_path, 'lbl.json'), lines=True)
    label_list = list(all_labels.title)
    label_ids = list(all_labels.uid)
    label_data = SimpleDataset(label_list, transform=tokenizer.encode)
    # label dataloader for searching
    sampler = SequentialSampler(label_data)
    label_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 64)
    label_dataloader = DataLoader(label_data,
                                  sampler=sampler,
                                  batch_size=16,
                                  collate_fn=label_padding_func)
    # label dataloader for regularization
    reg_sampler = RandomSampler(label_data)
    reg_dataloader = DataLoader(label_data,
                                sampler=reg_sampler,
                                batch_size=4,
                                collate_fn=label_padding_func)
    # Build the training dataset according to the selected mode.
    if args.mode == 'ict':
        train_data = ICTXMCDataset(tokenizer=tokenizer, dataset=args.dataset)
    elif args.mode == 'self-train':
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode)
    elif args.mode == 'finetune-pair':
        # Sample a ratio of (instance, label) positive pairs from trn.json.
        train_path = os.path.join(data_path, 'trn.json')
        pos_pair = []
        with open(train_path) as fp:
            for i, line in enumerate(fp):
                inst = json.loads(line.strip())
                inst_id = inst['uid']
                for ind in inst['target_ind']:
                    pos_pair.append((inst_id, ind, i))
        dataset_size = len(pos_pair)
        indices = list(range(dataset_size))
        split = int(np.floor(args.ratio * dataset_size))
        np.random.shuffle(indices)
        train_indices = indices[:split]
        # Broadcast rank 0's sampled indices so all processes agree.
        torch.distributed.broadcast_object_list(train_indices,
                                                src=0,
                                                group=None)
        sample_pairs = [pos_pair[i] for i in train_indices]
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)
    elif args.mode == 'finetune-label':
        # Sample a ratio of labels and take all their instances as pairs.
        label_index = []
        label_path = os.path.join(data_path, 'label_index.json')
        with open(label_path) as fp:
            for line in fp:
                label_index.append(json.loads(line.strip()))
        np.random.shuffle(label_index)
        sample_size = int(np.floor(args.ratio * len(label_index)))
        sample_label = label_index[:sample_size]
        torch.distributed.broadcast_object_list(sample_label,
                                                src=0,
                                                group=None)
        sample_pairs = []
        for i, label in enumerate(sample_label):
            ind = label['ind']
            for inst_id in label['instance']:
                sample_pairs.append((inst_id, ind, i))
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)
    train_sampler = RandomSampler(train_data)
    padding_func = lambda x: ICT_batchify(x, tokenizer.pad_token_id, 64, 288)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        sampler=train_sampler,
        batch_size=args.per_device_train_batch_size,
        num_workers=4,
        pin_memory=False,
        collate_fn=padding_func)
    # Try a pre-tokenized cache of all passages; rebuild from JSON otherwise.
    try:
        accelerator.print("load cache")
        all_instances = torch.load(
            os.path.join(data_path, 'all_passages_with_titles.json.cache.pt'))
        test_data = SimpleDataset(all_instances.values())
    except:
        # NOTE(review): bare `except:` swallows everything (incl.
        # KeyboardInterrupt); also `test_ids` is only assigned in this
        # branch yet is used later — confirm the cache-hit path is dead
        # or that test_ids is recoverable there.
        all_instances = {}
        test_path = os.path.join(data_path, 'tst.json')
        if args.mode == 'ict':
            train_path = os.path.join(data_path, 'trn.json')
            train_instances = {}
            valid_passage_ids = train_data.valid_passage_ids
            with open(train_path) as fp:
                for line in fp:
                    inst = json.loads(line.strip())
                    train_instances[
                        inst['uid']] = inst['title'] + '\t' + inst['content']
            for inst_id in valid_passage_ids:
                all_instances[inst_id] = train_instances[inst_id]
        test_ids = []
        with open(test_path) as fp:
            for line in fp:
                inst = json.loads(line.strip())
                all_instances[
                    inst['uid']] = inst['title'] + '\t' + inst['content']
                test_ids.append(inst['uid'])
        simple_transform = lambda x: tokenizer.encode(
            x, max_length=288, truncation=True)
        test_data = SimpleDataset(list(all_instances.values()),
                                  transform=simple_transform)
    inst_num = len(test_data)
    sampler = SequentialSampler(test_data)
    sent_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 288)
    instance_dataloader = DataLoader(test_data,
                                     sampler=sampler,
                                     batch_size=128,
                                     collate_fn=sent_padding_func)
    # prepare pairs
    reader = csv.reader(open(os.path.join(data_path, 'all_pairs.txt'),
                             encoding="utf-8"),
                        delimiter=" ")
    # qrels: query_id -> {corpus_id: relevance score} for evaluation.
    qrels = {}
    for id, row in enumerate(reader):
        query_id, corpus_id, score = row[0], row[1], int(row[2])
        if query_id not in qrels:
            qrels[query_id] = {corpus_id: score}
        else:
            qrels[query_id][corpus_id] = score
    logging.info("| |ICT_dataset|={} pairs.".format(len(train_data)))
    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)
    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, label_dataloader, reg_dataloader, instance_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, label_dataloader, reg_dataloader,
        instance_dataloader)
    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    # args.max_train_steps = 100000
    args.num_train_epochs = math.ceil(args.max_train_steps /
                                      num_update_steps_per_epoch)
    # 10% of the total steps are used for LR warmup.
    args.num_warmup_steps = int(0.1 * args.max_train_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )
    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_data)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Learning Rate = {args.learning_rate}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    from torch.cuda.amp import autocast
    scaler = torch.cuda.amp.GradScaler()
    # Initial evaluation + clustering before any training step.
    cluster_result = eval_and_cluster(args, logger, completed_steps,
                                      accelerator.unwrap_model(model),
                                      label_dataloader, label_ids,
                                      instance_dataloader, inst_num, test_ids,
                                      qrels, accelerator)
    reg_iter = iter(reg_dataloader)
    trial_name = f"dim-{args.proj_emb_dim}-bs-{args.per_device_train_batch_size}-{args.dataset}-{args.log}-{args.mode}"
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t for t in batch)
            label_tokens, inst_tokens, indices = batch
            if args.mode == 'ict':
                # Cycle the regularization loader indefinitely.
                try:
                    reg_data = next(reg_iter)
                except StopIteration:
                    reg_iter = iter(reg_dataloader)
                    reg_data = next(reg_iter)
            # Use cluster assignments as pseudo-labels once available.
            if cluster_result is not None:
                pseudo_labels = cluster_result[indices]
            else:
                pseudo_labels = indices
            with autocast():
                if args.mode == 'ict':
                    label_emb, inst_emb, inst_emb_aug, reg_emb = model(
                        label_tokens, inst_tokens, reg_data)
                    loss, stats_dict = loss_function_reg(
                        label_emb, inst_emb, inst_emb_aug, reg_emb,
                        pseudo_labels, accelerator)
                else:
                    label_emb, inst_emb = model(label_tokens,
                                                inst_tokens,
                                                reg_data=None)
                    loss, stats_dict = loss_function(label_emb, inst_emb,
                                                     pseudo_labels,
                                                     accelerator)
            loss = loss / args.gradient_accumulation_steps
            scaler.scale(loss).backward()
            # NOTE(review): unscale_ runs every iteration while scaler.step
            # runs only on accumulation boundaries; with
            # gradient_accumulation_steps > 1 a second consecutive unscale_
            # raises a RuntimeError — confirm accumulation is effectively 1.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1
            if completed_steps % args.logging_steps == 0:
                if args.mode == 'ict':
                    logger.info(
                        "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e} Contrast Loss {:.6e} Reg Loss {:.6e}"
                        .format(
                            epoch,
                            args.num_train_epochs,
                            completed_steps,
                            args.max_train_steps,
                            stats_dict["loss"].item(),
                            stats_dict["contrast_loss"].item(),
                            stats_dict["reg_loss"].item(),
                        ))
                else:
                    logger.info(
                        "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e}"
                        .format(
                            epoch,
                            args.num_train_epochs,
                            completed_steps,
                            args.max_train_steps,
                            stats_dict["loss"].item(),
                        ))
            if completed_steps % args.eval_steps == 0:
                # Periodic evaluation, re-clustering, and encoder checkpoints.
                cluster_result = eval_and_cluster(
                    args, logger, completed_steps,
                    accelerator.unwrap_model(model), label_dataloader,
                    label_ids, instance_dataloader, inst_num, test_ids, qrels,
                    accelerator)
                unwrapped_model = accelerator.unwrap_model(model)
                unwrapped_model.label_encoder.save(
                    f"{args.output_dir}/{trial_name}/label_encoder")
                unwrapped_model.instance_encoder.save(
                    f"{args.output_dir}/{trial_name}/instance_encoder")
            if completed_steps >= args.max_train_steps:
                break
def train_src(encoder, classifier, data_loader, src_data_loader_eval,
              checkpoints=None, tgt_flag=False):
    """Train classifier for source domain.

    Trains encoder+classifier jointly with cross-entropy, supports resuming
    from a checkpoint dict, early stopping on validation F1, and periodic
    best-model snapshotting.

    Args:
        encoder: feature extractor module (trained in place).
        classifier: classification head module (trained in place).
        data_loader: training (data, labels) batches.
        src_data_loader_eval: validation loader used for early stopping.
        checkpoints: optional dict with 'epoch'/'step'/'optimizer'/'encoder'/
            'classifier' states to resume from; None starts fresh.
        tgt_flag: True when training on the target domain (switches lr and
            number of classes, and the checkpoint file name).

    Returns:
        (encoder, classifier) with the best validation weights loaded.
    """
    time_local = time.localtime(time.time())
    dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
    print(dt)

    ####################
    # 1. setup network #
    ####################

    # set train state for Dropout and BN layers
    encoder.train()
    classifier.train()

    # Domain-dependent hyperparameters.
    if tgt_flag:
        not_bert_lr = params.d_learning_rate
        num_classes = params.tgt_num_classes
    else:
        num_classes = params.num_classes
        not_bert_lr = params.c_learning_rate

    # setup criterion and optimizer
    if params.optimizer == 'sgd':
        optimizer = optim.SGD(list(encoder.parameters()) +
                              list(classifier.parameters()),
                              lr=not_bert_lr,
                              weight_decay=params.weight_decay)
    elif params.optimizer == 'adamw':
        from transformers import AdamW
        named_parameters = list(dict(encoder.named_parameters()).items())+\
            list(dict(classifier.named_parameters()).items())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # Three groups: non-BERT params at the head lr, BERT params with and
        # without weight decay (bias/LayerNorm excluded from decay).
        grouped_params = [{
            'params': [p for n, p in named_parameters if ('bert' not in n)],
            'weight_decay': params.weight_decay,
            'lr': not_bert_lr,
            'ori_lr': not_bert_lr
        }, {
            'params': [
                p for n, p in named_parameters
                if ('bert' in n) and not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01,
            'lr': params.bert_learning_rate,
            'ori_lr': params.bert_learning_rate
        }, {
            'params': [
                p for n, p in named_parameters
                if ('bert' in n) and any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0,
            'lr': params.bert_learning_rate,
            'ori_lr': params.bert_learning_rate
        }]
        optimizer = AdamW(grouped_params, correct_bias=False)
    else:
        optimizer = optim.Adam(list(encoder.parameters()) +
                               list(classifier.parameters()),
                               lr=not_bert_lr,
                               weight_decay=params.weight_decay)
    if params.warmup_step > 0:
        from transformers import get_linear_schedule_with_warmup
        training_steps = params.num_epochs_pre * len(data_loader)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=params.warmup_step,
            num_training_steps=training_steps)
    else:
        scheduler = None
    criterion = nn.CrossEntropyLoss()

    # Resume from checkpoint dict if one was supplied.
    if checkpoints == None:
        checkpoints = {}
        start_epoch = 0
        start_step = 0
    else:
        print("load checkpoints!")
        start_epoch = checkpoints['epoch']
        print('start_epoch: ', start_epoch)
        start_step = checkpoints['step']
        print('start_step ', start_step)
        optimizer.load_state_dict(checkpoints['optimizer'])
        encoder.load_state_dict(checkpoints['encoder'])
        classifier.load_state_dict(checkpoints['classifier'])

    ####################
    # 2. train network #
    ####################
    # start to evaluate from training_acc > 0.5
    # and start to save from validation_acc > 0.45
    best_epoch = 0
    best_f1_ex = 0
    e_f1_ex = 0
    accuracy_ex = 0
    best_encoder = None
    best_classifier = None
    # early stop training when the loss in validation set is not decreased for k times
    early_stopping = EarlyStopping(params.early_stop_patient, verbose=True)
    for epoch in range(params.num_epochs_pre):
        # Skip epochs already completed in a resumed run.
        if epoch < start_epoch:
            print('epoch continue: ', epoch)
            continue
        print('epochs: ', epoch)
        correct = 0
        total = 0
        # Per-class counters (shape (1, num_classes)) used for precision /
        # recall / F1 bookkeeping.
        target_num = torch.zeros((1, num_classes))
        predict_num = torch.zeros((1, num_classes))
        acc_num = torch.zeros((1, num_classes))
        total_loss = 0
        # a large learning rate is needed to be decayed
        if params.c_learning_rate >= 1e-3 and scheduler == None:
            adjust_learning_rate(optimizer,
                                 params.c_learning_rate,
                                 decay_rate=.5,
                                 epoch=epoch,
                                 critic_flag=False)
        for step, (data, labels) in enumerate(data_loader):
            # NOTE(review): this condition can only be True on the resumed
            # epoch itself; as written (`epoch < start_epoch`) it never
            # triggers because such epochs are skipped above — confirm intent.
            if epoch < start_epoch and step < start_step:
                continue
            # make data and labels variable
            # data = make_cuda(data)
            labels = make_variable(labels)  #.squeeze_()

            # zero gradients for optimizer
            optimizer.zero_grad()

            # compute loss for critic
            preds = classifier(encoder(data))
            loss1 = criterion(preds, labels)
            loss = loss1

            # optimize source classifier
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            total_loss += loss1

            # Update running accuracy and per-class one-hot masks.
            _, predicted = torch.max(preds.data, 1)
            total += labels.size(0)
            correct += predicted.eq(labels.data).cpu().sum()
            pre_mask = torch.zeros(preds.size()).scatter_(
                1,
                predicted.cpu().view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(preds.size()).scatter_(
                1,
                labels.data.cpu().view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)

            # print step info
            if ((step + 1) % params.log_step_pre == 0):
                # Micro metrics over classes 1.. (class 0, "None", excluded).
                micro_r = acc_num[:, 1:].sum(1) / target_num[:, 1:].sum(1)
                micro_p = acc_num[:, 1:].sum(1) / predict_num[:, 1:].sum(
                    1) if predict_num[:, 1:].sum(1) != 0 else 0
                micro_f1 = 2 * micro_p * micro_r / (
                    micro_r + micro_p) if acc_num[:, 1:].sum(1) != 0 else 0
                micro_r = (micro_r.numpy()[0]
                           ).round(4) if type(micro_r) != int else micro_r
                micro_p = (micro_p.numpy()[0]
                           ).round(4) if type(micro_p) != int else micro_p
                micro_f1 = (micro_f1.numpy()[0]
                            ).round(4) if type(micro_f1) != int else micro_f1
                # Macro (per-class) metrics; NaNs from empty classes are
                # zeroed below before averaging.
                recall = acc_num / target_num
                precision = acc_num / predict_num
                F1 = 2 * recall * precision / (recall + precision)
                accuracy = acc_num.sum(1) / target_num.sum(1)
                accuracy_ex = acc_num[:, 1:].sum(1) / target_num[:, 1:].sum(1)
                recall = (recall.numpy()[0]).round(4)
                precision = (precision.numpy()[0]).round(4)
                F1 = (F1.numpy()[0]).round(4)
                accuracy = (accuracy.numpy()[0]).round(4)
                accuracy_ex = (accuracy_ex.numpy()[0]).round(4)
                where_are_nan = np.isnan(precision)
                precision[where_are_nan] = 0
                where_are_nan = np.isnan(recall)
                recall[where_are_nan] = 0
                where_are_nan = np.isnan(F1)
                F1[where_are_nan] = 0
                print(
                    "Epoch [{}/{}] Step [{}/{}]: loss={:.5f}, loss_without_L2={:.5f}, accuracy = {:.5f}, \
precision = {:.5f}, recall = {:.5f}, F1 = {}, accuracy(excludeNone) = {:.5f}, \
precision(excludeNone) = {:.5f}, recall(excludeNone) = {:.5f}, F1(excludeNone) = {:.5f} \
micro_precision(excludeNone) = {:.5f}, micro_recall(excludeNone) = {:.5f}, micro_F1(excludeNone) = {:.5f} \n"
                    .format(epoch + 1, params.num_epochs_pre, step + 1,
                            len(data_loader), loss.item(), loss1.item(),
                            accuracy,
                            precision.sum() / num_classes,
                            recall.sum() / num_classes,
                            F1.sum() / num_classes, accuracy_ex,
                            precision[1:].sum() / (num_classes - 1),
                            recall[1:].sum() / (num_classes - 1),
                            F1[1:].sum() / (num_classes - 1), micro_p,
                            micro_r, micro_f1))
        print("Epoch [{}/{}]: total_loss:{}".format(
            epoch + 1, params.num_epochs_shared,
            total_loss / len(data_loader)))

        # eval model on test set
        if epoch>0 and \
            ((epoch + 1) % params.eval_step_pre == 0):
            #e_f1_ex, e_loss = eval_src(encoder, classifier,
            #                           data_loader, log_flag=False,
            #                           tgt_flag=tgt_flag)
            e_f1_ex, e_loss = eval_src(encoder,
                                       classifier,
                                       src_data_loader_eval,
                                       log_flag=False,
                                       tgt_flag=tgt_flag)
            # Early stopper tracks a decreasing quantity, so negate F1.
            early_stopping(-e_f1_ex)  #e_loss
            # stop once the early-stopping criterion is met
            if early_stopping.early_stop:
                print("Early stopping")
                # terminate model training
                break

        # save model parameters
        if epoch>0 and \
            e_f1_ex > best_f1_ex and \
            ((epoch + 1) % params.save_step_pre == 0):
            best_f1_ex = e_f1_ex
            best_epoch = epoch + 1
            # Deep-copy tensors so later training does not mutate the snapshot.
            best_encoder = dict([(k, v.clone())
                                 for (k, v) in encoder.state_dict().items()])
            best_classifier = dict([
                (k, v.clone()) for (k, v) in classifier.state_dict().items()
            ])
            save_model(
                encoder,
                "{}-source-encoder-{}.pt".format(params.model_name,
                                                 epoch + 1))
            save_model(
                classifier,
                "{}-source-classifier-{}.pt".format(params.model_name,
                                                    epoch + 1))
            checkpoints['epoch'] = epoch
            checkpoints['step'] = step
            checkpoints['optimizer'] = optimizer.state_dict()
            checkpoints['encoder'] = encoder.state_dict()
            checkpoints['classifier'] = classifier.state_dict()
            if tgt_flag:
                pickle.dump(
                    checkpoints,
                    open(
                        os.path.join(params.model_root,
                                     'checkpoint_adapt.pkl'), 'wb'))
            else:
                pickle.dump(
                    checkpoints,
                    open(
                        os.path.join(params.model_root,
                                     'checkpoint_pretrain.pkl'), 'wb'))

    # # save final model
    print("update as the best one trained at epoch %s!" % best_epoch)
    # NOTE(review): if no epoch ever improved validation F1, best_encoder /
    # best_classifier are still None and these loads will fail — confirm.
    encoder.load_state_dict(best_encoder)
    classifier.load_state_dict(best_classifier)
    #print("saving the best...")
    #pickle.dump(best_encoder,
    #            open(os.path.join(params.model_root,
    #                              'best_encoder.pkl'),'wb'))
    #pickle.dump(best_classifier,
    #            open(os.path.join(params.model_root,
    #                              'best_classifier.pkl'),'wb'))
    print("saving...")
    save_model(encoder, "{}-source-encoder-final.pt".format(params.model_name))
    save_model(classifier,
               "{}-source-classifier-final.pt".format(params.model_name))
    #checkpoints['epoch'] = epoch
    #checkpoints['step'] = step
    #checkpoints['optimizer'] = optimizer.state_dict()
    #checkpoints['encoder'] = encoder.state_dict()
    #checkpoints['classifier'] = classifier.state_dict()
    #pickle.dump(checkpoints,
    #            open(os.path.join(params.model_root,
    #                              'checkpoint_pretrain.pkl'),'wb'))
    time_local = time.localtime(time.time())
    dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)
    print(dt)
    return encoder, classifier
def train(config, batchmanager, model):
    """Main training loop

    Parameters:
    config: argparse flags
    batchmanager (MultiNLIBatchManager): indeed, the batchmanager
    model (FineTunedBERT): the model to train

    Returns:
    (dict, float): the state dictionary of the best model and its dev accuracy"""

    model.train()

    criterion = torch.nn.CrossEntropyLoss()

    # filter out from the optimizer the "frozen" parameters,
    # which are the parameters without requires grad.
    optimizer = AdamW(model.parameters(), lr=config.lr)
    n_batches = len(batchmanager.train_iter)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=n_batches * config.epochs)

    # Baseline dev accuracy before any training (sanity check: ~1/n_classes).
    best_dev_acc = get_accuracy(model, batchmanager.dev_iter)
    last_dev_acc = best_dev_acc
    best_model_dict = deepcopy(model.state_dict())  # snapshot of the best model
    print(f'inital dev accuracy: {last_dev_acc}', flush=True)

    try:
        for epoch in range(config.epochs):
            running_loss = 0.

            for batch_idx, (data, targets) in enumerate(batchmanager.train_iter):
                optimizer.zero_grad()

                batch_loss = criterion(model(data), targets)
                batch_loss.backward()
                running_loss += batch_loss.item()

                if batch_idx != 0 and batch_idx % config.loss_print_rate == 0:
                    print(
                        f'epoch #{epoch+1}/{config.epochs}, batch #{batch_idx}/{n_batches}: loss = {batch_loss.item()}',
                        flush=True)

                # clip gradients before stepping, then advance the lr schedule
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()  # update lr

            # end of an epoch: report train stats and re-evaluate on dev
            print(f'#####\nEpoch {epoch+1} concluded!\n')
            print(f'Average train loss: {running_loss / n_batches}')
            print(
                f'Average train acc : {get_accuracy(model, batchmanager.train_iter)}'
            )
            new_dev_acc = get_accuracy(model, batchmanager.dev_iter)
            last_dev_acc = new_dev_acc
            print(f'dev accuracy: {new_dev_acc}')
            print('#####', flush=True)

            # keep the best-so-far weights
            if new_dev_acc > best_dev_acc:
                best_dev_acc = new_dev_acc
                best_model_dict = deepcopy(model.state_dict())

    except KeyboardInterrupt:
        # Graceful interrupt: score the current weights one last time so a
        # mid-epoch improvement is not lost.
        print("Training stopped!")
        new_dev_acc = get_accuracy(model, batchmanager.dev_iter)
        print(f'Recomputing dev accuracy: {new_dev_acc}')
        if new_dev_acc > best_dev_acc:
            best_dev_acc = new_dev_acc
            best_model_dict = deepcopy(model.state_dict())

    print(f"TEST ACCURACY: {get_accuracy(model, batchmanager.test_iter)}")

    return best_model_dict, best_dev_acc
def main():
    """Fine-tune a multiple-choice model (SWAG-style) end to end.

    Parses CLI args, loads the dataset/config/tokenizer/model, tokenizes the
    four (context, ending) pairs per example, trains with optional gradient
    accumulation under 🤗 Accelerate, evaluates accuracy each epoch, and
    optionally saves the final model.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        # BUGFIX: previously this branch loaded the config from
        # args.model_name_or_path, silently ignoring --config_name.
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Each example expands into 4 (context, "header ending") pairs; the
        # batch is flattened for the tokenizer and re-grouped afterwards.
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten
        tokenized_inputs = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_datasets = raw_datasets.map(
        preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done ot max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Normalize so accumulated micro-batches average to one step.
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
class AGEM(Learner):
    """Averaged Gradient Episodic Memory (A-GEM) continual learner.

    Trains a transformer classifier on a stream of tasks while projecting the
    current gradient so it does not conflict with the average gradient on a
    replayed memory batch.
    """

    def __init__(self, config, **kwargs):
        """Build the model, replay memory, loss, and optimizer from config."""
        super().__init__(config, **kwargs)
        self.lr = config.learner.lr
        self.n_epochs = config.training.epochs
        self.model = TransformerClsModel(model_name=config.learner.model_name,
                                         n_classes=config.data.n_classes,
                                         max_length=config.data.max_length,
                                         device=self.device)
        # Memory stores (text, label) pairs, hence tuple_size=2.
        self.memory = ReplayMemory(write_prob=self.write_prob, tuple_size=2)
        self.logger.info("Loaded {} as model".format(
            self.model.__class__.__name__))
        self.loss_fn = nn.CrossEntropyLoss()
        # Only optimize trainable parameters.
        self.optimizer = AdamW(
            [p for p in self.model.parameters() if p.requires_grad],
            lr=self.lr)

    def training(self, datasets, **kwargs):
        """Concatenate all training datasets into one sequential stream and train."""
        train_datasets = data.ConcatDataset(datasets["train"])
        dataloaders = {
            "train":
            data.DataLoader(train_datasets,
                            batch_size=self.mini_batch_size,
                            shuffle=False,  # preserve task order for CL
                            collate_fn=batch_encode),
        }
        self.train(dataloaders=dataloaders)

    def train(self, dataloaders):
        """Run the A-GEM training loop over the data stream.

        Each batch: forward, compute loss, apply the A-GEM parameter update
        (see update_parameters), then write the batch into replay memory.
        """
        self.model.train()
        dataloader = dataloaders["train"]
        data_length = len(dataloader) * self.n_epochs

        for epoch in range(self.n_epochs):
            all_losses, all_predictions, all_labels = [], [], []

            for text, labels in dataloader:
                labels = torch.tensor(labels).to(self.device)
                input_dict = self.model.encode_text(text)
                output = self.model(input_dict)
                loss = self.loss_fn(output, labels)
                # Gradient surgery + optimizer step happen inside.
                self.update_parameters(loss, mini_batch_size=len(labels))

                loss = loss.item()
                pred = model_utils.make_prediction(output.detach())
                all_losses.append(loss)
                all_predictions.extend(pred.tolist())
                all_labels.extend(labels.tolist())
                # Current batch becomes replay material for future steps.
                self.memory.write_batch(text, labels)

                if self.current_iter % self.log_freq == 0:
                    self.write_log(all_predictions,
                                   all_labels,
                                   all_losses,
                                   data_length=data_length)
                    self.start_time = time.time()  # time from last log
                    all_losses, all_predictions, all_labels = [], [], []
                # if self.current_iter % self.config.training.save_freq == 0:
                self.time_checkpoint()
                self.current_iter += 1
            self.current_epoch += 1

    def update_parameters(self, loss, mini_batch_size):
        """Update parameters of model"""
        self.optimizer.zero_grad()

        params = [p for p in self.model.parameters() if p.requires_grad]
        # Gradient of the current task batch.
        orig_grad = torch.autograd.grad(loss, params)

        # Replay cadence derived from replay_every (in examples) and the
        # configured replay rate.
        replay_freq = self.replay_every // mini_batch_size
        replay_steps = int(self.replay_every * self.replay_rate /
                           mini_batch_size)

        if self.replay_rate != 0 and (self.current_iter +
                                      1) % replay_freq == 0:
            # Accumulate the reference gradient over several memory batches.
            ref_grad_sum = None
            for _ in range(replay_steps):
                ref_text, ref_labels = self.memory.read_batch(
                    batch_size=mini_batch_size)
                ref_labels = torch.tensor(ref_labels).to(self.device)
                ref_input_dict = self.model.encode_text(ref_text)
                ref_output = self.model(ref_input_dict)
                ref_loss = self.loss_fn(ref_output, ref_labels)
                ref_grad = torch.autograd.grad(ref_loss, params)
                if ref_grad_sum is None:
                    ref_grad_sum = ref_grad
                else:
                    ref_grad_sum = [
                        x + y for (x, y) in zip(ref_grad, ref_grad_sum)
                    ]
            # Project the task gradient if it conflicts with the memory one.
            final_grad = self.compute_grad(orig_grad, ref_grad_sum)
        else:
            final_grad = orig_grad

        # Manually install the (possibly projected) gradients, then step.
        for param, grad in zip(params, final_grad):
            param.grad = grad.data
        self.optimizer.step()

    def write_log(self, all_predictions, all_labels, all_losses, data_length):
        """Log running metrics (and optionally wandb) for the recent window."""
        acc, prec, rec, f1 = model_utils.calculate_metrics(
            all_predictions, all_labels)
        time_per_iteration, estimated_time_left = self.time_metrics(
            data_length)
        self.logger.info(
            "Iteration {}/{} ({:.2f}%) -- {:.3f} (sec/it) -- Time Left: {}\nMetrics: Loss = {:.4f}, accuracy = {:.4f}, precision = {:.4f}, recall = {:.4f}, "
            "F1 score = {:.4f}".format(self.current_iter + 1, data_length,
                                       (self.current_iter + 1) /
                                       data_length * 100, time_per_iteration,
                                       estimated_time_left,
                                       np.mean(all_losses), acc, prec, rec,
                                       f1))
        if self.config.wandb:
            n_examples_seen = (self.current_iter + 1) * self.mini_batch_size
            wandb.log({
                "accuracy": acc,
                "precision": prec,
                "recall": rec,
                "f1": f1,
                "loss": np.mean(all_losses),
                "examples_seen": n_examples_seen
            })

    def evaluate(self, dataloader):
        """Evaluate on a dataloader; returns dict of accuracy/precision/recall/f1."""
        all_losses, all_predictions, all_labels = [], [], []

        self.set_eval()

        for i, (text, labels) in enumerate(dataloader):
            labels = torch.tensor(labels).to(self.device)
            input_dict = self.model.encode_text(text)
            with torch.no_grad():
                output = self.model(input_dict)
                loss = self.loss_fn(output, labels)
            loss = loss.item()
            pred = model_utils.make_prediction(output.detach())
            all_losses.append(loss)
            all_predictions.extend(pred.tolist())
            all_labels.extend(labels.tolist())
            if i % 20 == 0:
                self.logger.info(f"Batch {i + 1}/{len(dataloader)} processed")

        acc, prec, rec, f1 = model_utils.calculate_metrics(
            all_predictions, all_labels)
        self.logger.info(
            "Test metrics: Loss = {:.4f}, accuracy = {:.4f}, precision = {:.4f}, recall = {:.4f}, "
            "F1 score = {:.4f}".format(np.mean(all_losses), acc, prec, rec,
                                       f1))

        return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

    def compute_grad(self, orig_grad, ref_grad):
        """Computes gradient according to the AGEM method

        If the task gradient g and memory gradient g_ref agree (g·g_ref >= 0),
        keep g; otherwise project out the conflicting component:
        g - (g·g_ref / g_ref·g_ref) * g_ref.
        """
        with torch.no_grad():
            flat_orig_grad = torch.cat([torch.flatten(x) for x in orig_grad])
            flat_ref_grad = torch.cat([torch.flatten(x) for x in ref_grad])
            dot_product = torch.dot(flat_orig_grad, flat_ref_grad)
            if dot_product >= 0:
                return orig_grad
            proj_component = dot_product / torch.dot(flat_ref_grad,
                                                     flat_ref_grad)
            modified_grad = [
                o - proj_component * r for (o, r) in zip(orig_grad, ref_grad)
            ]
            return modified_grad
class NewsClassifierTrainer:
    """Trainer for a BERT-based AG News classifier.

    Downloads the AG News CSV, builds train/val/test loaders, trains with
    gradient clipping and a linear-warmup schedule, logs metrics to labml
    tracker and mlflow, and optionally registers the best model.
    """

    def __init__(self, *, epochs: int, n_samples: int, vocab_file_url: str,
                 is_save_model: bool, model_path: str, batch_size: int = 16,
                 max_len: int = 160):
        self.model_path = model_path
        self.is_save_model = is_save_model
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.epochs = epochs
        self.batch_size = batch_size
        self.max_len = max_len            # max tokenized sequence length
        self.n_samples = n_samples        # rows kept from the shuffled CSV
        self.vocab_file_url = vocab_file_url
        self.vocab_file = "bert_base_uncased_vocab.txt"
        # Populated by prepare_data() / set_optimizer().
        self.df = None
        self.tokenizer = None
        self.df_train = None
        self.df_val = None
        self.df_test = None
        self.train_data_loader = None
        self.val_data_loader = None
        self.test_data_loader = None
        self.optimizer = None
        self.total_steps = None
        self.scheduler = None
        self.loss_fn = None

    @staticmethod
    def process_label(rating):
        """Map 1-based CSV labels to 0-based class indices."""
        rating = int(rating)
        return rating - 1

    @staticmethod
    def create_data_loader(df, tokenizer, max_len, batch_size):
        """
        :param df: DataFrame input
        :param tokenizer: Bert tokenizer
        :param max_len: maximum length of the input sentence
        :param batch_size: Input batch size
        :return: output - Corresponding data loader for the given input
        """
        ds = AGNewsDataset(
            reviews=df.description.to_numpy(),
            targets=df.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )
        return DataLoader(ds, batch_size=batch_size, num_workers=4)

    def prepare_data(self):
        """
        Creates train, valid and test data loaders from the csv data
        """
        dataset_tar = download_from_url(URLS["AG_NEWS"], root=".data")
        extracted_files = extract_archive(dataset_tar)

        train_csv_path = None
        for file_name in extracted_files:
            if file_name.endswith("train.csv"):
                train_csv_path = file_name

        self.df = pd.read_csv(train_csv_path)
        self.df.columns = ["label", "title", "description"]
        # BUGFIX: DataFrame.sample returns a new frame; the previous code
        # discarded it, so the CSV was never shuffled before truncation and
        # iloc[:n_samples] took the first (class-ordered) rows. Assign the
        # shuffled frame back, seeded for reproducibility.
        self.df = self.df.sample(
            frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
        self.df = self.df.iloc[:self.n_samples]
        self.df["label"] = self.df.label.apply(self.process_label)

        # Fetch the BERT vocab file once; reuse on later runs.
        if not os.path.isfile(self.vocab_file):
            file_pointer = requests.get(self.vocab_file_url,
                                        allow_redirects=True)
            if file_pointer.ok:
                with open(self.vocab_file, "wb") as f:
                    f.write(file_pointer.content)
            else:
                raise RuntimeError("Error in fetching the vocab file")

        self.tokenizer = BertTokenizer(self.vocab_file)

        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)

        # 90/5/5 stratified split (test is halved into val and test).
        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=0.1,
            random_state=RANDOM_SEED,
            stratify=self.df["label"])
        self.df_val, self.df_test = train_test_split(
            self.df_test,
            test_size=0.5,
            random_state=RANDOM_SEED,
            stratify=self.df_test["label"])

        self.train_data_loader = self.create_data_loader(
            self.df_train, self.tokenizer, self.max_len, self.batch_size)
        self.val_data_loader = self.create_data_loader(self.df_val,
                                                       self.tokenizer,
                                                       self.max_len,
                                                       self.batch_size)
        self.test_data_loader = self.create_data_loader(
            self.df_test, self.tokenizer, self.max_len, self.batch_size)

    def set_optimizer(self, model):
        """
        Sets the optimizer and scheduler functions
        """
        self.optimizer = AdamW(model.parameters(), lr=1e-3, correct_bias=False)
        self.total_steps = len(self.train_data_loader) * self.epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=self.total_steps)
        self.loss_fn = nn.CrossEntropyLoss().to(self.device)

    def save_model(self, model):
        """Register the model (with handler assets) in mlflow."""
        with monit.section('Save model'):
            mlflow.pytorch.log_model(model,
                                     "bert-model",
                                     registered_model_name="BertModel",
                                     extra_files=[
                                         "class_mapping.json",
                                         "bert_base_uncased_vocab.txt",
                                         "src/bert_classifier/train.py",
                                         "src/bert_classifier/handler.py",
                                     ])

    def start_training(self, model):
        """
        Initializes the Training step with the model initialized

        :param model: Instance of the NewsClassifier class
        """
        best_loss = float('inf')
        for epoch in monit.loop(self.epochs):
            with tracker.namespace('train'):
                self.train_epoch(model, self.train_data_loader, 'train')
            with tracker.namespace('valid'):
                _, val_loss = self.train_epoch(model, self.val_data_loader,
                                               'valid')
                # Save only when validation loss improves.
                if val_loss < best_loss:
                    best_loss = val_loss
                    if self.is_save_model:
                        self.save_model(model)
            tracker.new_line()

    def train_epoch(self, model: nn.Module, data_loader: DataLoader,
                    name: str):
        """
        Train/Validate for an epoch

        Gradients are enabled only in 'train' mode; returns
        (accuracy, mean loss) over the epoch.
        """
        model.train(name == 'train')
        correct_predictions = 0
        total = 0
        total_loss = 0
        with torch.set_grad_enabled(name == 'train'):
            for i, data in monit.enum(name, data_loader):
                input_ids = data["input_ids"].to(self.device)
                attention_mask = data["attention_mask"].to(self.device)
                targets = data["targets"].to(self.device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)
                loss = self.loss_fn(outputs, targets)

                total_loss += loss.item() * len(preds)
                correct_predictions += torch.sum(preds == targets).item()
                total += len(preds)
                tracker.add('loss.', loss)

                if name == 'train':
                    tracker.add_global_step(len(preds))
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()

                    if (i + 1) % 10 == 0:
                        tracker.save()

        tracker.save('accuracy.', correct_predictions / total)
        mlflow.log_metric(f"{name}_acc",
                          float(correct_predictions / total),
                          step=tracker.get_global_step())
        mlflow.log_metric(f"{name}_loss",
                          float(total_loss / total),
                          step=tracker.get_global_step())
        return correct_predictions / total, total_loss / total

    def get_predictions(self, model, data_loader):
        """
        Prediction after the training step is over

        :param model: Instance of the NewsClassifier class
        :param data_loader: Data loader for either test / validation dataset

        :result: output - Returns prediction results,
                 prediction probablities and corresponding values
        """
        model = model.eval()

        review_texts = []
        predictions = []
        prediction_probs = []
        real_values = []

        with torch.no_grad():
            for d in data_loader:
                texts = d["review_text"]
                input_ids = d["input_ids"].to(self.device)
                attention_mask = d["attention_mask"].to(self.device)
                targets = d["targets"].to(self.device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)
                probs = F.softmax(outputs, dim=1)

                review_texts.extend(texts)
                predictions.extend(preds)
                prediction_probs.extend(probs)
                real_values.extend(targets)

        predictions = torch.stack(predictions).cpu()
        prediction_probs = torch.stack(prediction_probs).cpu()
        real_values = torch.stack(real_values).cpu()
        return review_texts, predictions, prediction_probs, real_values
def train(tokenizer, config, args, train_data_set, eval_data_set=None):
    """Fine-tune a biaffine NER model, optionally with FGM adversarial training.

    :param tokenizer: tokenizer instance (unused in this body; kept for API
        compatibility with callers).
    :param config: model config passed to ``Biaffine_NER.from_pretrained``.
    :param args: namespace with train_batch_size, epochs, init_checkpoint,
        do_adv, use_cuda, learning_rate, warmup_proportion, log_steps,
        do_eval, eval_step, save_path.
    :param train_data_set: training dataset (len() gives example count).
    :param eval_data_set: optional dataset evaluated every ``args.eval_step``.

    Side effects: trains in place, logs progress, and saves the best
    checkpoint (by eval F1) to ``<save_path>/model_best.bin``.
    """
    # Total optimizer steps: one per batch over all epochs.
    # NOTE(review): int() truncates, so a final partial batch per epoch is
    # not counted in the scheduler horizon -- confirm against the
    # DataLoader's drop-last behavior.
    num_train_optimization_steps = int(
        len(train_data_set) / args.train_batch_size) * args.epochs
    train_data_loader = DataLoader(dataset=train_data_set,
                                   batch_size=args.train_batch_size,
                                   collate_fn=data_helper.collate_fn)
    # Build the model.
    steps = 0
    biaffine_model = biaffine_ner.Biaffine_NER.from_pretrained(
        args.init_checkpoint, config=config)
    if args.do_adv:
        # FGM adversarial-training wrapper (perturbs embeddings).
        fgm_model = fgm.FGM(biaffine_model)
    if args.use_cuda:
        biaffine_model.cuda()

    # Optimizer and linear warmup/decay schedule.
    parameters_to_optimize = list(biaffine_model.parameters())
    optimizer = AdamW(parameters_to_optimize, lr=args.learning_rate)
    warmup_step = num_train_optimization_steps * args.warmup_proportion
    scheduler = get_linear_schedule_with_warmup(
        optimizer, warmup_step, num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_data_set))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)

    log_loss = 0.0
    best_f1 = 0.0
    # reduction='none' replaces the deprecated `reduce=False` (identical
    # behavior): per-element losses are returned so batch_forward can
    # mask/reduce them itself.
    biaffine_loss_fc = nn.CrossEntropyLoss(reduction='none')
    begin_time = time.time()
    biaffine_model.train()
    for epoch in range(args.epochs):
        for batch in train_data_loader:
            steps += 1
            if args.use_cuda:
                batch = data_to_cuda(batch)
            loss = batch_forward(batch, biaffine_model, biaffine_loss_fc)[0]
            loss.backward()
            # Adversarial training: perturb embeddings, accumulate the
            # adversarial gradient on top of the normal one, then restore.
            if args.do_adv:
                fgm_model.attack()
                loss_adv = batch_forward(batch, biaffine_model,
                                         biaffine_loss_fc)[0]
                loss_adv.backward()
                fgm_model.restore()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            log_loss += loss.data.item()
            if steps % args.log_steps == 0:
                end_time = time.time()
                used_time = end_time - begin_time
                logger.info(
                    "epoch: %d, progress: %d/%d, ave loss: %f, speed: %f s/step" % (
                        epoch,
                        steps,
                        num_train_optimization_steps,
                        # BUG FIX: report the running average over the logging
                        # window, not the last batch's instantaneous loss.
                        log_loss / args.log_steps,
                        used_time / args.log_steps,
                    ),
                )
                begin_time = time.time()
                log_loss = 0.0
            # Periodic evaluation; keep only the best checkpoint.
            if args.do_eval and steps % args.eval_step == 0:
                eval_f1 = evaluate(args, eval_data_set, biaffine_model)
                if eval_f1 > best_f1:
                    logger.info('save model: %s' % os.path.join(
                        args.save_path, 'model_%d.bin' % epoch))
                    torch.save(biaffine_model.state_dict(),
                               os.path.join(args.save_path, 'model_best.bin'))
                    best_f1 = eval_f1
                logger.info('best f1: %.4f' % best_f1)
    logger.info('final best f1: %.4f' % best_f1)
def main():
    """End-to-end token-classification (NER-style) training with Accelerate.

    Loads a dataset (hub name or local CSV/JSON), builds config/tokenizer/
    model, tokenizes and aligns word-level labels to subword tokens, trains
    with gradient accumulation, evaluates with seqeval each epoch, and
    optionally saves the final model to ``args.output_dir``.
    """
    args = parse_args()
    # Let the accelerator handle device placement / distributed setup.
    accelerator = Accelerator()
    # One log line format for every process, for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)
    # Only one process per machine logs at INFO; the rest are quieted.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    # If passed along, set the training seed now for reproducibility.
    if args.seed is not None:
        set_seed(args.seed)

    # Dataset: either a hub dataset name or local CSV/JSON files. In
    # distributed training, load_dataset guarantees only one local process
    # downloads concurrently.
    if args.dataset_name is not None:
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        # File extension selects the datasets loader (csv/json/...).
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Debug mode: trim every split to 100 examples.
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))

    # Column discovery: prefer the train split's schema; fall back to
    # validation when train is absent.
    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features
    # Text column: explicit arg > a column literally named "tokens" > first.
    if args.text_column_name is not None:
        text_column_name = args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]
    # Label column: explicit arg > "<task>_tags" > second column.
    if args.label_column_name is not None:
        label_column_name = args.label_column_name
    elif f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    def get_label_list(labels):
        # Collect the unique labels over all sequences; sorted for
        # a deterministic label -> id mapping.
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # Labels are already integer ids; identity mapping.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        # String labels: scan the train split to build the mapping.
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained config/tokenizer/model. from_pretrained guards against
    # concurrent downloads in distributed training.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name,
                                            num_labels=num_labels)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")
    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                                  use_fast=True)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                  use_fast=True)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if args.model_name_or_path:
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForTokenClassification.from_config(config)
    # Account for tokens the tokenizer may have added beyond the checkpoint.
    model.resize_token_embeddings(len(tokenizer))

    # Static padding only when requested; otherwise the collator pads.
    padding = "max_length" if args.pad_to_max_length else False

    def tokenize_and_align_labels(examples):
        """Tokenize pre-split words and align word labels to subword tokens."""
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=args.max_length,
            padding=padding,
            truncation=True,
            # The dataset stores lists of words with one label per word.
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have word id None -> -100 so the loss
                # function ignores them.
                if word_idx is None:
                    label_ids.append(-100)
                # First subword of each word carries the word's label.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # Remaining subwords: repeat the label or mask with -100,
                # per the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if args.
                                     label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names)
    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set.
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders: with max_length padding the default collator just
    # tensorizes; otherwise DataCollatorForTokenClassification pads
    # dynamically (multiples of 8 under fp16 for Tensor Cores).
    if args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        data_collator = DataCollatorForTokenClassification(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer: no weight decay on biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    device = accelerator.device
    model.to(device)
    # Prepare BEFORE measuring len(train_dataloader): it is shorter per
    # process in multiprocess training.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Derive max_train_steps/num_train_epochs from whichever was given.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Entity-level metrics via seqeval.
    metric = load_metric("seqeval")

    def get_labels(predictions, references):
        """Convert id tensors to label-string lists, dropping -100 positions."""
        if device.type == "cpu":
            y_pred = predictions.detach().clone().numpy()
            y_true = references.detach().clone().numpy()
        else:
            y_pred = predictions.detach().cpu().clone().numpy()
            y_true = references.detach().cpu().clone().numpy()
        # Remove ignored index (special tokens).
        true_predictions = [[
            label_list[p] for (p, l) in zip(pred, gold_label) if l != -100
        ] for pred, gold_label in zip(y_pred, y_true)]
        true_labels = [[
            label_list[l] for (p, l) in zip(pred, gold_label) if l != -100
        ] for pred, gold_label in zip(y_pred, y_true)]
        return true_predictions, true_labels

    def compute_metrics():
        """Flatten seqeval output; per-entity metrics only when requested."""
        results = metric.compute()
        if args.return_entity_level_metrics:
            # Unpack nested dictionaries into flat "<entity>_<metric>" keys.
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so accumulated gradients average over the window.
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # Step on accumulation boundaries and on the final batch.
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1
            if completed_steps >= args.max_train_steps:
                break

        # Per-epoch evaluation.
        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            if not args.pad_to_max_length:
                # Pad across processes so gather() sees equal-length tensors.
                predictions = accelerator.pad_across_processes(predictions,
                                                               dim=1,
                                                               pad_index=-100)
                labels = accelerator.pad_across_processes(labels,
                                                          dim=1,
                                                          pad_index=-100)
            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)
            preds, refs = get_labels(predictions_gathered, labels_gathered)
            # seqeval expects nested lists of label strings, not label ids.
            metric.add_batch(
                predictions=preds,
                references=refs,
            )
        eval_metric = compute_metrics()
        accelerator.print(f"epoch {epoch}:", eval_metric)

    # Save the final (unwrapped) model once all processes are done.
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
def train_func(config: Dict[str, Any]):
    """GLUE sequence-classification fine-tuning loop driven by Accelerate.

    :param config: dict with key "args" holding the parsed CLI namespace
        (task_name, model_name_or_path, batch sizes, schedules, etc.).
        NOTE(review): the name `config` is later rebound to the model's
        AutoConfig below -- intentional in the upstream script, but easy
        to misread.

    Loads a GLUE task (or local CSV/JSON), fine-tunes the model, evaluates
    per epoch, saves to ``args.output_dir``, and for MNLI additionally
    evaluates on the mismatched validation split.
    """
    args = config["args"]
    # Let the accelerator handle device placement / distributed setup.
    accelerator = Accelerator()
    # One log line format on every process, for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)
    # Only one process per machine logs at INFO.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    # If passed along, set the training seed now for reproducibility.
    if args.seed is not None:
        set_seed(args.seed)

    # Dataset: a GLUE task from the hub, or local CSV/JSON files with a
    # 'label' column and one or two sentence columns.
    if args.task_name is not None:
        raw_datasets = load_dataset("glue", args.task_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = (args.train_file if args.train_file is not None else
                     args.valid_file).split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)

    # Labels: STS-B is the only regression task in GLUE; otherwise derive
    # the label list from the dataset features or by scanning unique values.
    if args.task_name is not None:
        is_regression = args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Heuristic defaults for custom data: float label dtype => regression.
        is_regression = raw_datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # sort for determinism
            num_labels = len(label_list)

    # Load pretrained config/tokenizer/model (download-safe in distributed
    # training). This rebinds the `config` parameter -- see docstring note.
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )

    # Sentence keys: GLUE tasks have fixed column pairs; custom data falls
    # back to sentence1/sentence2 or the first non-label column(s).
    if args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[args.task_name]
    else:
        non_label_column_names = [
            name for name in raw_datasets["train"].column_names
            if name != "label"
        ]
        if "sentence1" in non_label_column_names and \
                "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Some models fix the label order in their config; reuse it when it
    # matches the dataset's labels.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and args.task_name is not None and not is_regression):
        # Some configs use all-caps label names; compare case-insensitively.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(
                sorted(label_list)):
            logger.info(
                f"The configuration of the model provided the following label "
                f"correspondence: {label_name_to_id}. Using it!")
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            # NOTE(review): the trailing comma after the first string passes
            # the remaining f-string as a lazy %-format argument to
            # logger.warning -- looks like an accidental comma; confirm
            # against the upstream script before changing.
            logger.warning(
                "Your model seems to have been trained with labels, "
                "but they don't match the dataset: ",
                f"model labels: {list(sorted(label_name_to_id.keys()))}, "
                f"dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
    elif args.task_name is None:
        label_to_id = {v: i for i, v in enumerate(label_list)}
    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {
            id: label
            for label, id in config.label2id.items()
        }

    # Static padding only when requested; otherwise the collator pads.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        """Tokenize one or two sentence columns and attach integer labels."""
        texts = ((examples[sentence1_key], ) if sentence2_key is None else
                 (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*texts,
                           padding=padding,
                           max_length=args.max_length,
                           truncation=True)
        if "label" in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks).
                result["labels"] = \
                    [label_to_id[l] for l in examples["label"]]  # noqa:E741
            else:
                # Rename to "labels" -- the model expects that key.
                result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )
    train_dataset = processed_datasets["train"]
    # MNLI has two validation splits; default to the matched one here.
    eval_dataset = processed_datasets["validation_matched" if args.task_name
                                      == "mnli" else "validation"]

    # Log a few random samples from the training set.
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders: default collator for pre-padded data, otherwise dynamic
    # padding (multiples of 8 under fp16 for Tensor Cores).
    if args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer: no weight decay on biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare BEFORE measuring len(train_dataloader): it is shorter per
    # process in multiprocess training.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Derive max_train_steps/num_train_epochs from whichever was given.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * \
            num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metric: task-specific GLUE metric, or plain accuracy for custom data.
    if args.task_name is not None:
        metric = load_metric("glue", args.task_name)
    else:
        metric = load_metric("accuracy")

    # Train!
    total_batch_size = \
        args.per_device_train_batch_size * \
        accelerator.num_processes * \
        args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(f" Instantaneous batch size per device ="
                f" {args.per_device_train_batch_size}")
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) "
        f"= {total_batch_size}")
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so accumulated gradients average over the window.
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # Step on accumulation boundaries and on the final batch.
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1
            if completed_steps >= args.max_train_steps:
                break

        # Per-epoch evaluation.
        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            # Regression tasks emit a single logit per example.
            predictions = outputs.logits.argmax(
                dim=-1) if not is_regression else outputs.logits.squeeze()
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )
        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")

    # Save the final (unwrapped) model once all processes are done.
    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)

    if args.task_name == "mnli":
        # Final evaluation on the mismatched validation set.
        eval_dataset = processed_datasets["validation_mismatched"]
        eval_dataloader = DataLoader(
            eval_dataset,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size)
        eval_dataloader = accelerator.prepare(eval_dataloader)
        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )
        eval_metric = metric.compute()
        logger.info(f"mnli-mm: {eval_metric}")
def train_model(preprocessor,
                base_model,
                frac_train_data,
                frac_val_data,
                batch_size=8,
                n_epoch=10,
                log_every=1,
                eval_every=10,
                save_every=300,
                checkpoint_fn=None,
                force_cpu=False,
                save_model_prefix=""
                ) -> None:
    """Fine-tune a DistilBERT-backed model with a custom head on SQuAD-style data.

    Parameters
    ----------
    preprocessor : type
        Pre-processor class (e.g. SquadPreprocessor); instantiated with no
        arguments and asked for train/val encodings.
    base_model : type
        Model class (nn.Module subclass) taking ``transformer_model`` and
        ``device`` keyword arguments.
    frac_train_data : float
        Fraction of training data to sample randomly (memory control).
    frac_val_data : float
        Fraction of validation data to sample randomly.
    batch_size : int
        Training batch size.
    n_epoch : int
        Number of epochs to train.
    log_every : int
        Step frequency for printing training loss.
    eval_every : int
        Step frequency for printing eval loss (one eval batch per check).
    save_every : int
        Step frequency for saving a checkpoint.
    checkpoint_fn : str or None
        If given, checkpoint file to resume training from (restores model,
        optimizer, epoch and step counters).
    force_cpu : bool
        Force CPU even when CUDA is available (useful for unsupported
        old CUDA architectures).
    save_model_prefix : str
        Prefix for saved checkpoint filenames.
    """
    sp = preprocessor()
    train_enc, val_enc = sp.get_encodings(random_sample_train=frac_train_data,
                                          random_sample_val=frac_val_data,
                                          return_tensors="pt")
    train_ds = SquadDataset(train_enc)
    val_ds = SquadDataset(val_enc)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # Eval uses a fixed batch of 64; shuffle so each eval sees a fresh batch.
    eval_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased',
                                          return_dict=True)

    if force_cpu:
        device = torch.device("cpu")
    else:
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    epoch = 0
    train_iter = 0
    # Sentinel eval loss used in checkpoints saved before the first eval.
    loss_eval = 1000
    if checkpoint_fn is not None:
        checkpoint = torch.load(checkpoint_fn, map_location=device)
        # Resume from the checkpointed epoch (the while-loop below
        # pre-increments). NOTE(review): the 1.0 literal makes `epoch` a
        # float from here on -- harmless for the comparison/printing below,
        # but confirm it is intentional.
        epoch = checkpoint['epoch'] - 1.0
        train_iter = checkpoint['train_iter']
    else:
        checkpoint = None

    model = base_model(transformer_model=dbm, device=device)
    if checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    logging.info(f"Using device: {device}")
    model.to(device)
    model.train()
    optimizer = AdamW(model.parameters(), lr=5e-5)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    while epoch < n_epoch:
        epoch += 1
        for train_data in train_dl:
            train_iter += 1
            optimizer.zero_grad()
            model_out = model(train_data)
            loss = model.compute_loss(*model_out)
            loss.backward()
            optimizer.step()
            if train_iter % log_every == 0:
                print('Train: Epoch: %d, iter: %d, avg. loss: %.2f' %
                      (epoch, train_iter, loss))
            if train_iter % eval_every == 0:
                # Disable gradient tracking for evaluation; a single eval
                # batch is sampled as a cheap progress signal.
                with torch.no_grad():
                    model.eval()
                    eval_data = next(iter(eval_dl))
                    model_out = model(eval_data)
                    loss_eval = model.compute_loss(*model_out)
                    print('\nEval: Epoch: %d, iter: %d, avg. loss: %.2f\n' %
                          (epoch, train_iter, loss_eval))
                    model.train()
            if train_iter % save_every == 0:
                model.save(f"model_checkpoint/{save_model_prefix}_model_{train_iter}.pt",
                           train_iter=train_iter,
                           epoch=epoch,
                           optimizer=optimizer,
                           train_loss=loss,
                           eval_loss=loss_eval)
def __train_finetune_classifier__(self, train_dataset, test_dataset, do_eval=True):
    """Fine-tune the pretrained encoder plus classification head.

    :param train_dataset: dataset yielding dicts with 'input_ids',
        'attention_mask' and 'labels'; must expose a ``.labels`` sequence.
    :param test_dataset: dataset passed to ``self.eval`` for periodic
        accuracy checks.
    :param do_eval: when True, train for ``self.epoch`` epochs and evaluate
        every ``self.eval_epoch`` epochs; when False, train a single epoch
        with no evaluation.
    :returns: best test accuracy seen (0 when ``do_eval`` is False).
    """
    # Put both the encoder and the head into train mode.
    self.pretrained_model.train()
    self.classifier.train()

    # Number of batches per epoch -- used only to size the progress bar.
    batch_num = math.ceil(len(train_dataset.labels) / self.bsz)
    train_loader = DataLoader(train_dataset, batch_size=self.bsz, shuffle=True)
    # One optimizer over both parameter groups (encoder + classifier head).
    params = [{
        'params': self.pretrained_model.parameters()
    }, {
        'params': self.classifier.parameters()
    }]
    optimizer = AdamW(params, lr=self.learning_rate)
    # Exponential LR decay, stepped once per epoch.
    scheduler = ExponentialLR(optimizer, self.lr_decay)

    best_acc = 0
    epochs = self.epoch if do_eval else 1
    for epoch in range(epochs):
        pbar = tqdm(range(batch_num))
        losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)
            # Encoder output index 1 is the pooled representation,
            # which feeds the classification head.
            outputs = self.pretrained_model(input_ids,
                                            attention_mask=attention_mask)
            pooler = outputs[1]
            outputs = self.classifier(pooler)
            loss = self.loss_func(outputs, labels)
            if self.n_gpu > 1:
                # DataParallel returns one loss per GPU; average them.
                loss = loss.mean()
            loss.backward()
            optimizer.step()
            pbar.update()
            losses.append(loss.data.cpu())
            descrip = 'Train epoch:%3d Loss:%6.3f' % (epoch, loss.data.cpu())
            if not do_eval:
                descrip = 'Loss:%6.3f' % loss.data.cpu()
            pbar.set_description(descrip)
        scheduler.step()
        # Replace the last per-batch description with the epoch average.
        avg_loss = torch.mean(torch.tensor(losses))
        final_descrip = 'Epoch:%2d Average Loss:%6.3f' % (epoch, avg_loss)
        if not do_eval:
            final_descrip = 'Average Loss:%6.3f' % avg_loss
        pbar.set_description(final_descrip)
        pbar.close()
        # Periodic evaluation; `self.eval` presumably switches to eval
        # mode internally, so train mode is restored afterwards.
        if (epoch % self.eval_epoch == 0) and do_eval:
            test_acc = self.eval(test_dataset)
            best_acc = max(test_acc, best_acc)
            self.pretrained_model.train()
            self.classifier.train()
    return best_acc
class KeyphraseSpanExtraction(object):
    """Training/inference wrapper around a BERT sequence-tagging network.

    Owns the network, optimizer and LR scheduler, and exposes train/predict/
    test steps plus checkpointing and single-/multi-GPU placement helpers.
    """

    def __init__(self, args, state_dict=None):
        # args: namespace with cache_dir, num_labels, device, fp16, n_gpu,
        #       gradient_accumulation_steps, max_grad_norm, warmup_proportion,
        #       weight_decay, learning_rate, local_rank (used across methods).
        self.args = args
        # Number of completed optimizer updates (not raw batches).
        self.updates = 0
        model_config = BertConfig.from_pretrained(args.cache_dir, num_labels=args.num_labels)
        self.network = BertForSeqTagging.from_pretrained(args.cache_dir, config=model_config)
        if state_dict is not None:
            self.network.load_state_dict(state_dict)
            logger.info('loaded pretrain model state_dict')

    # -------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------
    def init_optimizer(self, num_total_steps):
        """Create AdamW optimizer and linear warmup/decay schedule.

        Must be called before update(); warmup length is a fixed proportion
        of the total number of training steps.
        """
        num_warmup_steps = int(self.args.warmup_proportion * num_total_steps)
        logger.info('warmup steps : %d' % num_warmup_steps)
        # Prepare optimizer and schedule (linear warmup and decay).
        # Bias and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        param_optimizer = list(self.network.named_parameters())
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': self.args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.args.learning_rate, correct_bias=False)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              warmup_steps=num_warmup_steps,
                                              t_total=num_total_steps)

    # -------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------
    def update(self, step, batch):
        """One training step with gradient accumulation and optional fp16.

        Args:
            step: zero-based batch index; the optimizer steps only every
                gradient_accumulation_steps batches.
            batch: sequence whose first 5 elements are model input tensors.

        Returns:
            The (possibly accumulation-scaled) loss value as a float.
        """
        # Train mode
        self.network.train()

        # Transfer to GPU
        inputs = [b.to(self.args.device) for b in batch[:5]]

        # run !
        loss = self.network(*inputs)

        if self.args.n_gpu > 1:
            # mean() to average on multi-gpu parallel (not distributed) training
            loss = loss.mean()

        if self.args.gradient_accumulation_steps > 1:
            # Scale so accumulated gradients average over the virtual batch.
            loss = loss / self.args.gradient_accumulation_steps

        if self.args.fp16:
            # apex amp path: scale loss, clip the fp32 master params.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer),
                                           self.args.max_grad_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.args.max_grad_norm)

        if (step + 1) % self.args.gradient_accumulation_steps == 0:
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            self.updates += 1

        return loss.item()

    def predict(self, batch):
        """Compute the loss on a batch without gradient tracking."""
        self.network.eval()
        inputs = [b.to(self.args.device) for b in batch[:5]]
        with torch.no_grad():
            loss = self.network(*inputs)
            if self.args.n_gpu > 1:
                loss = loss.mean()
        return loss.item()

    def test(self, batch):
        """Run inference on a batch and return per-example logit lists.

        Uses only the first 4 batch elements as inputs; batch[4] holds the
        per-example sequence lengths used to split the flat logits.
        """
        self.network.eval()
        inputs = [b.to(self.args.device) for b in batch[:4]]
        with torch.no_grad():
            logits = self.network(*inputs)
            logits = F.softmax(logits, dim=-1)
        logits = logits.data.cpu()
        return self.decode(logits, batch[4])

    @staticmethod
    def decode(logits, lengths):
        """Split flat (sum(lengths), num_labels) logits back per example."""
        assert logits.size(0) == sum(lengths)
        logits = logits.tolist()
        logit_list = []
        sum_len = 0
        for l in lengths:
            logit_list.append(logits[sum_len:sum_len+l])
            sum_len += l
        return logit_list

    # -------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------
    def save_checkpoint(self, filename, epoch):
        """Save args, epoch and (unwrapped) network weights; never raises."""
        # Unwrap DataParallel/DistributedDataParallel before saving.
        network = self.network.module if hasattr(self.network, 'module') else self.network
        params = {
            'args': self.args,
            'epoch': epoch,
            'state_dict': network.state_dict(),
        }
        try:
            torch.save(params, filename)
            logger.info('success save epoch_%d checkpoints !' % epoch)
        except BaseException:
            # Deliberate best-effort save: a failed checkpoint must not
            # abort training.
            logger.warning('WARN: Saving failed... continuing anyway.')

    @staticmethod
    def load_checkpoint(filename, new_args=None):
        """Rebuild a KeyphraseSpanExtraction from a saved checkpoint.

        new_args, when given, overrides selected fields of the saved args
        via override_args.
        """
        logger.info('Loading model %s' % filename)
        # map_location keeps GPU-saved tensors loadable on CPU-only hosts.
        saved_params = torch.load(filename, map_location=lambda storage, loc: storage)
        args = saved_params['args']
        epoch = saved_params['epoch']
        state_dict = saved_params['state_dict']
        if new_args:
            args = override_args(args, new_args)
        model = KeyphraseSpanExtraction(args, state_dict)
        logger.info('success loaded epoch_%d checkpoints !' % epoch)
        return model

    # -------------------------------------------------------------------------------------------
    # -------------------------------------------------------------------------------------------
    def zero_grad(self):
        self.optimizer.zero_grad()
        # self.network.zero_grad()

    def set_device(self):
        """Move the network to the device configured in args."""
        self.network.to(self.args.device)

    def parallelize(self):
        """Use data parallel to copy the model across several gpus.
        This will take all gpus visible with CUDA_VISIBLE_DEVICES.
        """
        self.parallel = True
        self.network = torch.nn.DataParallel(self.network)

    def distribute(self):
        """Wrap the network for distributed training on the local rank."""
        self.distributed = True
        self.network = torch.nn.parallel.DistributedDataParallel(
            self.network,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True)
def main():
    """Train a causal language model with Hugging Face Accelerate.

    Loads a dataset (hub or local CSV/JSON/TXT), tokenizes and chunks it into
    block_size sequences, then runs a standard training loop with gradient
    accumulation, per-epoch perplexity evaluation, and a final model save.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            # Carve a validation split out of the head of the train split.
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        # NOTE(review): extension is derived from train_file unconditionally;
        # this raises AttributeError if only validation_file is supplied.
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForCausalLM.from_config(config)

    # Resize embeddings in case the tokenizer vocabulary differs from the config.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    if args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: sum(examples[k], [])
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k:
            [t[i:i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        # For causal LM the labels are the inputs; the model shifts internally.
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        load_from_cache_file=not args.overwrite_cache,
        desc=f"Grouping texts in chunks of {block_size}",
    )

    train_dataset = lm_datasets["train"]
    eval_dataset = lm_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=default_data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=default_data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
    if accelerator.distributed_type == DistributedType.TPU:
        model.tie_weights()

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # NOTE(review): `step %` steps the optimizer on step 0 before any
            # accumulation window completes; `(step + 1) %` is the usual
            # boundary — confirm against the intended accumulation semantics.
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Gather per-process losses; repeat so the gather has a fixed size
            # even when the last batch is smaller.
            losses.append(
                accelerator.gather(loss.repeat(
                    args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        # Trim the padding introduced by repeat/gather above.
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            # exp overflows for very large mean loss; report inf instead.
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
loss_function = nn.CrossEntropyLoss(weight=class_weights, reduction='mean') # Dataloaders train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True) dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False) for epoch in range(100): train_preds = [] train_labels = [] total_train_loss = 0 model.train() print("==========================================================") print("Epoch {}".format(epoch)) print("Train") for batch in tqdm(train_loader): optimizer.zero_grad() input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) labels = batch['labels'].to(device) outputs = model(input_ids, attention_mask=attention_mask, labels=labels) if loss_weighted: loss = loss_function(outputs[1], labels) else: loss = outputs[0] loss.backward() optimizer.step() for logits in outputs[1].detach().cpu().numpy(): train_preds.append(np.argmax(logits))
def _forward_scores(net, inputs, labels, criterion, cross_entropy):
    """One forward pass; returns (loss, hard predictions, detached CPU scores)."""
    if cross_entropy:
        output = net(inputs.float())
        loss = criterion(output, labels.squeeze())
        # Positive-class column thresholded at 0.5, as in the original code.
        predictions = output[:, 1] >= 0.5
        scores = output[:, 1].detach().cpu()
    else:
        output = net(inputs.float()).flatten()
        loss = criterion(output, labels.float())
        predictions = output >= 0.5
        scores = output.detach().cpu()
    return loss, predictions, scores


def _log_row(log_file, epoch, mode, avg_loss, correct, total, auc_score,
             aupr_score, pos_correct, pos_total):
    """Append one CSV metrics row (overall and positive-class accuracy in %)."""
    with open(log_file, "a") as f:
        f.write("{},{},{},{},{},{},{},{},{},{},{}\n".format(
            epoch, mode, avg_loss, correct, total, correct / total * 100,
            auc_score, aupr_score, pos_correct, pos_total,
            pos_correct / pos_total * 100))


def _eval_pass(net, loader, criterion, cross_entropy, device, mode, epoch):
    """Run one no-grad evaluation epoch over `loader`.

    Returns (cumulative_loss, correct, samples, pos_correct, pos_total,
    label_tensors, score_tensors).
    """
    cumulative_loss = 0
    correct = 0
    samples = 0
    pos_correct = 0
    pos_total = 0
    scores_acc = []
    labels_acc = []
    data_iter = tqdm.tqdm(loader,
                          desc="EP_%s:%d" % (mode, epoch),
                          total=len(loader),
                          bar_format="{l_bar}{r_bar}")
    net.eval()
    # Fix: evaluation previously ran with gradient tracking enabled.
    with torch.no_grad():
        for inputs, labels in data_iter:
            # Keep a CPU copy of the labels for AUC/AUPR computation.
            labels_acc.append(labels)
            inputs, labels = inputs.to(device), labels.to(device)
            loss, predictions, scores = _forward_scores(
                net, inputs, labels, criterion, cross_entropy)
            scores_acc.append(scores)
            cumulative_loss += loss.item()
            correct += torch.sum(predictions == labels).item()
            samples += labels.shape[0]
            positive_inds = labels.nonzero(as_tuple=True)
            pos_correct += torch.sum(
                predictions[positive_inds] == labels[positive_inds]).item()
            pos_total += labels.nonzero().shape[0]
    return (cumulative_loss, correct, samples, pos_correct, pos_total,
            labels_acc, scores_acc)


def dense_layer(train_dataloader, train_orig_loader, test_dataloader,
                embed_dim, lr, log_file, epoch_num, cross_entropy=False,
                optim='sgd', betas=(0.9, 0.999), weight_decay=0.01,
                warmup_steps=2000):
    """Train a dense head and log per-epoch train/train_orig/test metrics.

    Args:
        train_dataloader: loader used for gradient updates.
        train_orig_loader: loader evaluated (no updates) each epoch.
        test_dataloader: held-out loader evaluated each epoch.
        embed_dim: input dimension of the dense network.
        lr: learning rate.
        log_file: CSV path; overwritten at start, appended per epoch.
        epoch_num: number of epochs.
        cross_entropy: use CENet + CrossEntropyLoss instead of Net + MSELoss.
        optim: 'adam' (AdamW + constant warmup schedule) or 'sgd'.
        betas, weight_decay, warmup_steps: AdamW/scheduler hyper-parameters.

    Raises:
        ValueError: if `optim` is not 'adam' or 'sgd' (previously this left
            `optimizer` undefined and failed later with NameError).
    """
    with open(log_file, "w+") as f:
        f.write(
            "EPOCH,MODE, AVG LOSS, TOTAL CORRECT, TOTAL ELEMENTS, ACCURACY, AUC, AUPR, TOTAL POSITIVE CORRECT, TOTAL POSITIVE, ACCURACY\n"
        )
    if cross_entropy:
        net = CENet(embed_dim)
        criterion = nn.CrossEntropyLoss()
    else:
        net = Net(embed_dim)
        criterion = nn.MSELoss()

    optim_scheduler = None
    if optim == 'adam':
        # Setting the Adam optimizer with hyper-param
        print("USING ADAM OPTIMIZER, LR: {}".format(lr))
        optimizer = AdamW(net.parameters(),
                          lr=lr,
                          betas=betas,
                          weight_decay=weight_decay)
        optim_scheduler = get_constant_schedule_with_warmup(
            optimizer, warmup_steps)
    elif optim == "sgd":
        print("USING SGD OPTIMIZER, LR: {}".format(lr))
        optimizer = SGD(net.parameters(), lr=lr)
        # Fix: removed a leftover unconditional pdb.set_trace() debugger call.
    else:
        raise ValueError("Unknown optimizer: {}".format(optim))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net = net.to(device)

    for epoch in range(epoch_num):
        # ---- training pass (with gradient updates) ----
        cumulative_loss = 0
        correct = 0
        samples = 0
        pos_correct = 0
        pos_total = 0
        scores_acc = []
        labels_acc = []
        data_iter = tqdm.tqdm(train_dataloader,
                              desc="EP_%s:%d" % ("train", epoch),
                              total=len(train_dataloader),
                              bar_format="{l_bar}{r_bar}")
        net.train()
        for inputs, labels in data_iter:
            labels_acc.append(labels)
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss, predictions, scores = _forward_scores(
                net, inputs, labels, criterion, cross_entropy)
            scores_acc.append(scores)
            loss.backward()
            optimizer.step()
            if optim_scheduler is not None:
                optim_scheduler.step()
            cumulative_loss += loss.item()
            correct += torch.sum(predictions == labels).item()
            samples += labels.shape[0]
            positive_inds = labels.nonzero(as_tuple=True)
            pos_correct += torch.sum(
                predictions[positive_inds] == labels[positive_inds]).item()
            pos_total += labels.nonzero().shape[0]
        auc_score = calc_auc(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        aupr_score = calc_aupr(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        _log_row(log_file, epoch, "train",
                 cumulative_loss / len(train_dataloader), correct, samples,
                 auc_score, aupr_score, pos_correct, pos_total)

        # ---- evaluation pass on the original training data ----
        (cumulative_loss, correct, samples, pos_correct, pos_total,
         labels_acc, scores_acc) = _eval_pass(net, train_orig_loader,
                                              criterion, cross_entropy,
                                              device, "train_orig", epoch)
        auc_score = calc_auc(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        aupr_score = calc_aupr(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        # Fix: average loss was divided by len(train_dataloader) even though
        # this pass iterates train_orig_loader.
        _log_row(log_file, epoch, "train_orig",
                 cumulative_loss / len(train_orig_loader), correct, samples,
                 auc_score, aupr_score, pos_correct, pos_total)

        # ---- evaluation pass on the test data ----
        (cumulative_loss, correct, samples, pos_correct, pos_total,
         labels_acc, scores_acc) = _eval_pass(net, test_dataloader, criterion,
                                              cross_entropy, device, "test",
                                              epoch)
        auc_score = calc_auc(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        aupr_score = calc_aupr(
            torch.cat(labels_acc).numpy(), torch.cat(scores_acc).numpy())
        _log_row(log_file, epoch, "test",
                 cumulative_loss / len(test_dataloader), correct, samples,
                 auc_score, aupr_score, pos_correct, pos_total)
def train(config, device):
    """Train the BERT_rank template scorer with MSE loss.

    Runs epoch-based training with periodic loss logging, a ROUGE-2-based
    validation pass per epoch (selecting the best-scoring template per
    example), best-checkpoint saving, and accuracy-plateau early stopping.

    Args:
        config: namespace carrying file paths, batch sizes, epochs,
            checkpoint/early-stop settings, n_gpu, template_num, etc.
        device: torch device to run on.
    """
    set_seed(config)
    rouge = Rouge()
    logger = get_logger(config.train_log)
    logger.info("Building model...")
    # data reading
    train_dataset = Dataset(config.train_dataset_file, config, 'train')
    train_it_num = len(train_dataset) // config.batch_size
    dev_dataset = Dataset(config.dev_dataset_file, config, 'dev')
    dev_it_num = len(dev_dataset) // config.val_batch_size
    # Template texts: one candidate title/template per line.
    with open(config.train_title, 'r') as f:
        template_txt = f.readlines()
    with open(config.dev_title, 'r') as f:
        dev_title = f.readlines()
    # define and import model
    # loss_function = TripletLoss(config.margin)
    model = BERT_rank(1, dropout=config.dropout).to(device)
    # setting parameter, optimizer and loss function:
    # BERT body gets lr 1e-5, all other params get lr 1e-2.
    bert_model = list(map(id, model.bert.parameters()))
    other_params = filter(lambda p: id(p) not in bert_model,
                          model.parameters())
    # no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": model.bert.parameters(),
            "weight_decay": 0.03,
            'lr': 1e-5
        },
        {
            "params": other_params,
            "weight_decay": 0.03,
            'lr': 1e-2
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, eps=1e-8)
    # optimizer = optim.SGD(params=param_optimizer, lr=config.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=2000,
                                                gamma=0.1)
    loss_func = torch.nn.MSELoss()
    if config.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    if config.model:
        # Resume from an existing checkpoint.
        model.load_state_dict(
            torch.load(os.path.join(config.save_dir, config.model)))
    model.train()
    # training parameter
    steps = 0
    patience = 0
    losses = 0
    # NOTE(review): despite the name, min_loss tracks the best (maximum)
    # dev ROUGE-2 accuracy seen so far.
    min_loss = 0
    start_time = time.time()
    valid_losses = 0
    # begin training
    for epoch in range(config.epochs):
        batches = train_dataset.gen_batches(config.batch_size, shuffle=True)
        for batch in tqdm(batches, total=train_it_num):
            optimizer.zero_grad()
            (input_ids, token_type_ids, attention_mask, tp_idx,
             scores) = batch
            # article_input_ids, article_input_mask = article_input_ids.to(device), article_input_mask.to(device)
            # title_input_ids, title_input_mask = title_input_ids.to(device), title_input_mask.to(device)
            # template_input_ids, template_input_mask = template_input_ids.to(device), template_input_mask.to(device)
            input_ids = input_ids.to(device)
            token_type_ids = token_type_ids.to(device)
            attention_mask = attention_mask.to(device)
            scores = scores.to(device)
            # predicting score
            # gold_score = model(article_input_ids, article_input_mask, title_input_ids, title_input_mask)
            # neg_score = model(article_input_ids, article_input_mask, template_input_ids, template_input_mask)
            # y = torch.ones_like(gold_score)
            # print(scores.shape)
            pred = model((input_ids, token_type_ids, attention_mask))
            # print(loss.shape)
            # print(pred.shape)
            loss = loss_func(scores, pred)
            if config.n_gpu > 1:
                loss = loss.mean()
            losses += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           config.grad_clip)
            optimizer.step()
            scheduler.step()
            # testing in validation every few steps
            if (steps + 1) % config.checkpoint == 0:
                # Log the running-average training loss, then reset it.
                losses = losses / config.checkpoint
                logger.info(f"Iteration {steps} train loss {losses}")
                losses = 0
            steps += 1
        # ---- per-epoch validation: pick the best template per example ----
        batches = dev_dataset.gen_batches(config.val_batch_size,
                                          shuffle=False)
        template = []
        # NOTE(review): `net` is only assigned inside this branch; on a
        # single-GPU run (no DataParallel) the next line raises NameError —
        # confirm whether an `else: net = model` was intended.
        if isinstance(model, torch.nn.DataParallel):
            net = model.module
        net = net.to(device)
        net.eval()
        with torch.no_grad():
            for batch in tqdm(batches, total=dev_it_num):
                (input_ids, token_type_ids, attention_mask, tp_idx,
                 scores) = batch
                # article_input_ids, article_input_mask = article_input_ids.to(device), article_input_mask.to(device)
                # title_input_ids, title_input_mask = title_input_ids.to(device), title_input_mask.to(device)
                # template_input_ids, template_input_mask = template_input_ids.to(device), template_input_mask.to(device)
                input_ids = input_ids.to(device)
                token_type_ids = token_type_ids.to(device)
                attention_mask = attention_mask.to(device)
                scores = scores.to(device)
                # predicting score
                # gold_score = model(article_input_ids, article_input_mask, title_input_ids, title_input_mask)
                # neg_score = model(article_input_ids, article_input_mask, template_input_ids,
                #                   template_input_mask)
                # y = torch.ones_like(gold_score)
                pair_CLS = net.encode(
                    (input_ids, token_type_ids, attention_mask))
                pred = net.scoring(pair_CLS)
                # loss = loss_func(pred, scores)
                # if config.n_gpu > 1:
                #     loss = loss.mean()
                # valid_losses += loss.item()
                # One score per candidate template; pick the argmax per example.
                temp_scores = pred.view(-1, config.template_num)
                # print(temp_scores.shape)
                _, index = torch.max(temp_scores, dim=1)
                for i in range(len(index)):
                    # Map (example i, best template) back to a flat index.
                    idx = index[i] + config.template_num * i
                    tid = int(tp_idx[idx])
                    template.append(template_txt[tid])
        # valid_losses /= dev_it_num
        # Dev metric: ROUGE-2 F1 of selected templates vs reference titles.
        accuracy = rouge.get_scores(template, dev_title,
                                    avg=True)['rouge-2']['f']
        # NOTE(review): 'epcohs' is a typo in this runtime log message
        # (left as-is: a doc-only edit must not change runtime strings).
        logger.info(f'epcohs {epoch} dev rouge-2 f {accuracy}')
        # early stop: save on improvement, otherwise count patience.
        if accuracy > min_loss:
            patience = 0
            min_loss = accuracy
            fn = os.path.join(config.save_dir, f"model_final.pkl")
            torch.save(model.state_dict(), fn)
        else:
            print(f"patience is {patience}")
            patience += 1
            if patience > config.early_stop:
                logger.info(
                    'early stop because val loss is continue increasing!')
                end_time = time.time()
                logger.info(f"total training time {end_time - start_time}")
                exit()
        # valid_losses = 0
    # Final save after all epochs (overwrites the best checkpoint).
    fn = os.path.join(config.save_dir, "model_final.pkl")
    torch.save(model.state_dict(), fn)
class MAML:
    """First-order MAML-style meta-learner for a binary relation-ranking task.

    Wraps a `TransformerClsModel` prediction network (PN) and trains it with
    an inner loop (SGD, differentiable via `higher`) on support batches and an
    outer loop that accumulates meta-gradients from query batches into the
    real parameters before an `AdamW` meta-optimizer step. A `ReplayMemory`
    interleaves previously seen examples into the query set for continual
    learning.
    """

    def __init__(self, device, **kwargs):
        """Build the PN, both optimizers, the replay memory, and the loss.

        Args:
            device: torch device the PN and targets are moved to.
            **kwargs: expects 'inner_lr', 'meta_lr', 'write_prob',
                'replay_rate', 'replay_every', 'model', 'max_length'.
                NOTE(review): missing keys silently become None via
                `kwargs.get` — verify callers always supply them.
        """
        self.inner_lr = kwargs.get('inner_lr')
        self.meta_lr = kwargs.get('meta_lr')
        self.write_prob = kwargs.get('write_prob')
        self.replay_rate = kwargs.get('replay_rate')
        self.replay_every = kwargs.get('replay_every')
        self.device = device

        # Prediction network: single-logit classifier used for pairwise ranking
        # (n_classes=1 pairs with BCEWithLogitsLoss below).
        self.pn = TransformerClsModel(model_name=kwargs.get('model'),
                                      n_classes=1,
                                      max_length=kwargs.get('max_length'),
                                      device=device)
        logger.info('Loaded {} as PN'.format(self.pn.__class__.__name__))

        # Outer-loop (meta) optimizer over all trainable PN parameters.
        meta_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.meta_optimizer = AdamW(meta_params, lr=self.meta_lr)

        # tuple_size=3 matches the (text, label, candidates) triples
        # written/read in training/evaluate.
        self.memory = ReplayMemory(write_prob=self.write_prob, tuple_size=3)
        self.loss_fn = nn.BCEWithLogitsLoss()

        # Inner-loop optimizer; `higher.innerloop_ctx` later wraps it into a
        # differentiable optimizer over a functional copy of the PN.
        inner_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.inner_optimizer = optim.SGD(inner_params, lr=self.inner_lr)

    def save_model(self, model_path):
        """Serialize the PN's state dict to `model_path`."""
        checkpoint = self.pn.state_dict()
        torch.save(checkpoint, model_path)

    def load_model(self, model_path):
        """Restore the PN's state dict from `model_path`."""
        checkpoint = torch.load(model_path)
        self.pn.load_state_dict(checkpoint)

    def evaluate(self, dataloader, updates, mini_batch_size):
        """Adapt on `updates` support batches drawn from replay memory, then
        score every batch in `dataloader` with the adapted copy.

        Returns:
            Accuracy over the full `dataloader` (float).
        """
        # NOTE(review): PN is put in train() mode for evaluation — dropout and
        # batch-norm (if any) stay active; confirm this is intentional.
        self.pn.train()

        # Support set comes from replay memory, not from `dataloader`.
        support_set = []
        for _ in range(updates):
            text, label, candidates = self.memory.read_batch(batch_size=mini_batch_size)
            support_set.append((text, label, candidates))

        # track_higher_grads=False: no second-order graph is kept, so the
        # inner loop is a plain (first-order) adaptation of a PN copy `fpn`;
        # `self.pn` itself is left untouched here.
        with higher.innerloop_ctx(self.pn, self.inner_optimizer,
                                  copy_initial_weights=False,
                                  track_higher_grads=False) as (fpn, diffopt):
            # Inner loop: one differentiable SGD step per support batch.
            task_predictions, task_labels = [], []
            support_loss = []
            for text, label, candidates in support_set:
                # Expand each example into (text, candidate-relation) pairs
                # with binary ranking labels. Exact output layout is defined
                # by datasets.utils.replicate_rel_data — not visible here.
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text, label, candidates)
                input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                output = fpn(input_dict)
                targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                loss = self.loss_fn(output, targets)
                diffopt.step(loss)
                pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                support_loss.append(loss.item())
                task_predictions.extend(pred.tolist())
                task_labels.extend(true_labels.tolist())
            acc = models.utils.calculate_accuracy(task_predictions, task_labels)
            logger.info('Support set metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(np.mean(support_loss), acc))

            # Query phase: score the real evaluation data with the adapted
            # copy `fpn`, without building autograd graphs.
            all_losses, all_predictions, all_labels = [], [], []
            for text, label, candidates in dataloader:
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text, label, candidates)
                with torch.no_grad():
                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)
                loss = loss.item()
                pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                all_losses.append(loss)
                all_predictions.extend(pred.tolist())
                all_labels.extend(true_labels.tolist())

        acc = models.utils.calculate_accuracy(all_predictions, all_labels)
        logger.info('Test metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(np.mean(all_losses), acc))
        return acc

    def training(self, train_datasets, **kwargs):
        """Meta-train the PN over the concatenation of `train_datasets`.

        Each episode: `updates` support batches adapt a PN copy (inner loop),
        then one query batch — or a burst of replayed batches every
        `replay_freq` episodes — supplies meta-gradients that are accumulated
        into `self.pn` and applied by the meta optimizer.

        Args:
            train_datasets: list of datasets, concatenated and streamed once
                (shuffle=False); training ends when the stream is exhausted.
            **kwargs: expects 'updates' and 'mini_batch_size'.
        """
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')

        if self.replay_rate != 0:
            # Derive how often (in episodes) to replay, and how many replayed
            # batches to draw each time, from the replay hyper-parameters.
            replay_batch_freq = self.replay_every // mini_batch_size
            replay_freq = int(math.ceil((replay_batch_freq + 1) / (updates + 1)))
            replay_steps = int(self.replay_every * self.replay_rate / mini_batch_size)
        else:
            replay_freq = 0
            replay_steps = 0
        logger.info('Replay frequency: {}'.format(replay_freq))
        logger.info('Replay steps: {}'.format(replay_steps))

        concat_dataset = data.ConcatDataset(train_datasets)
        # Single pass over the data; `next()` on this iterator drives both the
        # support and query sampling below.
        train_dataloader = iter(data.DataLoader(concat_dataset,
                                                batch_size=mini_batch_size,
                                                shuffle=False,
                                                collate_fn=datasets.utils.rel_encode))

        episode_id = 0
        while True:
            self.inner_optimizer.zero_grad()
            # NOTE(review): `support_acc` is initialized but never appended to
            # or read — looks like dead state.
            support_loss, support_acc = [], []

            with higher.innerloop_ctx(self.pn, self.inner_optimizer,
                                      copy_initial_weights=False,
                                      track_higher_grads=False) as (fpn, diffopt):
                # ---- Inner loop: adapt the copy `fpn` on fresh batches ----
                support_set = []
                task_predictions, task_labels = [], []
                for _ in range(updates):
                    try:
                        text, label, candidates = next(train_dataloader)
                        support_set.append((text, label, candidates))
                    except StopIteration:
                        logger.info('Terminating training as all the data is seen')
                        return

                for text, label, candidates in support_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text, label, candidates)
                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)
                    diffopt.step(loss)
                    pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)
                    support_loss.append(loss.item())
                    task_predictions.extend(pred.tolist())
                    task_labels.extend(true_labels.tolist())
                    # Support examples are candidates for future replay.
                    self.memory.write_batch(text, label, candidates)

                acc = models.utils.calculate_accuracy(task_predictions, task_labels)
                logger.info('Episode {} support set: Loss = {:.4f}, accuracy = {:.4f}'.format(episode_id + 1, np.mean(support_loss), acc))

                # ---- Outer loop: build the query set ----
                query_loss, query_acc = [], []
                query_set = []

                if self.replay_rate != 0 and (episode_id + 1) % replay_freq == 0:
                    # Replay episode: query set comes from memory.
                    for _ in range(replay_steps):
                        text, label, candidates = self.memory.read_batch(batch_size=mini_batch_size)
                        query_set.append((text, label, candidates))
                else:
                    # Regular episode: one fresh batch, also written to memory.
                    try:
                        text, label, candidates = next(train_dataloader)
                        query_set.append((text, label, candidates))
                        self.memory.write_batch(text, label, candidates)
                    except StopIteration:
                        logger.info('Terminating training as all the data is seen')
                        return

                for text, label, candidates in query_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(text, label, candidates)
                    input_dict = self.pn.encode_text(list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(1).to(self.device)
                    loss = self.loss_fn(output, targets)
                    query_loss.append(loss.item())
                    pred, true_labels = models.utils.make_rel_prediction(output, ranking_label)

                    acc = models.utils.calculate_accuracy(pred.tolist(), true_labels.tolist())
                    query_acc.append(acc)

                    # PN meta gradients: differentiate the query loss w.r.t.
                    # the adapted copy's parameters and accumulate the result
                    # into the real PN's .grad fields (first-order update,
                    # consistent with track_higher_grads=False above).
                    pn_params = [p for p in fpn.parameters() if p.requires_grad]
                    meta_pn_grads = torch.autograd.grad(loss, pn_params)
                    pn_params = [p for p in self.pn.parameters() if p.requires_grad]
                    for param, meta_grad in zip(pn_params, meta_pn_grads):
                        if param.grad is not None:
                            param.grad += meta_grad.detach()
                        else:
                            param.grad = meta_grad.detach()

                # Meta optimizer step on the accumulated meta-gradients.
                self.meta_optimizer.step()
                self.meta_optimizer.zero_grad()

                logger.info('Episode {} query set: Loss = {:.4f}, accuracy = {:.4f}'.format(episode_id + 1, np.mean(query_loss), np.mean(query_acc)))
                episode_id += 1

    def testing(self, test_dataset, **kwargs):
        """Evaluate on `test_dataset` after replay-based adaptation.

        Args:
            test_dataset: dataset to score.
            **kwargs: expects 'updates' and 'mini_batch_size'.

        Returns:
            Accuracy (float) as reported by `evaluate`.
        """
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')
        test_dataloader = data.DataLoader(test_dataset,
                                          batch_size=mini_batch_size,
                                          shuffle=False,
                                          collate_fn=datasets.utils.rel_encode)
        acc = self.evaluate(dataloader=test_dataloader, updates=updates, mini_batch_size=mini_batch_size)
        logger.info('Overall test metrics: Accuracy = {:.4f}'.format(acc))
        return acc