def train(train_iter, val_iter, model):
    opt = AdamW(model.parameters(), lr=1e-4, eps=1e-8)
    scheduler = WarmupLinearSchedule(opt, warmup_steps=20, t_total=2500)
    model.train()
    losses = []
    for i, ex in enumerate(train_iter):
        opt.zero_grad()
        words, mapper, _ = ex.word
        label, lengths = ex.head
        batch, _ = label.shape

        # Model: arc scores of shape (batch, N, N).
        final = model(words.cuda(), mapper)

        # Zero out scores at padded positions.
        for b in range(batch):
            final[b, lengths[b] - 1:, :] = 0
            final[b, :, lengths[b] - 1:] = 0

        # Skip malformed batches whose lengths exceed the score matrix.
        if not lengths.max() <= final.shape[1] + 1:
            print("fail")
            continue

        dist = DependencyCRF(final, lengths=lengths)
        labels = dist.struct.to_parts(label, lengths=lengths).type_as(final)
        log_prob = dist.log_prob(labels)

        # loss is the summed tree log-likelihood; minimize its negation.
        loss = log_prob.sum()
        (-loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        scheduler.step()
        losses.append(loss.detach())

        if i % 50 == 1:
            print(-torch.tensor(losses).mean(), words.shape)
            losses = []
        if i % 600 == 500:
            validate(val_iter)
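# The per-example masking loop above can be vectorized. Below is a minimal
# sketch of the equivalent masking with a broadcast length mask
# (`mask_scores` is an illustrative helper, not part of the original code);
# shapes follow the loop: `final` is (batch, N, N), `lengths` holds the true
# sentence lengths.
import torch

def mask_scores(final: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Zero arc scores at positions >= length - 1, as the loop above does."""
    N = final.shape[1]
    lengths = lengths.to(final.device)
    keep = torch.arange(N, device=final.device)[None, :] < (lengths[:, None] - 1)
    final = final * keep[:, None, :].type_as(final)  # mask dependent positions
    final = final * keep[:, :, None].type_as(final)  # mask head positions
    return final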
def train(self):
    '''Training entry point: run validation once per round, then train.'''
    optimizer = AdamW(
        [p for p in self.model.parameters() if p.requires_grad], lr=1e-5)
    for round_num in range(self.ROUND):
        # Validate once per round.
        print(f'**** now round {round_num} valid begin:')
        self._eval(self.valid_loader)

        # Train.
        for step, batch in enumerate(self.train_loader):
            self.model.train()
            deep_apply_dict(batch, lambda _, v: v.to(self.DEVICE))
            y = batch.pop('y').view(-1)
            res = self.model.forward(**batch)
            res = res.view(-1, res.size(-1))
            loss = F.cross_entropy(res, y)
            print(f'[round: {round_num}]: {step}/{len(self.train_loader)} end. '
                  f'loss: {loss.item()}')
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
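# `deep_apply_dict` is an assumed helper here: it must mutate the dict in
# place (the loop pops 'y' from `batch` afterwards), applying `fn(key, value)`
# to every leaf. A minimal sketch consistent with that usage:
def deep_apply_dict(d, fn):
    for k, v in d.items():
        if isinstance(v, dict):
            deep_apply_dict(v, fn)
        else:
            d[k] = fn(k, v)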
def main():
    parser = argparse.ArgumentParser(description='openGPT-2 analysis')
    parser.add_argument('--mode',
                        choices=['train', 'eval-singletoken', 'eval-completion', 'eval-both'],
                        default='eval-singletoken')
    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument('--model-name',
                        choices=['gpt2', 'gpt2-medium', 'gpt2-large'],
                        default='gpt2-medium')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--data-base', type=str)
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--batch-size-singletoken', type=int, default=1024)
    parser.add_argument('--batch-size-completion', type=int, default=300)
    parser.add_argument("--output-dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)
    parser.add_argument('--train-batch-size', type=int, default=300)
    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)
    parser.add_argument('--train-n-steps', type=int, default=10000)
    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max-grad-norm', type=int, default=1)
    parser.add_argument("--max-steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. "
                             "Overrides num_train_epochs.")
    parser.add_argument('--gradient-accumulation-steps', type=int, default=1,
                        help="Number of update steps to accumulate before "
                             "performing a backward/update pass.")
    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    dataset_paths = {
        'train': os.path.join(args.data_base, 'train_tokens_bpe_gpt2.pt'),
        'valid': os.path.join(args.data_base, 'valid_tokens_bpe_gpt2.pt'),
        'test': os.path.join(args.data_base, 'test_tokens_bpe_gpt2.pt'),
    }

    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)
    model.to(device)

    if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)

    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths, max_len=args.batch_size_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split],
                                     sampler=eval_sampler, batch_size=1)

        model.eval()
        with torch.no_grad():
            all_text_completions = []
            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader),
                                 desc="Evaluating",
                                 total=len(eval_dataloader)):
                input_sequence = batch[0].cuda()
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(input_sequence,
                                                              args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch,
                                                     args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(bpe_completion[args.prefix_length:])
                    text_continuations.append(
                        get_text_continuation(bpe_completion, tokenizer, args))
                    all_text_completions.append(tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram.
                # (A continuation that is normal length in BPE tokens may become
                # short after predicted whitespace is decoded and re-tokenized.)
                text_continuations = [c for c in text_continuations if len(c) > 3]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

                # Save the (possibly intermediate) metrics.
                save_completion_metrics(
                    bpe_metrics=bpe_ngram_metrics.report('bpe_%s' % args.eval_split),
                    word_metrics=word_ngram_metrics.report('word_%s' % args.eval_split),
                    text_completions=all_text_completions,
                    config=model.config.to_dict(),
                    args=args)

    if args.mode == 'train':
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss

        datasets = get_datasets(dataset_paths, max_len=args.train_batch_size)
        train_sampler = RandomSampler(datasets['train'])
        train_seq_dataloader = DataLoader(datasets['train'],
                                          sampler=train_sampler, batch_size=1)

        # Setup optimizer
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_seq_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_seq_dataloader) \
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader, desc="Training",
                            total=args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                optimizer.zero_grad()

                # Sequence loss
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)
                # Token loss
                else:
                    loss, batch_metrics = token_loss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    epoch_loss / epoch_steps, scheduler.get_lr()[0])

                logging_outputs.append(batch_metrics)
                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
                        logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(
                        logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    logging_average['ppl'] = 2 ** logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                if step == args.train_n_steps:
                    break

                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(model, args, dataset_paths,
                                                          train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(), output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs,
                                                 model.config.to_dict(), args,
                                                 train_iter=total_steps, best=True)
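# The bias/LayerNorm no-decay grouping above recurs in most of the training
# loops in this file. A small helper capturing the same pattern (a reference
# sketch, not part of any of the original snippets):
def grouped_parameters(model, weight_decay,
                       no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    named = list(model.named_parameters())
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]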
def train(self, model, model_name, B, N_for_train, N_for_eval, K, Q,
          na_rate=0, learning_rate=1e-1, lr_step_size=20000,
          weight_decay=1e-5, train_iter=30000, val_iter=1000,
          val_step=2000, test_iter=3000, load_ckpt=None, save_ckpt=None,
          pytorch_optim=optim.SGD, bert_optim=False, warmup=True,
          warmup_step=300, grad_iter=1, fp16=False, pair=False,
          adv_dis_lr=1e-1, adv_enc_lr=1e-1):
    '''
    model: a FewShotREModel instance
    model_name: name of the model
    B: batch size
    N_for_train / N_for_eval: num of classes for each batch
    K: num of instances for each class in the support set
    Q: num of instances for each class in the query set
    learning_rate: initial learning rate
    lr_step_size: decay learning rate every lr_step_size steps
    weight_decay: rate of weight decay
    train_iter: num of iterations of training
    val_iter: num of iterations of validating
    val_step: validate every val_step steps
    test_iter: num of iterations of testing
    '''
    print("Start training...")

    # Init
    if bert_optim:
        print('Use bert optim!')
        parameters_to_optimize = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        parameters_to_optimize = [
            {'params': [p for n, p in parameters_to_optimize
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in parameters_to_optimize
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(parameters_to_optimize, lr=2e-5, correct_bias=False)
        if self.adv:
            optimizer_encoder = AdamW(parameters_to_optimize, lr=1e-5,
                                      correct_bias=False)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step,
                                         t_total=train_iter)
    else:
        optimizer = pytorch_optim(model.parameters(), learning_rate,
                                  weight_decay=weight_decay)
        if self.adv:
            optimizer_encoder = pytorch_optim(model.parameters(), lr=adv_enc_lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size)

    if self.adv:
        optimizer_dis = pytorch_optim(self.d.parameters(), lr=adv_dis_lr)

    if load_ckpt:
        state_dict = self.__load_model__(load_ckpt)['state_dict']
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
    start_iter = 0

    if fp16:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    model.train()
    if self.adv:
        self.d.train()

    # Training
    best_acc = 0
    not_best_count = 0  # stop training after several epochs without improvement
    iter_loss = 0.0
    iter_loss_dis = 0.0
    iter_right = 0.0
    iter_right_dis = 0.0
    iter_sample = 0.0
    for it in range(start_iter, start_iter + train_iter):
        if pair:
            batch, label = next(self.train_data_loader)
            if torch.cuda.is_available():
                for k in batch:
                    batch[k] = batch[k].cuda()
                label = label.cuda()
            logits, pred = model(batch, N_for_train, K,
                                 Q * N_for_train + na_rate * Q)
        else:
            support, query, label = next(self.train_data_loader)
            if torch.cuda.is_available():
                for k in support:
                    support[k] = support[k].cuda()
                for k in query:
                    query[k] = query[k].cuda()
                label = label.cuda()
            logits, pred = model(support, query, N_for_train, K,
                                 Q * N_for_train + na_rate * Q)

        loss = model.loss(logits, label) / float(grad_iter)
        right = model.accuracy(pred, label)
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 10)
        else:
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 10)

        if it % grad_iter == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Adversarial part (assumes pair=False: `support` comes from the branch above).
        if self.adv:
            support_adv = next(self.adv_data_loader)
            if torch.cuda.is_available():
                for k in support_adv:
                    support_adv[k] = support_adv[k].cuda()

            features_ori = model.sentence_encoder(support)
            features_adv = model.sentence_encoder(support_adv)
            features = torch.cat([features_ori, features_adv], 0)
            total = features.size(0)
            dis_labels = torch.cat([torch.zeros(total // 2).long().cuda(),
                                    torch.ones(total // 2).long().cuda()], 0)
            dis_logits = self.d(features)

            # Train the discriminator to tell original from adversarial support sets.
            loss_dis = self.adv_cost(dis_logits, dis_labels)
            _, pred_dis = dis_logits.max(-1)
            right_dis = float((pred_dis == dis_labels).long().sum()) / float(total)
            loss_dis.backward(retain_graph=True)
            optimizer_dis.step()
            optimizer_dis.zero_grad()
            optimizer_encoder.zero_grad()

            # Train the encoder to fool the discriminator (flipped labels).
            loss_encoder = self.adv_cost(dis_logits, 1 - dis_labels)
            loss_encoder.backward(retain_graph=True)
            optimizer_encoder.step()
            optimizer_dis.zero_grad()
            optimizer_encoder.zero_grad()

            iter_loss_dis += self.item(loss_dis.data)
            iter_right_dis += right_dis

        iter_loss += self.item(loss.data)
        iter_right += self.item(right.data)
        iter_sample += 1
        if self.adv:
            sys.stdout.write(
                'step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%, '
                'dis_loss: {3:2.6f}, dis_acc: {4:2.6f}'.format(
                    it + 1, iter_loss / iter_sample,
                    100 * iter_right / iter_sample,
                    iter_loss_dis / iter_sample,
                    100 * iter_right_dis / iter_sample) + '\r')
        else:
            sys.stdout.write(
                'step: {0:4} | loss: {1:2.6f}, accuracy: {2:3.2f}%'.format(
                    it + 1, iter_loss / iter_sample,
                    100 * iter_right / iter_sample) + '\r')
        sys.stdout.flush()

        if (it + 1) % val_step == 0:
            acc = self.eval(model, B, N_for_eval, K, Q, val_iter,
                            na_rate=na_rate, pair=pair)
            model.train()
            if acc > best_acc:
                print('Best checkpoint')
                torch.save({'state_dict': model.state_dict()}, save_ckpt)
                best_acc = acc
            iter_loss = 0.
            iter_loss_dis = 0.
            iter_right = 0.
            iter_right_dis = 0.
            iter_sample = 0.

    print("\n####################\n")
    print("Finish training " + model_name)
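# A compressed sketch of the adversarial block above, with hypothetical names
# (`d` is any binary domain classifier, `opt_d`/`opt_enc` are its optimizer
# and the encoder's). The discriminator is trained on the true domain labels,
# then the encoder is updated against flipped labels so its features become
# domain-indistinguishable:
import torch
import torch.nn.functional as F

def adversarial_step(features, d, opt_d, opt_enc):
    total = features.size(0)
    labels = torch.cat([torch.zeros(total // 2, dtype=torch.long),
                        torch.ones(total // 2, dtype=torch.long)]).to(features.device)
    logits = d(features)
    # Discriminator step: classify original (0) vs adversarial (1) features.
    F.cross_entropy(logits, labels).backward(retain_graph=True)
    opt_d.step()
    opt_d.zero_grad()
    opt_enc.zero_grad()
    # Encoder step: same logits, flipped labels.
    F.cross_entropy(logits, 1 - labels).backward()
    opt_enc.step()
    opt_d.zero_grad()
    opt_enc.zero_grad()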
class Trainer(object):
    def __init__(self, args):
        self.args = args
        self.train_batch_num = args['train_batch']
        self.Dataloader = Dataloader(args)
        print("preparing the train_data")
        self.train_data = self.Dataloader.load_train_batches()
        print("preparing the val_data")
        print("train data len:", len(self.train_data) * self.train_batch_num)
        self.cuda_gpu = torch.cuda.is_available() and args['use_gpu']
        print("building model:")
        self.global_model = Global_Model(args)
        if self.cuda_gpu:
            # torch.nn.DataParallel(self.global_model, device_ids=gpus).cuda()
            self.global_model = self.global_model.cuda()
        self.global_optimer = AdamW(self.global_model.parameters(),
                                    lr=args['global_lr'])
        num_total_steps = len(self.train_data) * args['global_epoch']
        num_warmup_steps = int(args['global_warmup_rate'] * num_total_steps)
        self.global_scheduler = WarmupLinearSchedule(
            self.global_optimer,
            warmup_steps=num_warmup_steps,
            t_total=num_total_steps)

    def train_global(self):
        epoches = self.args['global_epoch']
        times = time()
        max_acc = 0
        for epoch in range(epoches):
            train_datas = self.Dataloader.load_train_batches()
            loss_list = []
            acc_list = []
            times = time()
            for batch in range(len(train_datas)):
                text_feature = torch.tensor(train_datas[batch]["text"])
                text_pos_feature = torch.tensor(train_datas[batch]["pos_text"])
                text_neg_feature = torch.tensor(train_datas[batch]["neg_text"])
                if self.cuda_gpu:
                    text_feature = text_feature.cuda()
                    text_pos_feature = text_pos_feature.cuda()
                    text_neg_feature = text_neg_feature.cuda()
                text_emb = self.global_model(text_feature)
                pos_emb = self.global_model(text_pos_feature)
                neg_emb = self.global_model(text_neg_feature)

                # Euclidean distances drive the loss; cosine similarities are
                # only logged for monitoring.
                pos_dis = euclidean_distance(text_emb, pos_emb)
                neg_dis = euclidean_distance(text_emb, neg_emb)
                pos_origin_dis = torch.cosine_similarity(text_emb, pos_emb, dim=1)
                neg_origin_dis = torch.cosine_similarity(text_emb, neg_emb, dim=1)
                mean_pos_dis = torch.mean(pos_origin_dis).detach().cpu().numpy()
                mean_neg_dis = torch.mean(neg_origin_dis).detach().cpu().numpy()

                acc = torch.mean(dis_acc(pos_dis, neg_dis)).detach().cpu().numpy()
                acc_list.append(acc)
                loss = triplet_loss(pos_dis, neg_dis)
                loss_np = loss
                if self.cuda_gpu:
                    loss_np = loss_np.cpu()
                loss_np = loss_np.detach().numpy()
                loss_list.append(loss_np)

                self.global_optimer.zero_grad()
                loss.backward()
                self.global_optimer.step()
                self.global_scheduler.step()

                if batch % 200 == 0:
                    print("batch: %d loss:%.4f acc:%.4f pos_dis:%f neg_dis:%f " %
                          (batch, loss_np, acc, mean_pos_dis, mean_neg_dis))

            mean_acc = np.mean(acc_list)
            print("epoch:%d loss:%.4f acc:%.4f time:[%.2fs]" %
                  (epoch, np.mean(loss_list), mean_acc, time() - times))
            torch.save(self.global_model.state_dict(),
                       self.args['global_model_save_path'])

        self.writeglobal_features()
        print("training complete!")

    def writeglobal_features(self):
        print("writing train features:")
        train_feature_data = self.Dataloader.train_features
        val_feature = self.Dataloader.val_features
        test_feature = self.Dataloader.test_features

        train_out_path = self.args['feature_global_train_path']
        global_feature_dict = {}
        for key in train_feature_data:
            input_tensor = np.array(train_feature_data[key].values)
            input_tensor = torch.tensor([input_tensor])
            output_tensor = self.global_model(input_tensor)
            global_feature_dict[key] = output_tensor[0].detach().numpy()
        pd.to_pickle(global_feature_dict, train_out_path)

        val_out_path = self.args['feature_global_val_path']
        val_feature_dict = {}
        for key in val_feature:
            input_tensor = np.array(val_feature[key].values)
            input_tensor = torch.tensor([input_tensor])
            output_tensor = self.global_model(input_tensor)
            val_feature_dict[key] = output_tensor[0].detach().numpy()
        pd.to_pickle(val_feature_dict, val_out_path)

        test_out_path = self.args['feature_global_test_path']
        test_feature_dict = {}
        for key in test_feature:
            input_tensor = np.array(test_feature[key].values)
            input_tensor = torch.tensor([input_tensor])
            output_tensor = self.global_model(input_tensor)
            test_feature_dict[key] = output_tensor[0].detach().numpy()
        pd.to_pickle(test_feature_dict, test_out_path)
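# `euclidean_distance`, `triplet_loss`, and `dis_acc` are assumed helpers in
# the trainer above; minimal sketches consistent with how they are called
# (the margin value is illustrative):
import torch

def euclidean_distance(a, b):
    return torch.norm(a - b, dim=1)

def triplet_loss(pos_dis, neg_dis, margin=1.0):
    return torch.clamp(pos_dis - neg_dis + margin, min=0).mean()

def dis_acc(pos_dis, neg_dis):
    # A triple is "correct" when the positive is closer than the negative.
    return (pos_dis < neg_dis).float()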
def train(args, train_iter, dev, test, src_field, tgt_field, tag_field, checkpoint):
    # srcpadid = src_field.vocab.stoi['<pad>']
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = Classify_Extractor(args, tgt_field)
    if torch.cuda.is_available():
        model.cuda()
    print_params(model)

    decay = args.decay
    if args.optimizer == 'bert':
        weight_decay = 0.0
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        opt = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)

        totalnum = 0
        for i in train_iter:
            totalnum += 1
        t_total = totalnum // decay * args.maximum_steps
        scheduler = WarmupLinearSchedule(opt, warmup_steps=0, t_total=t_total)
    else:
        opt = torch.optim.Adadelta(model.parameters(), lr=args.lr)

    best_e = 0.0
    best_c = 0.0
    best_epoch_for_c = 0
    best_epoch_for_e = 0
    offset = 0.0
    pre_epoch = 0
    patience_c = 0
    patience_e = 0

    if checkpoint is not None:
        print('model.load_state_dict(checkpoint[model])')
        model.load_state_dict(checkpoint['model'])
        if args.resume:
            opt.load_state_dict(checkpoint['optim'])
            best_f = checkpoint['f']
            offset = checkpoint['iters']
            pre_epoch = checkpoint['epoch']
            print('*************************************')
            print('resume from {} epoch {} iters and best_f {}'.format(
                pre_epoch, offset, best_f))
            print('*************************************')

    print("**************start training****************")
    start = time.time()
    for epoch in range(args.maxepoch):
        train_iter.init_epoch()
        epoch += pre_epoch
        for iters, train_batch in enumerate(train_iter):
            iters += offset
            model.train()

            t1 = time.time()
            batch_src = train_batch.src
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
            maxlen = max(len(s) for s in batch_src)
            src_mask = []
            padded_sents = []
            for s in src:
                padded_sents.append(s + [0] * (maxlen - len(s)))
                src_mask.append([1] * len(s) + [0] * (maxlen - len(s)))
            src = torch.tensor(padded_sents).long().cuda()   # B x T
            src_mask = torch.tensor(src_mask).byte().cuda()  # B x T

            tgt = prepare_tgt(train_batch.tgt)
            tag = train_batch.tag

            loss = model(src, src_mask, tgt, tag)

            # Update parameters (accumulate gradients over `decay` batches).
            if decay > 1:
                loss = loss / decay
            loss.backward()
            if (iters + 1) % decay == 0:
                opt.step()
                if args.optimizer == 'bert':
                    scheduler.step()  # scheduler only exists on the AdamW path
                opt.zero_grad()

            t2 = time.time()
            loss = loss.item()
            print("epoch:{} iters:{} src:({},{}) tgt:({},{}) "
                  "loss:{:.2f} t:{:.2f}".format(epoch + 1, iters + 1,
                                                *src.size(), *tgt.size(),
                                                loss, t2 - t1))

        if (epoch + 1) % 1 == 0:
            print("=============validate model==============")
            with torch.no_grad():
                dev.init_epoch()
                model.eval()
                sents = []
                cy_true = []
                cy_pred = []
                for j, dev_batch in enumerate(dev):
                    t1 = time.time()
                    batch_src = dev_batch.src
                    src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
                    maxlen = max(len(s) for s in batch_src)
                    src_mask = []
                    padded_sents = []
                    for s in src:
                        padded_sents.append(s + [0] * (maxlen - len(s)))
                        src_mask.append([1] * len(s) + [0] * (maxlen - len(s)))
                    src = torch.tensor(padded_sents).long().cuda()   # B x T
                    src_mask = torch.tensor(src_mask).byte().cuda()  # B x T

                    tgt = prepare_tgt(dev_batch.tgt)
                    tag = dev_batch.tag.squeeze(-1)

                    _, pre_tag = model.component_extraction(src, src_mask)
                    pre_ctag = model.simile_classify(src, src_mask)
                    cy_true.extend(tag.tolist())
                    cy_pred.extend(pre_ctag.tolist())

                    # On dev, extraction is scored only on gold-simile sentences.
                    for sen, tags, p_tags, c_tags in zip(src, tgt, pre_tag, tag):
                        sen = sen[:len(p_tags)].tolist()
                        tags = tags[:len(p_tags)].tolist()
                        if c_tags == 1:
                            sents.append([sen,
                                          [tgt_field.vocab.itos[t] for t in tags],
                                          [tgt_field.vocab.itos[t] for t in p_tags]])
                    print('dev iters: {}, t:{}'.format(j, time.time() - t1))

                _, eprecision, erecall, ef1 = evaluate(sents)
                cprecision = precision_score(cy_true, cy_pred)
                crecall = recall_score(cy_true, cy_pred)
                cf1 = f1_score(cy_true, cy_pred)
                print('epoch: {} classify--> precision: {} recall: {} f1: {} best:{}'.format(
                    epoch + 1, cprecision, crecall, cf1, best_c))
                print('extractor--> precision: {} recall: {} f1: {} best: {}'.format(
                    eprecision, erecall, ef1, best_e))

                if cf1 > best_c:
                    best_c = cf1
                    best_epoch_for_c = epoch + 1
                    print('save best classifier model at epoch={}'.format(epoch + 1))
                    checkpoint = {'model': model.state_dict(),
                                  'optim': opt.state_dict(),
                                  'args': args}
                    torch.save(checkpoint, '{}/{}.classify.best.pt'.format(
                        args.model_path, args.model))
                    patience_c = 0
                else:
                    patience_c += 1

                if ef1 > best_e:
                    best_e = ef1
                    best_epoch_for_e = epoch + 1
                    print('save best extractor model at epoch={}'.format(epoch + 1))
                    checkpoint = {'model': model.state_dict(),
                                  'optim': opt.state_dict(),
                                  'args': args}
                    torch.save(checkpoint, '{}/{}.extractor.best.pt'.format(
                        args.model_path, args.model))
                    patience_e = 0
                else:
                    patience_e += 1

                if patience_c > args.patience and patience_e > args.patience:
                    print("early stop at {}".format(epoch))
                    break

        if args.decay:
            opt.param_groups[0]['lr'] = opt.param_groups[0]['lr'] * args.decay

    print('*******Done********{}'.format(
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    minutes = (time.time() - start) // 60
    if minutes < 60:
        print('best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{} mins'.format(
            best_c, best_e, best_epoch_for_c, best_epoch_for_e, minutes))
    else:
        hours = minutes / 60
        print('best_c:{}, best_e:{} best_epoch_c:{}, best_epoch_e:{}, time:{:.1f} hours'.format(
            best_c, best_e, best_epoch_for_c, best_epoch_for_e, hours))

    print('*******Testing************')
    model1 = Classify_Extractor(args, tgt_field)
    model1.cuda()
    load_from = '{}/{}.classify.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model1.load_state_dict(checkpoint['model'])

    model2 = Classify_Extractor(args, tgt_field)
    model2.cuda()
    load_from = '{}/{}.extractor.best.pt'.format(args.model_path, args.model)
    print('load the best model {}'.format(load_from))
    checkpoint = torch.load(load_from, map_location='cpu')
    print('load parameters')
    model2.load_state_dict(checkpoint['model'])

    with torch.no_grad():
        test.init_epoch()
        model1.eval()
        model2.eval()
        sents = []
        cy_true = []
        cy_pred = []
        for j, test_batch in enumerate(test):
            t1 = time.time()
            batch_src = test_batch.src
            src = [tokenizer.convert_tokens_to_ids(s) for s in batch_src]
            maxlen = max(len(s) for s in batch_src)
            src_mask = []
            padded_sents = []
            for s in src:
                padded_sents.append(s + [0] * (maxlen - len(s)))
                src_mask.append([1] * len(s) + [0] * (maxlen - len(s)))
            src = torch.tensor(padded_sents).long().cuda()   # B x T
            src_mask = torch.tensor(src_mask).byte().cuda()  # B x T

            tgt = prepare_tgt(test_batch.tgt)
            tag = test_batch.tag.squeeze(-1)

            _, pre_tag = model2.component_extraction(src, src_mask)
            pre_ctag = model1.simile_classify(src, src_mask)
            cy_true.extend(tag.tolist())
            cy_pred.extend(pre_ctag.tolist())

            # On test, extraction is gated by the predicted simile label.
            for sen, tags, p_tags, c_tags in zip(src, tgt, pre_tag, pre_ctag):
                sen = sen[:len(p_tags)].tolist()
                tags = tags[:len(p_tags)].tolist()
                if c_tags == 1:
                    sents.append([sen,
                                  [tgt_field.vocab.itos[t] for t in tags],
                                  [tgt_field.vocab.itos[t] for t in p_tags]])
                elif c_tags == 0:
                    sents.append([sen,
                                  [tgt_field.vocab.itos[t] for t in tags],
                                  ['O' for t in p_tags]])
            print('test iters: {}, t:{}'.format(j, time.time() - t1))

        _, eprecision, erecall, ef1 = evaluate(sents)
        cprecision = precision_score(cy_true, cy_pred)
        crecall = recall_score(cy_true, cy_pred)
        cf1 = f1_score(cy_true, cy_pred)
        print('Testing classify--> precision: {} recall: {} f1: {}'.format(
            cprecision, crecall, cf1))
        print('extractor--> precision: {} recall: {} f1: {}'.format(
            eprecision, erecall, ef1))
def train(self):
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    train_dataloader, eval_dataloader, train_examples, eval_examples = \
        self.create_dataloader()

    num_train_optimization_steps = self.train_steps

    # Prepare model
    config = BertConfig.from_pretrained(self.model_name_or_path,
                                        num_labels=self.num_labels)
    model = BertForSequenceClassification.from_pretrained(
        self.model_name_or_path, self.args, config=config)
    model.to(self.device)
    model.train()

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate,
                      eps=self.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps,
                                     t_total=self.train_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", self.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    best_acc = 0
    best_MRR = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    train_dataloader = cycle(train_dataloader)

    for step in range(num_train_optimization_steps):
        batch = next(train_dataloader)
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, utterance_mask, response_mask, \
            history_mask, label_ids = batch
        loss = model(input_ids=input_ids,
                     token_type_ids=segment_ids,
                     attention_mask=input_mask,
                     utterance_mask=utterance_mask,
                     response_mask=response_mask,
                     history_mask=history_mask,
                     labels=label_ids)
        tr_loss += loss.item()
        train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        loss.backward()
        if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            global_step += 1

        if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            for file in ['dev.csv']:
                inference_labels = []
                gold_labels = []
                inference_logits = []
                scores = []
                ID = [x.guid for x in eval_examples]

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.eval_batch_size)

                model.eval()
                eval_loss, eval_accuracy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, utterance_mask, \
                        response_mask, history_mask, label_ids in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    utterance_mask = utterance_mask.to(self.device)
                    response_mask = response_mask.to(self.device)
                    history_mask = history_mask.to(self.device)
                    label_ids = label_ids.to(self.device)

                    with torch.no_grad():
                        tmp_eval_loss = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask,
                                              utterance_mask=utterance_mask,
                                              response_mask=response_mask,
                                              history_mask=history_mask,
                                              labels=label_ids)
                        logits = model(input_ids=input_ids,
                                       token_type_ids=segment_ids,
                                       attention_mask=input_mask,
                                       utterance_mask=utterance_mask,
                                       response_mask=response_mask,
                                       history_mask=history_mask)

                    logits = logits.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    inference_labels.append(np.argmax(logits, axis=1))
                    scores.append(logits)
                    gold_labels.append(label_ids)
                    inference_logits.append(logits)
                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                gold_labels = np.concatenate(gold_labels, 0)
                inference_logits = np.concatenate(inference_logits, 0)
                scores = np.concatenate(scores, 0)
                model.train()
                eval_loss = eval_loss / nb_eval_steps
                eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                eval_DOUBAN_MRR, eval_DOUBAN_mrr, eval_DOUBAN_MAP, eval_Precision1 = \
                    compute_DOUBAN(ID, scores, gold_labels)
                r_at_1 = r_at_k(ID, scores, gold_labels, 1)
                r_at_2 = r_at_k(ID, scores, gold_labels, 2)
                r_at_5 = r_at_k(ID, scores, gold_labels, 5)
                print('eval_F1', eval_accuracy, 'eval_MRR', eval_DOUBAN_MRR,
                      'eval_MAP', eval_DOUBAN_MAP,
                      'eval_Precision1', eval_Precision1,
                      'r10@1', r_at_1, 'r10@2', r_at_2, 'r10@5', r_at_5,
                      'global_step', global_step, 'loss', train_loss)
                result = {'eval_loss': eval_loss,
                          'eval_F1': eval_accuracy,
                          'eval_MRR': eval_DOUBAN_MRR,
                          'eval_MAP': eval_DOUBAN_MAP,
                          'eval_Precision1': eval_Precision1,
                          'r10@1': r_at_1,
                          'r10@2': r_at_2,
                          'r10@5': r_at_5,
                          'global_step': global_step,
                          'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')

                # if eval_accuracy > best_acc:  (alternative: select on F1)
                if eval_DOUBAN_MRR > best_MRR:
                    print("=" * 80)
                    print("Best MRR", eval_DOUBAN_MRR)
                    print("Saving Model......")
                    best_MRR = eval_DOUBAN_MRR
                    # Save a trained model
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(self.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
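# `r_at_k` is assumed to compute the R_n@k retrieval metric over candidates
# grouped by query id; the exact definition lives in the repo's metrics
# module. A sketch of the common variant (fraction of a query's positives
# retrieved in the top k, averaged over queries; scores are two-column
# logits, column 1 = positive class):
from collections import defaultdict

def r_at_k(ids, scores, labels, k):
    groups = defaultdict(list)
    for i, s, l in zip(ids, scores, labels):
        groups[i].append((s[1], l))
    recalls = []
    for cands in groups.values():
        cands.sort(key=lambda x: -x[0])
        npos = sum(l for _, l in cands)
        if npos:
            recalls.append(sum(l for _, l in cands[:k]) / npos)
    return sum(recalls) / max(1, len(recalls))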
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. "
                             "Overrides num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before "
                             "performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # The loading functions below also add new tokens and embeddings, called
    # `special tokens`; these new embeddings are fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']  # legacy list from the original example (unused below)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({'cls_token': '<CLS>',
                                  'sep_token': '<SEP>',
                                  'pad_token': '<PAD>',
                                  'eos_token': '<EOS>'})
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [tokenizer.convert_tokens_to_ids(special_token)
                          for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) +
                       max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // \
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) \
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the LR schedule after the optimizer update
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
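# `accuracy` above is assumed to count argmax matches over the multiple-choice
# logits; a minimal sketch consistent with the averaging by example count:
import numpy as np

def accuracy(out, labels):
    return np.sum(np.argmax(out, axis=1) == labels)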
def train(self):
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    data_splitList = DATACQA.load_data(os.path.join(self.data_dir, 'train.csv'),
                                       n_splits=5)
    for split_index, each_data in enumerate(data_splitList):
        # Prepare model
        config = BertConfig.from_pretrained(self.model_name_or_path,
                                            num_labels=self.num_labels)
        model = BertForSequenceClassification.from_pretrained(
            self.model_name_or_path, self.args, config=config)
        model.to(self.device)

        logger.info(f'Fold {split_index + 1}')
        train_dataloader, eval_dataloader, train_examples, eval_examples = \
            self.create_dataloader(each_data)

        num_train_optimization_steps = self.train_steps

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': self.weight_decay},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate,
                          eps=self.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps,
                                         t_total=self.train_steps)

        global_step = 0
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", self.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_acc = 0
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        train_dataloader = cycle(train_dataloader)

        for step in range(num_train_optimization_steps):
            batch = next(train_dataloader)
            batch = tuple(t.to(self.device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids=input_ids,
                         token_type_ids=segment_ids,
                         attention_mask=input_mask,
                         labels=label_ids)
            tr_loss += loss.item()
            train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            loss.backward()
            if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # step the LR schedule after the optimizer update
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                logger.info("***** Report result *****")
                logger.info("  %s = %s", 'global_step', str(global_step))
                logger.info("  %s = %s", 'train loss', str(train_loss))

            if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
                for file in ['dev.csv']:
                    inference_labels = []
                    gold_labels = []
                    inference_logits = []
                    scores = []
                    questions = [x.text_a for x in eval_examples]

                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", self.eval_batch_size)

                    # Run prediction for full data
                    model.eval()
                    eval_loss, eval_accuracy = 0, 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(self.device)
                        input_mask = input_mask.to(self.device)
                        segment_ids = segment_ids.to(self.device)
                        label_ids = label_ids.to(self.device)

                        with torch.no_grad():
                            tmp_eval_loss = model(input_ids=input_ids,
                                                  token_type_ids=segment_ids,
                                                  attention_mask=input_mask,
                                                  labels=label_ids)
                            logits = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask)

                        logits = logits.detach().cpu().numpy()
                        label_ids = label_ids.to('cpu').numpy()
                        inference_labels.append(np.argmax(logits, axis=1))
                        scores.append(logits)
                        gold_labels.append(label_ids)
                        inference_logits.append(logits)
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    gold_labels = np.concatenate(gold_labels, 0)
                    inference_logits = np.concatenate(inference_logits, 0)
                    scores = np.concatenate(scores, 0)
                    model.train()
                    eval_loss = eval_loss / nb_eval_steps
                    eval_accuracy = accuracyCQA(inference_logits, gold_labels)
                    eval_mrr = compute_MRR_CQA(scores, gold_labels, questions)
                    eval_5R20 = compute_5R20(scores, gold_labels, questions)

                    result = {'eval_loss': eval_loss,
                              'eval_F1': eval_accuracy,
                              'eval_MRR': eval_mrr,
                              'eval_5R20': eval_5R20,
                              'global_step': global_step,
                              'loss': train_loss}

                    output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                    with open(output_eval_file, "a") as writer:
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))
                        writer.write('*' * 80)
                        writer.write('\n')

                    if eval_accuracy > best_acc:
                        print("=" * 80)
                        print("Best F1", eval_accuracy)
                        print("Saving Model......")
                        best_acc = eval_accuracy
                        # Save a trained model
                        model_to_save = model.module if hasattr(model, 'module') else model
                        output_model_file = os.path.join(
                            self.output_dir,
                            "pytorch_model_{}.bin".format(split_index))
                        torch.save(model_to_save.state_dict(), output_model_file)
                        print("=" * 80)
                    else:
                        print("=" * 80)

        # Free the per-fold model before the next split.
        del model
        gc.collect()
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_type", type=str, default="openai-gpt", help="model type: openai-gpt/gpt2/xlnet/...") parser.add_argument("--model_name_or_path", type=str, default="openai-gpt", help="pretrained model path") parser.add_argument("--toy", action="store_true", help="test code") parser.add_argument("--do_train", action="store_true", help="do training") parser.add_argument("--do_eval", action="store_true", help="do evaluation in the end") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--train_dataset", type=str, nargs="+", default=["data/conceptnet/train100k_CN.txt"]) parser.add_argument( "--eval_dataset", type=str, nargs="+", default=["data/conceptnet/dev1_CN.txt", "data/conceptnet/dev2_CN.txt"]) parser.add_argument("--test_dataset", type=str, nargs="+", default=["data/conceptnet/test_CN.txt"]) parser.add_argument( "--add_prefix", action="store_true", help= "add a prefix at the beginning of each input when train with multiple dataset" ) parser.add_argument("--add_separator", action="store_true", help="add <sep> between sub/rel/obj") parser.add_argument("--predict_part", type=str, default="obj", choices=["sub", "rel", "obj", "all"], help="predict which part of the triples") parser.add_argument("--max_e1", type=int, default=10) parser.add_argument("--max_r", type=int, default=5) parser.add_argument("--max_e2", type=int, default=15) parser.add_argument("--seed", type=int, default=123) parser.add_argument("--no_pretrain", action="store_true", help="w/o pretrained parameters initialized") parser.add_argument("--train_batch_size", type=int, default=32) parser.add_argument("--eval_batch_size", type=int, default=16) parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument('--logging_steps', type=int, default=250) parser.add_argument("--eval_per_steps", type=int, default=500) parser.add_argument("--num_train_epochs", type=int, default=-1) parser.add_argument( "--max_steps", default=100000, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--max_grad_norm", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--warmup_proportion", type=float, default=0.002) parser.add_argument("--lr_schedule", type=str, default="warmup_linear") parser.add_argument("--weight_decay", type=float, default=0.0) parser.add_argument("--adam_epsilon", type=float, default=1e-8) args = parser.parse_args() print(args) assert (args.predict_part == "obj" or args.model_type == "xlnet") set_seed(args.seed) n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) MODEL_CLASSES = { "gpt2": (GPT2LMHeadModel, GPT2Tokenizer, GPT2Config), "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, OpenAIGPTConfig), "xlnet": (XLNetLMHeadModel, XLNetTokenizer, XLNetConfig), } Model, Tokenizer, Config = MODEL_CLASSES[args.model_type] # load pretrained model tokenizer = Tokenizer.from_pretrained(args.model_name_or_path) # add special tokens # TODO: something feels not so right print("\nspecial tokens:", tokenizer.special_tokens_map) if not tokenizer.eos_token: tokenizer.add_special_tokens({"eos_token": "<eos>"}) if not tokenizer.sep_token: tokenizer.add_special_tokens({"sep_token": "<sep>"}) tokenizer.add_tokens(["<from_CN>", "<from_VG>", "<from_FB>"]) if args.no_pretrain: # from scratch config = Config.from_pretrained(args.model_type) model = Model(config) else: model = Model.from_pretrained(args.model_name_or_path) print("vocab size:", len(tokenizer)) model.resize_token_embeddings(len(tokenizer)) # Here is a bug: # the original HuggingFace code only resize LMHead weight but not LMHead bias, it will cause runtime error # here we manually change the size of LMHead bias in a silly way if args.model_type == "xlnet": from torch.nn.parameter import Parameter model.lm_loss.bias = Parameter(torch.Tensor(len(tokenizer))) fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out( model.lm_loss.weight) bound = 1 / math.sqrt(fan_in) torch.nn.init.uniform_(model.lm_loss.bias, -bound, bound) print("weight size:", model.lm_loss.weight.size()) print("bias size:", model.lm_loss.bias.size()) model.to(device) print("\nspecial tokens:", tokenizer.special_tokens_map) # Load and encode the datasets logger.info("Loading datasets ...") def prefix_mapping(filename): if "vg" in filename.lower(): return "<from_VG>" elif "cn" in filename.lower(): return "<from_CN>" elif "fb" in filename.lower(): return "<from_FB>" def rel_lang(filename): if "vg" in filename.lower(): return False elif "cn" in filename.lower(): return True elif "easyfb" in filename.lower(): return False elif "fb" in filename.lower(): return True train_datasets = [ load_comet_dataset( dataset_path=train_dataset, eos_token=tokenizer.eos_token, sep_token=tokenizer.sep_token, rel_lang=rel_lang(train_dataset), toy=args.toy, discard_negative=True, add_sep=args.add_separator, prefix=prefix_mapping(train_dataset) if args.add_prefix else None) for train_dataset in args.train_dataset ] eval_datasets = [ load_comet_dataset( dataset_path=eval_dataset, eos_token=tokenizer.eos_token, sep_token=tokenizer.sep_token, rel_lang=rel_lang(eval_dataset), toy=args.toy, discard_negative=True, add_sep=args.add_separator, prefix=prefix_mapping(eval_dataset) if args.add_prefix else None) for eval_dataset in args.eval_dataset ] test_datasets = [ load_comet_dataset( dataset_path=test_dataset, eos_token=tokenizer.eos_token, sep_token=tokenizer.sep_token, 
rel_lang=rel_lang(test_dataset), toy=args.toy, discard_negative=True, add_sep=args.add_separator, prefix=prefix_mapping(test_dataset) if args.add_prefix else None) for test_dataset in args.test_dataset ] train_datasets = [ data for train_dataset in train_datasets for data in train_dataset ] eval_datasets = [ data for eval_dataset in eval_datasets for data in eval_dataset ] test_datasets = [ data for test_dataset in test_datasets for data in test_dataset ] datasets = (train_datasets, eval_datasets, test_datasets) logger.info("Encoding datasets ...") encoded_datasets = tokenize_and_encode(datasets, tokenizer) max_e1 = args.max_e1 if not args.add_separator else (args.max_e1 + 1) max_r = args.max_r if not args.add_separator else (args.max_r + 1) max_e2 = args.max_e2 + 1 # always add <eos> best_loss = 1e10 # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, max_e1, max_r, max_e2, predict_part=args.predict_part) train_tensor_dataset, eval_tensor_dataset, test_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1], tensor_datasets[2] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) test_data = TensorDataset(*test_tensor_dataset) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_datasets)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info( " Each Epoch has %d steps, and %d actual steps w/ accumulation", len(train_dataloader), len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Total train batch size (w. 
accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) param_optimizer = list(model.named_parameters()) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [{ "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay }, { "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0 }] print("total steps:", t_total) num_warmup_steps = args.warmup_proportion * t_total optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=t_total) global_steps = 0 tr_loss, logging_loss = 0.0, 0.0 model.train() for cur_epoch_num in range(int(args.num_train_epochs)): print("Epoch:", cur_epoch_num) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) batch_size = len(batch) loss, logits = batch_step(model, args.model_type, batch, args.predict_part, max_e1, max_r, max_e2, args.add_prefix) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_steps += 1 if global_steps % args.logging_steps == 0: loss = (tr_loss - logging_loss) / args.logging_steps PPL = np.exp(loss) if loss < 300 else np.inf print("Step", global_steps, "Training Loss:", loss, "ppl:", PPL) logging_loss = tr_loss if global_steps % args.eval_per_steps == 0: model.eval() # evaluate eval_loss = evaluate(model, args.model_type, args.predict_part, eval_dataloader, tokenizer, max_e1, max_r, max_e2, args.add_prefix) print("\n\nevaluating\neval loss:", eval_loss, "ppl", np.exp(eval_loss) if eval_loss < 300 else np.inf) # decide to save if eval_loss < best_loss: # save save_model(model, tokenizer, args.output_dir) print("model saved at step", global_steps) print(str(datetime.datetime.now())) print("prev loss:", best_loss, "cur loss:", eval_loss) best_loss = eval_loss # test test_loss = evaluate(model, args.model_type, args.predict_part, test_dataloader, tokenizer, max_e1, max_r, max_e2, args.add_prefix) print("\n\ntesting\ntest loss:", test_loss, "ppl:", np.exp(test_loss) if test_loss < 300 else np.inf) model.train() if args.do_eval: model.eval() eval_loss = evaluate(model, args.model_type, args.predict_part, eval_dataloader, tokenizer, max_e1, max_r, max_e2, args.add_prefix) print("\n\nevaluating\neval loss:", eval_loss, "ppl", np.exp(eval_loss) if eval_loss < 300 else np.inf) test_loss = evaluate(model, args.model_type, args.predict_part, test_dataloader, tokenizer, max_e1, max_r, max_e2, args.add_prefix) print("\n\ntesting\ntest loss:", test_loss, "ppl:", np.exp(test_loss) if test_loss < 300 else np.inf)
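# For reference, WarmupLinearSchedule (pytorch-transformers) ramps the LR
# linearly from 0 over `warmup_steps`, then decays it linearly to 0 at
# `t_total`. The same shape as a plain function (a sketch of the schedule,
# not the library implementation):
def warmup_linear_lr(step, warmup_steps, t_total, base_lr):
    if step < warmup_steps:
        return base_lr * step / max(1, warmup_steps)
    return base_lr * max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))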
def train(args, train_dataset, model, tokenizer, mask_generator, training=False, meta_training=True): """ Train the model""" args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) if meta_training: args.save_history = True train_dataset.add_label(mask_generator, tokenizer, args, model) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.n_gpu > 1: model = torch.nn.DataParallel(model, args.task_devices, output_device=0) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 tr_acc, logging_acc = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Pre-Training", disable=args.local_rank not in [-1, 0]) for epoch in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): if epoch > 0: args.save_history = False else: args.save_history = True if meta_training: inputs, labels = batch else: inputs, labels = mask_generator.mask( batch, tokenizer, args, model=model.module if hasattr(model, 'module') else model) inputs = inputs.to(args.device) labels = labels.to(args.device) input_mask = ~inputs.eq(args.pad_token) model.train() _inputs = { 'input_ids': inputs, 'masked_lm_labels': labels, 'attention_mask': input_mask, } outputs = model(**_inputs) loss = outputs[ 0] # model outputs are always tuple in pytorch-transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() optimizer.zero_grad() global_step += 1 if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break # print(loss) # print("Global step: {} / Training Loss: {}".format(global_step, tr_loss / global_step)) # print("Global Accuracy: {} / Training Accuracy: {}".format(global_step, tr_acc / global_step)) if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break del optimizer del scheduler return global_step, tr_loss / global_step
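# --- Editor's note: like the function above, several loops in this file derive
# the epoch count from --max_steps as `max_steps // updates_per_epoch + 1`,
# which runs one extra epoch whenever max_steps divides evenly. Ceil division
# gives the exact count; a sketch (assumes batches_per_epoch >= accumulation):
import math

def epochs_for_max_steps(max_steps, batches_per_epoch, accumulation_steps):
    updates_per_epoch = batches_per_epoch // accumulation_steps
    return math.ceil(max_steps / updates_per_epoch)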
def Train(inputIds, attention_masks, labels, batch_size=24, epochs=10):
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        inputIds, labels, random_state=2020, test_size=0.2)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                           inputIds,
                                                           random_state=2020,
                                                           test_size=0.2)
    # Turn data into torch tensors
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)
    # Create iterators over the datasets
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)
    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           num_labels=2)
    # Find GPU or CPU first, then load the model onto it (the original called
    # model.cuda() unconditionally, which fails on CPU-only machines)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
    trainLoss = []
    valAcc = []
    for _ in trange(epochs, desc='Epoch'):
        # Train
        model.train()
        trainLoss.append(0)
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            # Forward pass and loss calculation
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            # Calculate gradients
            loss.backward()
            # Update weights using gradients
            optimizer.step()
            trainLoss[-1] += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print('\nTrain loss: {}'.format(trainLoss[-1] / nb_tr_steps))
        # Validation
        model.eval()
        nb_eval_steps = 0
        valAcc.append(0)
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            # Don't calculate gradients since we are evaluating the model
            with torch.no_grad():
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
                logits = output[0]
            # Move logits and labels off the GPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            valAcc[-1] += tmp_eval_accuracy
            nb_eval_steps += 1
        print('\nValidation Accuracy: {}\n'.format(valAcc[-1] / nb_eval_steps))
    return model, trainLoss, valAcc
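# --- Editor's note: flat_accuracy() is called in the validation loop above but
# is not defined in this file. A common implementation for [batch, num_labels]
# logits, offered as an assumption about the intended helper:
import numpy as np

def flat_accuracy(logits, labels):
    """Fraction of examples whose argmax prediction matches the gold label."""
    preds = np.argmax(logits, axis=1).flatten()
    labels = labels.flatten()
    return np.sum(preds == labels) / len(labels)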
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=3) model = BertForSequenceClassification.from_pretrained( self.model_name_or_path, self.args, config=config) model.to(self.device) data_splitList = DATABDCI.load_data(os.path.join( self.data_dir, 'train.csv'), n_splits=5) for split_index, each_data in enumerate(data_splitList): logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader( each_data) num_train_optimization_steps = self.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and ( step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) # Run prediction for full data model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) 
inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyBDCI(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join( self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( self.output_dir, "pytorch_model_{}.bin".format(split_index)) torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if self.do_test: del model gc.collect() self.do_train = False data = DATABDCI(debug=False, data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/', data_process_output= '/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/') model = BertForSequenceClassification.from_pretrained( os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = data.read_examples(os.path.join( self.data_dir, file), is_training=False) print('exa', len(eval_examples)) eval_features = data.convert_examples_to_features( eval_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracyBDCI(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(self.data_dir, file), names=['id', 'content', 'title', 'label']) predict = np.argmax(logits, axis=1).tolist() print(df.shape[0]) print(len(predict)) df['labelpre'] = predict df[['id', 'labelpre' ]].to_csv(os.path.join(self.output_dir, "sub.csv"), index=False, header=False)
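# --- Editor's note: training above saves one checkpoint per fold as
# "pytorch_model_{split_index}.bin", but the test branch loads a single
# "pytorch_model.bin", so one of the two names must be wrong. If a k-fold
# ensemble was intended, a hedged sketch of averaging per-fold logits
# (predict_fn is a hypothetical helper that loads a checkpoint and returns
# logits for the evaluation set):
import os
import numpy as np

def ensemble_fold_logits(output_dir, n_splits, predict_fn):
    fold_logits = [
        predict_fn(os.path.join(output_dir, "pytorch_model_{}.bin".format(i)))
        for i in range(n_splits)
    ]
    return np.mean(fold_logits, axis=0)  # average logits across folds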
def main(): args = parse_arguments() # ====== Set random seed ========= random.seed(args.seed) torch.random.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) # ======= Prepare ========== logging.basicConfig(level=logging.INFO) USE_CUDA = torch.cuda.is_available() FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor model, tokenizer = load_model(args) # =============== Load & process data ============== split_size = {'train': 0.85, 'test': 0.1, 'val': 0.05} data_loader, val_loader = get_data(args, split_size=split_size, tokenizer=tokenizer) # ========== Prepare optimizer ============= # the gpt2 model from library has unnamed LM head. LM head's weights are tied to input embedding num_train_optimization_steps = len( data_loader) * args.num_train_epochs // args.train_batch_size param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = construct_grouped_parameters( param_optimizer, args.learning_rate, use_discr=args.use_disc_lr) lm_funcs = get_unfreezing_funcs(optimizer_grouped_parameters, warmup_portion=args.warmup_proportion, total_steps=num_train_optimization_steps, use_unfreezing=args.use_unfreezing) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lm_funcs) # Training print("Start training.") model.train() exp_average_loss = None progress_bar = trange(int(args.num_train_epochs), desc="Epoch", leave=True) min_eval_loss = 100 # large enough number early_terminate_counter = 0 for _ in progress_bar: # for _ in range(int(args.num_train_epochs)): for sample in tqdm(data_loader): # for sample in data_loader: if args.keyword: x, type_x, pos_x, lm_x, x_len, _, keyword_x = sample else: x, type_x, pos_x, lm_x, x_len, _ = sample keyword_x = None input_len = x_len[0] lm_x[:, x_len[0] + 1 + args.first_K_tokens:-1] = -1 loss = model(x, position_ids=pos_x, token_type_ids=type_x, labels=lm_x, key_word=keyword_x, use_keyword=args.keyword)[0] loss.backward() optimizer.step() scheduler.step() optimizer.zero_grad() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) progress_bar.set_description( "Training loss: {}".format(exp_average_loss)) eval_loss = evaluate(model, val_loader, use_keyword=args.keyword) print("Eval loss: {}".format(eval_loss)) # if eval_loss < min_eval_loss: # save the model only when the loss is the smallest if True: early_terminate_counter = 0 min_eval_loss = eval_loss # ==== Save the model ==== # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_dir = '../models/' output_model_file = os.path.join(output_dir + args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir + args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir + args.output_dir) else: print("eval loss increasing!") early_terminate_counter += 1 if early_terminate_counter > 5: # if the eval loss does not decrease for 5 epochs, terminate early. return
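# --- Editor's note: the `if True:` above disables the commented-out
# `eval_loss < min_eval_loss` check, so the checkpoint is overwritten every
# epoch and early_terminate_counter can never reach 5. A sketch of the check
# the comments describe, using the same variables:
def check_improvement(eval_loss, min_eval_loss, early_terminate_counter):
    """Returns (save_now, new_min_eval_loss, new_counter)."""
    if eval_loss < min_eval_loss:
        return True, eval_loss, 0  # improved: save and reset patience
    return False, min_eval_loss, early_terminate_counter + 1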
def train(model, tra_data, dev_data, tra_word_vocab, config): optimizer = AdamW(model.parameters(), lr=config.bert_lr, correct_bias=config.correct_bias, weight_decay=config.weight_decay) tra_word_data_iter = create_batch(tra_data, tra_word_vocab, config.batch_size, config, shuffle=False) dev_word_data_iter = create_batch(dev_data, tra_word_vocab, config.dev_batch_size, config, shuffle=False) random_word_iter = data_split(tra_word_data_iter, config.n_fold) tra_word_data_iter, dev_database = database(random_word_iter, config.k, config) # Get start! global_step = 0 best_acc = 0 best_tra_acc = 0 for epoch in range(0, config.epoch): score = 0 print('\nThe epoch is starting.') epoch_start_time = time.time() batch_iter = 0 batch_num = int(len(tra_word_data_iter)) print('The epoch is :', str(epoch)) if config.use_lr_decay: optimizer = decay_learning_rate(config, optimizer, epoch) print("now word_ga lr is {}".format(optimizer.param_groups[0].get("lr")), '\n') for word_batch in tra_word_data_iter: start_time = time.time() model.train() batch_size = tra_word_data_iter[0][0].size(0) / 2 src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask, tag_matrix = word_batch[0], \ word_batch[1], \ word_batch[2], \ word_batch[3], \ word_batch[4] logit_a, logit_b = model(src_premise_matrix, src_hypothesis_matrix, p_mask, h_mask) loss, correct = tri_loss(logit_a, logit_b, config) loss = loss / config.update_every loss.backward() loss_value = loss.item() accuracy = 100.0 * int(correct) / batch_size during_time = float(time.time() - start_time) print('Step:{}, Epoch:{}, batch_iter:{}, accuracy:{:.4f}({}/{}),' 'time:{:.2f}, loss:{:.6f}'.format(global_step, epoch, batch_iter, accuracy, correct, batch_size, during_time, loss_value)) batch_iter += 1 if batch_iter % config.update_every == 0 or batch_iter == batch_num: if config.clip_max_norm_use: nn.utils.clip_grad_norm_(model.parameters(), max_norm=10) optimizer.step() optimizer.zero_grad() global_step += 1 score += correct if batch_iter % config.test_interval == 0 or batch_iter == batch_num: dev_score = evaluate(model, dev_data, dev_word_data_iter, config) if best_acc < dev_score: print('The best dev is' + str(dev_score)) best_acc = dev_score if os.path.exists(config.save_model_path): torch.save(model.state_dict(), config.bert_model_pkl) else: os.makedirs(config.save_model_path) torch.save(model.state_dict(), config.bert_model_pkl) epoch_time = float(time.time() - epoch_start_time) tra_score = 100.0 * score / len(tra_data) if tra_score > best_tra_acc: best_tra_acc = tra_score print('the best_train score is:{}({}/{})'.format(tra_score, score, len(tra_data))) print("epoch_time is:", epoch_time)
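# --- Editor's note: decay_learning_rate() is called above but not defined in
# this section. A plausible per-epoch exponential decay, assuming config
# carries a base rate `bert_lr` and a decay factor `decay` (the factor name is
# an assumption):
def decay_learning_rate(config, optimizer, epoch):
    lr = config.bert_lr * (config.decay ** epoch)  # decay once per epoch
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer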
def main(): parser = argparse.ArgumentParser() parser.add_argument("--model_name", type=str, help="pretrained_model.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_probe", action='store_true', help="Whether to probe the representation we got.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--data_dir', type=str, default= '/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/' ) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) timenow = datetime.datetime.now().strftime("%b%d%H%M") model_option = 'adv' outdir = model_option + timenow args = parser.parse_args( ['--output_dir', outdir, '--do_probe', '--num_train_epochs', '50']) #args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2']) print(args) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval and not args.do_probe: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Compute the max input length for the Transformer # Todo: Where is this used? 
input_length = 128 data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify( data_dir, input_length) # Prepare inputs tensors and dataloaders train_data = TensorDataset(*train_set) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32) eval_data = TensorDataset(*val_set) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=32) # TODO: Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset #special_tokens = ['_start_', '_delimiter_'] #special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens) # TODO: Add config config = GPT2Config(n_positions=input_length, n_ctx=input_length, n_layer=6, n_head=8, n_embd=384) config.vocab_size = dictionary.__len__() config.pos_vocab_size = pos_dictionary.__len__() if args.model_name: model = GPT2LMHeadModel.from_pretrained(args.model_name) else: model = GPT2_adverse(config=config) model.to(device) # TODO: Load and encode the datasets logger.info("Encoding dataset...") # Prepare optimizer if args.do_train: all_param = list(model.named_parameters()) param_optimizer = [(n, p) for n, p in all_param if 'pos_head_adv' not in n] param_optimizer_adv = [(n, p) for n, p in all_param if 'pos_head_adv' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer_adv_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader) * args.num_train_epochs optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) #t_total=num_train_optimization_steps) optimizer_adv = AdamW( optimizer_adv_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) if args.do_train: train_results = {} nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): ###eval on eval set model.eval() nb_eval_steps, nb_eval_examples = 0, 0 perp = 0 average_loss = np.asanyarray([0, 0, 0, 0], dtype='float') for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): #breakpoint() loss = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy() loss_syn = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy() loss_sem = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[2].detach().cpu().numpy() loss_lm = model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[3].detach().cpu().numpy() perp_batch = np.exp(loss_lm) perp += perp_batch average_loss += np.asanyarray( [loss, loss_syn, loss_sem, loss_lm]) nb_eval_steps += 1 perp /= nb_eval_steps average_loss /= 
nb_eval_steps print('loss,loss_syn,loss_sem,loss_lm', average_loss, 'perp ', perp, 'epoch ', epoch) train_results[epoch] = (perp, average_loss) model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch loss = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0] loss_lm = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[3] loss_sem = model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[2] #breakpoint() #loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() loss_sem.backward() optimizer_adv.step() optimizer_adv.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} sem: {:.2e} lm: {:.2e}".format( exp_average_loss, loss_sem.item(), loss_lm.item()) print(train_results) # Save a trained model if args.do_train: all_param = list(model.named_parameters()) param_optimizer = [(n, p) for n, p in all_param if 'pos_head_adv' not in n] param_optimizer_adv = [(n, p) for n, p in all_param if 'pos_head_adv' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer_adv_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer_adv if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer_adv if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader) * args.num_train_epochs optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) #t_total=num_train_optimization_steps) optimizer_adv = AdamW( optimizer_adv_grouped_parameters, lr=args.learning_rate, #max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) #tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = GPT2LMHeadModel.from_pretrained(args.output_dir) #tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() nb_eval_steps, nb_eval_examples = 0, 0 log_probs_sum = 0 perp = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): loss = model(input_ids, labels=input_ids)[0].detach().cpu().numpy() perp_batch = np.exp(loss) perp += perp_batch nb_eval_steps += 1 perp /= nb_eval_steps # perp_word = perp / 128 print(perp) result = {'eval_perp': perp} logger.info("***** Eval results *****") logger.info("'eval_perp' = %s", str(result['eval_perp'])) if args.do_probe: 
##load model (how???) model_path = '/home/xiongyi/dataxyz/repos/pytorch-pretrained-BERT/examples/advJul232307/pytorch_model.bin' model.load_state_dict(torch.load(model_path)) ##Add a mlp to the representation probe_model = ProbeModel(model, config) probe_model.to(device) ##train and eval all_param = list(probe_model.named_parameters()) param_probe = [(n, p) for n, p in all_param if 'probe_cls' in n] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_probe if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_probe if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, # max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay) # t_total=num_train_optimization_steps) train_results = {} nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None for epoch in trange(int(args.num_train_epochs), desc="Epoch"): ###eval on eval set probe_model.eval() nb_eval_steps, nb_eval_examples = 0, 0 average_loss = 0 average_acc = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch with torch.no_grad(): #breakpoint() loss = probe_model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy() pos_logits = probe_model( input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy() predicted_labels = np.argmax(pos_logits, -1) correct_rate = np.mean(predicted_labels == input_pos_ids. detach().cpu().numpy()[:, 1:]) average_acc += correct_rate average_loss += loss nb_eval_steps += 1 average_loss /= nb_eval_steps ##TODO Hard CODED! average_acc /= nb_eval_steps print('loss', average_loss, ' acc_rate ', average_acc, ' epoch ', epoch) train_results[epoch] = (average_loss, average_acc) probe_model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_pos_ids = batch loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0] # breakpoint() # loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e}".format( exp_average_loss)
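# --- Editor's note: ProbeModel is instantiated above without a definition, and
# the second do_train block repeats the optimizer construction verbatim (dead
# code). A minimal linear-probe sketch matching the call signature used above;
# it assumes the wrapped LM exposes per-token hidden states as its last output,
# which may not match GPT2_adverse exactly:
import torch
import torch.nn as nn

class ProbeModel(nn.Module):
    """Linear POS probe on top of a frozen language model."""

    def __init__(self, lm, config):
        super().__init__()
        self.lm = lm
        for p in self.lm.parameters():  # freeze the representation
            p.requires_grad = False
        self.probe_cls = nn.Linear(config.n_embd, config.pos_vocab_size)

    def forward(self, input_ids, labels=None, pos_ids=None):
        # `labels` is unused; kept so the call sites above stay unchanged.
        hidden = self.lm(input_ids)[-1]          # assumed hidden-state output
        logits = self.probe_cls(hidden[:, :-1])  # predict the next token's POS
        loss = None
        if pos_ids is not None:
            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)), pos_ids[:, 1:].reshape(-1))
        return loss, logits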
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='gpt2-medium', help='pretrained model name') parser.add_argument("--do_train", action='store_true', default=True, help="Whether to run training.") parser.add_argument( "--output_dir", default='fintuned_gpt', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--dataset', type=str, default='', required=True) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--opt_level', type=str, default='O1') parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=8) parser.add_argument('--num_prior', type=int, default=2) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset. 
# start_token, delimiter_token, clf_token special_tokens_dict = { 'cls_token': '<|cls|>', 'unk_token': '<|unk|>', 'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'sep_token': '<|endoftext|>' } tokenizer = GPT2Tokenizer.from_pretrained(args.model_name) num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) print('We have added', num_added_toks, 'tokens') #start_token, delimiter_token, clf_token special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in ['<|endoftext|>', '<|endoftext|>', '<|cls|>']) model = GPT2DoubleHeadsModel.from_pretrained(args.model_name) model.resize_token_embeddings(len(tokenizer)) model.to(device) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior) eval_dataset = load_dataset(tokenizer, args.dataset, num_prior=args.num_prior) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps //\ (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader)\ // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, verbosity=1) nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for i, _ in enumerate(range(int(args.num_train_epochs))): print('Starting Epoch: {} of {}'.format( str(i + 1), str(int(args.num_train_epochs)))) tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, 
mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, scheduler.get_lr()[0]) if torch.cuda.is_available(): torch.cuda.empty_cache() # Save a trained model # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir) tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir) model.to(device)
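# --- Editor's note: this loop also runs scheduler.step() before
# optimizer.step(); optimizer first is the conventional order. With apex amp,
# the accumulated update then looks like this sketch (same names as above):
import torch
from apex import amp

def amp_update(loss, model, optimizer, scheduler, args):
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                   args.max_grad_norm)
    optimizer.step()    # weights first
    scheduler.step()    # then the LR schedule
    optimizer.zero_grad()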
class Trainer:
    def __init__(self, args, config, model, criterion, train_dataloader,
                 valid_dataloader, logger, save_path, tb_writer):
        self.args = args
        self.config = config
        self.model = model
        self.criterion = criterion
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.logger = logger
        self.save_path = save_path
        self.tb_writer = tb_writer
        self.t_total = len(self.train_dataloader) * self.args.epoch
        self.device = self.config.device
        self.optimizer = AdamW(self.get_model_parameters(),
                               lr=self.config.learning_rate)
        self.scheduler = WarmupLinearSchedule(self.optimizer,
                                              0.1 * self.t_total,
                                              self.t_total)
        self.global_step = 0
        self.best_eval_acc = 0.2

    def get_model_parameters(self):
        # Optimizer & Loss
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        return optimizer_grouped_parameters

    def train(self, do_eval=True, do_save=True):
        for epoch in range(self.args.epoch):
            self.train_epoch(epoch)
            self.evaluation(epoch)
            self.write_to_tb()
            self.save_model(epoch)
        self.tb_writer.close()

    def transform_to_bert_input(self, batch):
        input_ids, valid_length, token_type_ids = batch[0], batch[1], batch[2]
        input_ids = torch.from_numpy(input_ids).to(self.device)
        valid_length = valid_length.clone().detach().to(self.device)
        token_type_ids = torch.tensor(token_type_ids).long().to(self.device)
        return input_ids, valid_length, token_type_ids

    def compute_acc(self, y_hat, y, mean=True):
        # Compute predictions before branching; the original assigned `yhat`
        # only in the mean branch, so mean=False raised a NameError.
        yhat = y_hat.max(dim=-1)[1]  # [0]: max value, [1]: index of max value
        if mean:
            acc = (yhat == y).float().mean()  # padding is excluded from acc
            return acc
        else:
            correct_count = (yhat == y).long().sum()
            return correct_count

    def train_epoch(self, epoch):
        self.model.to(self.device)
        self.model.train()
        tr_correct_cnt, tr_total_cnt = 0, 0
        tr_loss = 0.0
        train_loader = self.train_dataloader
        for step, batch in enumerate(train_loader):
            self.model.zero_grad()
            sent1 = batch['sent1']
            input_1, valid_length_1, token_type_1 = self.transform_to_bert_input(
                sent1)
            embed1 = self.model(input_1, valid_length_1, token_type_1)
            sent2 = batch['sent2']
            input_2, valid_length_2, token_type_2 = self.transform_to_bert_input(
                sent2)
            embed2 = self.model(input_2, valid_length_2, token_type_2)
            label = batch['label']
            label = torch.tensor(label).long().to(self.device)
            pred = self.model.get_logit(embed1, embed2)
            loss = self.criterion(pred, label.view(-1))
            tr_loss += loss.item()
            loss.backward()
            if step > 0 and (
                    step) % self.config.gradient_accumulation_steps == 0:
                self.global_step += self.config.gradient_accumulation_steps
                self.optimizer.step()
                self.optimizer.zero_grad()
                self.scheduler.step()
                with torch.no_grad():
                    accuracy = self.compute_acc(pred, label)
                self.tr_acc = accuracy.item()
                # Average over this epoch's batches; the original divided the
                # per-epoch loss by the cross-epoch global step count.
                self.tr_avg_loss = tr_loss / (step + 1)
                if self.global_step % 100 == 0:
                    self.logger.info(
                        'epoch : {} /{}, global_step : {} /{}, tr_avg_loss: {:.3f}, tr_acc: {:.2%}'
                        .format(epoch + 1, self.args.epoch, self.global_step,
                                self.t_total, self.tr_avg_loss, self.tr_acc))

    def evaluation(self, epoch):
        self.model.eval()
        eval_correct_cnt, eval_total_cnt = 0, 0
        eval_loss = 0.0
        eval_acc = 0.0
        eval_step = 0.0  # start at 0; starting at 1 divided by one step too many
        self.logger.info('*****************Evaluation*****************')
        valid_loader = tqdm(self.valid_dataloader)
        for step, batch in enumerate(valid_loader):
            with torch.no_grad():
                sent1 = batch['sent1']
                input_1, valid_length_1, token_type_1 = self.transform_to_bert_input(
                    sent1)
                embed1 = self.model(input_1, valid_length_1, token_type_1)
                sent2 = batch['sent2']
                input_2, valid_length_2, token_type_2 = self.transform_to_bert_input(
                    sent2)
                embed2 = self.model(input_2, valid_length_2, token_type_2)
                label = batch['label']
                label = torch.tensor(label).long().to(self.device)
                pred = self.model.get_logit(embed1, embed2)
                loss = self.criterion(pred, label.view(-1))
                eval_loss += loss.item()
                acc = self.compute_acc(pred, label)
                eval_acc += acc.item()
                eval_step += 1.0
        self.eval_avg_loss = eval_loss / eval_step
        self.eval_avg_acc = eval_acc / eval_step
        self.logger.info(
            'epoch : {} /{}, global_step : {} /{}, eval_loss: {:.3f}, eval_acc: {:.2%}'
            .format(epoch + 1, self.args.epoch, self.global_step, self.t_total,
                    self.eval_avg_loss, self.eval_avg_acc))

    def save_model(self, epoch):
        if self.eval_avg_acc > self.best_eval_acc:
            self.best_eval_acc = self.eval_avg_acc
            self.model.to(torch.device('cpu'))
            state = {
                'epoch': epoch + 1,
                'model_state_dict': self.model.state_dict(),
                'opt_state_dict': self.optimizer.state_dict()
            }
            save_model_path = '{}/epoch_{}_step_{}_tr_acc_{:.3f}_tr_loss_{:.3f}_eval_acc_{:.3f}_eval_loss_{:.3f}.pt'.format(
                self.save_path, epoch + 1, self.global_step, self.tr_acc,
                self.tr_avg_loss, self.eval_avg_acc, self.eval_avg_loss)
            # Delete the previous checkpoint
            if len(glob.glob(self.save_path + '/epoch*.pt')) > 0:
                os.remove(glob.glob(self.save_path + '/epoch*.pt')[0])
            torch.save(state, save_model_path)
            self.logger.info(' Model saved to {}'.format(save_model_path))
            os.mkdir(self.save_path +
                     '/epoch_{}_eval_loss_{:.3f}_eval_acc_{:.3f}'.format(
                         epoch + 1, self.eval_avg_loss, self.eval_avg_acc))
            self.model.to(self.device)  # move back for the next epoch

    def write_to_tb(self):
        self.tb_writer.add_scalars('loss', {
            'train': self.tr_avg_loss,
            'val': self.eval_avg_loss
        }, self.global_step)
        self.tb_writer.add_scalars('acc', {
            'train': self.tr_acc,
            'val': self.eval_avg_acc
        }, self.global_step)
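# --- Editor's note: save_model() deletes the previous checkpoint before the
# new file exists, so a crash inside torch.save() loses both. Writing first and
# pruning afterwards is safer; a sketch with the same state-dict layout:
import glob
import os
import torch

def save_checkpoint(state, save_path, filename):
    new_path = os.path.join(save_path, filename)
    torch.save(state, new_path)  # write the new checkpoint first
    for old in glob.glob(os.path.join(save_path, 'epoch*.pt')):
        if old != new_path:
            os.remove(old)       # then prune stale checkpoints
    return new_path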
def train(**kwargs): # kwargs.update({'model': 'CNN'}) opt.parse(kwargs) if (opt.use_gpu): torch.cuda.set_device(opt.gpu_id) if opt.encoder == 'BERT': encoder_model = BertForSequenceClassification.from_pretrained( "./downloaded_weights/downloaded_bert_base_uncased", num_labels=opt.rel_num) # print(encoder_model) opt.encoder_out_dimension = opt.rel_num else: encoder_model = getattr(encoder_models, opt.encoder)(opt) opt.encoder_out_dimension = encoder_model.out_dimension selector_model = getattr(selector_models, opt.selector)(opt) # encoder_model = torch.nn.DataParallel(encoder_model, device_ids=[3,6]) if (opt.use_gpu): encoder_model = encoder_model.cuda() selector_model = selector_model.cuda() # Loading data DataModel = getattr(dataset, opt.data + 'Data') train_data = DataModel(opt.data_root, train=True, use_bert=opt.use_bert_tokenizer) train_data_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers, collate_fn=collate_fn) print('train data: {}'.format(len(train_data))) test_data = DataModel(opt.data_root, train=False, use_bert=opt.use_bert_tokenizer) test_data_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, collate_fn=collate_fn) print('test data: {}'.format(len(test_data))) criterion = nn.CrossEntropyLoss() if opt.encoder == 'BERT': optimizer = AdamW( [{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr, correct_bias=True ) # To reproduce BertAdam specific behavior set correct_bias=False else: optimizer = optim.Adadelta([{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr, rho=1.0, eps=1e-6, weight_decay=opt.weight_decay) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=2, t_total=3) # PyTorch scheduler ### and used like this: # for batch in train_data: # loss = model(batch) # loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue) # optimizer.zero_grad() # if opt.encoder == "BERT" and False: # optimizer = optim.SGD([ # {'params': selector_model.parameters()} # ], lr=opt.lr) # else: optimizer = optim.SGD([{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr) max_pre = 0.0 max_rec = 0.0 for epoch in range(opt.num_epochs): # if opt.encoder == "BERT": encoder_model.train() selector_model.train() print("*" * 50) print("Epoch {}".format(epoch)) total_loss = 0 max_insNum = 0 for batch_num, (data, label_set) in enumerate(train_data_loader): # if (batch_num>2000): # break # label_set is the label of each bag (there may be no more than 4 labels, but we only wants the first) labels = [] outs = torch.empty([0, 53]) empty = True # if all labels of bags in one batch are zeros, then it's empty, continue to avoid error for l in label_set: if (l[0] != 0): labels.append(l[0]) empty = False if empty: continue # labels = [l[0] for l in label_set] # Each time enters {batch_size} bags # Each time I want one bag!! # The model need to give me a representation of an instance!!! 
if opt.use_gpu: labels = torch.LongTensor(labels).cuda() outs = outs.cuda() else: labels = torch.LongTensor(labels) optimizer.zero_grad() train_cor = 0 for idx, bag in enumerate(data): insNum = bag[1] # if insNum > max_insNum: # max_insNum = insNum # print(max_insNum) label = label_set[idx][0] # Label of the current bag if (label_set[idx][0] == 0): continue ins_outs = torch.empty(0) instances = bag[2] pf_list = [] mask_list = [] if opt.encoder != 'BERT': pf_list = bag[3] mask_list = bag[5] # pf_list = bag[3] ins_out = torch.empty(0) encoder_model.batch_size = insNum if opt.use_gpu: instances = torch.LongTensor(instances).cuda() if opt.encoder == 'BERT': # with torch.no_grad(): # print(instances.size(0)) if insNum > opt.max_sentence_in_bag: ins_outs = encoder_model( instances[:opt.max_sentence_in_bag])[0] else: ins_outs = encoder_model(instances)[0] # ins_outs = ins_outs[0] # print(ins_outs[0].size()) else: for idx, instance in enumerate(instances): if opt.use_gpu: pfs = torch.LongTensor(pf_list[idx]).cuda() masks = torch.LongTensor(mask_list[idx]).cuda() else: pfs = torch.LongTensor(pf_list[idx]) masks = torch.LongTensor(mask_list[idx]) if opt.encoder == 'PCNN': ins_out = encoder_model(instance, pfs, masks) else: ins_out = encoder_model(instance, pfs) if (opt.use_gpu): ins_out = ins_out.cuda() ins_outs = ins_outs.cuda() ins_outs = torch.cat((ins_outs, ins_out), 0) del instance, ins_out if idx >= opt.max_sentence_in_bag: break bag_feature = selector_model(ins_outs) if opt.use_gpu: bag_feature = bag_feature.cuda() if (torch.max(bag_feature.squeeze(), 0)[1] == label): train_cor += 1 outs = torch.cat((outs, bag_feature), 0) del ins_outs, bag_feature # outs = outs.squeeze() # print("outs.size(): ", outs.size(), '\n', "labels.size(): ", labels.size()) # print(outs,labels) loss = criterion(outs, labels) total_loss += loss.item() avg_loss = total_loss / (batch_num + 1) sys.stdout.write( "\rbatch number: {:6d}\tloss: {:7.4f}\ttrain_acc: {:7.2f}\t". format(batch_num, avg_loss, train_cor / len(labels))) sys.stdout.flush() # sys.stdout.write('\033') loss.backward() if opt.encoder == 'BERT': scheduler.step() optimizer.step() del outs, labels if (opt.skip_predict != True): with torch.no_grad(): predict(encoder_model, selector_model, test_data_loader) t = time.strftime('%m_%d_%H_%M.pth') torch.save(encoder_model.state_dict(), 'checkpoints/{}_{}'.format(opt.encoder, t)) torch.save(selector_model.state_dict(), 'checkpoints/{}_{}'.format(opt.selector, t))
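# --- Editor's note: the SGD constructed after the commented-out block above
# unconditionally replaces the AdamW/Adadelta optimizer chosen earlier, and the
# scheduler is built with placeholder values (warmup_steps=2, t_total=3). If a
# real warmup schedule was intended, the totals follow from the loader; a
# sketch (warmup_ratio is an assumed hyperparameter):
def schedule_totals(train_data_loader, num_epochs, warmup_ratio=0.1):
    t_total = len(train_data_loader) * num_epochs
    return int(warmup_ratio * t_total), t_total  # (warmup_steps, t_total)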
def main(): parser = argparse.ArgumentParser() parser = add_xlmr_args(parser) parser.add_argument('--self_training', action='store_true', default=False) parser.add_argument('--unlabeled_data_dir', type=str, default='data/unlabeled_data') parser.add_argument('--self_training_confidence', type=float, default=0.9) parser.add_argument('--K', type=float, default=50) parser.add_argument('--patience', type=float, default=10) args = parser.parse_args() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) data_processor = SequenceLabelingProcessor(task=args.task_name) label_list = data_processor.get_labels() num_labels = len(label_list) + 1 # add one for IGNORE label train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = data_processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # preparing model configs hidden_size = 768 if 'base' in args.pretrained_path else 1024 # TODO: move this inside model.__init__ device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu' if args.use_crf: model_cls = XLMRForTokenClassificationWithCRF else: model_cls = XLMRForTokenClassification # creating model model = model_cls(pretrained_path=args.pretrained_path, n_labels=num_labels, hidden_size=hidden_size, dropout_p=args.dropout, device=device) model.to(device) if args.load_model is not None: logging.info("Loading saved model {}".format(args.load_model)) state_dict = torch.load(args.load_model) model.load_state_dict(state_dict, strict=True) no_decay = ['bias', 'final_layer_norm.weight'] params = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) # freeze model if necessary if args.freeze_model: logger.info("Freezing XLM-R model...") for n, p in model.named_parameters(): if 'xlmr' in n and p.requires_grad: p.requires_grad = False if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
            )
        # NOTE (editor): the original called amp.initialize(model, optimizer)
        # here, before `optimizer` exists; mixed-precision wrapping is deferred
        # until after AdamW is constructed below.

    label_map = {i: label for i, label in enumerate(label_list, 1)}
    if args.do_train:
        train_features = data_processor.convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, model.encode_word)
        if args.self_training:
            self_training_examples = data_processor.get_unlabeled_examples(
                args.unlabeled_data_dir)
            self_training_features = data_processor.convert_examples_to_features(
                self_training_examples, label_list, args.max_seq_length,
                model.encode_word)
            logging.info("Loaded {} Unlabeled examples".format(
                len(self_training_examples)))
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        train_data = create_ner_dataset(train_features)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        val_examples = data_processor.get_dev_examples(args.data_dir)
        val_features = data_processor.convert_examples_to_features(
            val_examples, label_list, args.max_seq_length, model.encode_word)
        val_data = create_ner_dataset(val_features)
        best_val_f1 = 0.0

        ############################# Self Training Loop ######################
        n_iter = 0
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=warmup_steps,
                                         t_total=num_train_optimization_steps)
        if args.fp16:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
        patience = 0
        while 1:
            ############################ Inner Training Loop ###################
            n_iter += 1
            print(len(train_dataloader))
            for epoch_ in tqdm(range(args.num_train_epochs),
                               desc="Epoch",
                               disable=args.no_pbar):
                tr_loss = 0
                tbar = tqdm(train_dataloader,
                            desc="Iteration",
                            disable=args.no_pbar)
                model.train()
                for step, batch in enumerate(tbar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, label_ids, l_mask, valid_ids = batch
                    loss, _ = model(input_ids,
                                    label_ids,
                                    l_mask,
                                    valid_ids,
                                    get_sent_repr=True)
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    tr_loss += loss.item()
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer), args.max_grad_norm)
                    else:
                        # The original called optimizer.zero_grad() here,
                        # before backward(), which wiped the gradients being
                        # accumulated across steps.
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args.max_grad_norm)
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        scheduler.step()  # Update learning rate schedule
                        model.zero_grad()
                    tbar.set_description('Loss = %.4f' % (tr_loss / (step + 1)))
                logger.info("Evaluating on validation set...\n")
                f1, report = evaluate_model_seq_labeling(
                    model, val_data, label_list, args.eval_batch_size,
                    args.use_crf, device)
                if f1 > best_val_f1:
                    best_val_f1 = f1
                    logger.info(
                        "\nFound better f1=%.4f on validation set. Saving model\n"
                        % (f1))
                    logger.info("\n%s\n" % (report))
                    torch.save(
                        model.state_dict(),
                        open(os.path.join(args.output_dir, 'model.pt'), 'wb'))
                    patience = 0
                else:
                    logger.info("\nNo better F1 score: {}\n".format(f1))
                    patience += 1
            ######################################################################
            if not args.self_training:
                break
            if patience >= args.patience:
                logger.info("No more patience. Exiting")
                break
            ## get confidence and update train_data, train_dataloader
            # convert unlabeled examples to features
            if len(self_training_features) <= 0:
                # no more self-training data
                break
            confident_features, self_training_features = get_top_confidence_samples_seq_labeling(
                model,
                self_training_features,
                batch_size=args.eval_batch_size,
                K=args.K)
            logging.info("Got %d confident samples" % (len(confident_features)))
            # append new features
            train_features.extend(confident_features)
            print("now we have %d total examples" % len(train_features))
            train_data = create_ner_dataset(train_features)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data,
                                          sampler=train_sampler,
                                          batch_size=args.train_batch_size)
            # reset the learning-rate schedule for the next round
            for g in optimizer.param_groups:
                g['lr'] = args.learning_rate
            scheduler.step(0)
        # load best/saved model
        state_dict = torch.load(
            open(os.path.join(args.output_dir, 'model.pt'), 'rb'))
        model.load_state_dict(state_dict)
        logger.info("Loaded saved model")
        model.to(device)
    if args.do_eval:
        if args.eval_on == "dev":
            eval_examples = data_processor.get_dev_examples(args.data_dir)
        elif args.eval_on == "test":
            eval_examples = data_processor.get_test_examples(args.data_dir)
        else:
            raise ValueError("eval on dev or test set only")
        eval_features = data_processor.convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, model.encode_word)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_data = create_ner_dataset(eval_features)
        f1_score, report = evaluate_model_seq_labeling(model, eval_data,
                                                       label_list,
                                                       args.eval_batch_size,
                                                       args.use_crf, device)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        logger.info("dataset = {}".format(args.data_dir))
        logger.info("model = {}".format(args.output_dir))
        with open(output_eval_file, "w") as writer:
            logger.info("***** Writing results to file *****")
            writer.write(report)
            logger.info("Done.")
def main(): parser = argparse.ArgumentParser() ## Required parameters (arguments with required=True must be supplied on the command line) parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type (here: bert) selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "Path to the downloaded pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help= "Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "Pretrained tokenizer name or path if not the same as model_name" ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run testing.") parser.add_argument("--predict_eval", action='store_true', help="Whether to predict the eval set.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." ) parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument( "--max_grad_norm", default=1.0, type=float, help= "If the L2 norm of the vector of all parameter gradients exceeds max_norm, gradients are rescaled by max_norm / L2 norm so the clipped norm stays below the preset value. Max gradient norm." ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="
Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument( "--warmup_steps", default=0, type=int, help="线性warmup的steps. Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="测试集划分. text split") parser.add_argument('--logging_steps', type=int, default=50, help="日志更新steps. Log every X updates steps.") parser.add_argument( '--save_steps', type=int, default=50, help="断点文件保存steps. Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="不用cuda. Avoid using CUDA when available") parser.add_argument( '--overwrite_output_dir', action='store_true', help="重写输出路径. Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="初始化用的随机种子. random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="为了分布式训练. For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="远程debug用的ip. For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="远程debug用的端口. For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="冻结BERT. 
freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: # 如果无指定GPU或允许使用CUDA,就使用当前所有GPU device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # 指定使用哪个GPU(local_rank代表当前程序进程使用的GPU标号) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging 初始化日志 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed 设置种子数 set_seed(args) # 创建存放路径 try: os.makedirs(args.output_dir) except: pass # 载入预训练好的BERT分词器 tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # 载入预设好的BERT配置文件 config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) # Prepare model 载入并配置好基于BERT的序列分类模型 model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) # 开启FP16 if args.fp16: model.half() model.to(device) # 如果是指定了单个GPU,用DistributedDataParallel进行GPU训练 if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) # with multiple GPUs, torch.nn.DataParallel automatically uses every available GPU elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # total batch size = number of GPUs * per-GPU batch size args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader: read the data and build model-ready inputs train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # random sampling on a single machine, distributed sampling when a GPU rank is set if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) # build the dataloader train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) # number of training steps num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produces None grads that break apex param_optimizer = [n for n in param_optimizer] # parameters listed in no_decay are exempt from weight decay # BatchNorm normalizes each channel C over [B, H, W] (to zero mean, unit variance) and suits CNNs; # LayerNorm normalizes each sample N over [C, H, W] and suits RNNs: BatchNorm assumes a fixed network depth, which variable-length sequences break, while LayerNorm depends on neither the batch size nor the sequence depth, so it also works with batch size 1 and variable-length inputs no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # set up the optimizer and the warmup schedule optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps // args.gradient_accumulation_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # cycle through the dataloader indefinitely # run one evaluation pass before training starts for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data 准备验证集的dataloader eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # 开启预测模式(不用dropout和BN) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # 将数据放在GPU上 input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) # 禁止进行梯度更新 with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps # 计算验证集的预测损失 eval_accuracy = accuracy(inference_logits, gold_labels) # 计算验证集的预测准确性 result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } # 将验证集的预测评价写入到evel_results.txt中 output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') # 如果当前训练的模型表现最佳,则保存该模型 if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() # 分batch循环迭代训练模型 for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 loss.backward() # apply the accumulated gradients every args.gradient_accumulation_steps batches if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # apply the gradients scheduler.step() # advance the learning-rate schedule (WarmupLinearSchedule already implements BERT's warmup, so no manual lr adjustment is needed) optimizer.zero_grad() # clear the gradients so they do not carry over global_step += 1 # every args.eval_steps * args.gradient_accumulation_steps steps, report training progress if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # every args.eval_steps * args.gradient_accumulation_steps steps, evaluate on the dev set if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and ( step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = {
'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) # predict on the test sets if args.do_test: del model gc.collect() # free memory args.do_train = False # stop training # load the best saved model model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: # nn.Module.half() casts the model's float32 weights to float16 model.half() model.to(device) # move the model onto the GPU # set up multi-GPU prediction if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # predict on the dev and test sets for file, flag in [('dev.csv', 'dev'), ('CSC_test.csv', 'CSC_test'), ('NS_test.csv', 'NS_test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) # write the prediction files if flag == 'CSC_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_CSC.csv"), index=False) if flag == 'NS_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_NS.csv"),
index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) # predict only on the dev set if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
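# Every script in this collection builds the same two AdamW parameter groups so that
# biases and LayerNorm weights are exempt from weight decay. A self-contained sketch of
# the pattern; the tiny encoder layer is just a stand-in model, and the substrings in
# no_decay are adjusted to its parameter names (BERT scripts match 'LayerNorm' instead).
import torch.nn as nn
from torch.optim import AdamW

model = nn.TransformerEncoderLayer(d_model=32, nhead=4)
no_decay = ("bias", "norm")  # substring match against parameter names
grouped = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},   # regular weights get decay
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},    # biases and norm parameters do not
]
optimizer = AdamW(grouped, lr=5e-5, eps=1e-8)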
def train(self, dataloader: DataLoader, train_config: TrainConfig): """ Train the model with the given data and config :param dataloader: the data for the training :param train_config: the configuration for the training """ if train_config.output_path is not None: os.makedirs(train_config.output_path, exist_ok=True) if os.listdir(train_config.output_path): raise ValueError("Output directory ({}) already exists and is not empty.".format( train_config.output_path)) self.save(train_config.output_path, save_config=True, save_model=False) self.best_score = -9999 num_train_steps = int(len(dataloader) / train_config.gradient_accumulation_steps * train_config.epochs) # Prepare optimizer param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': train_config.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if train_config.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate, eps=train_config.adam_epsilon, correct_bias=train_config.correct_bias) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_config.warmup_steps, t_total=t_total) if train_config.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(self.model, optimizer, opt_level=train_config.fp16_opt_level) global_step = 0 for epoch in trange(train_config.epochs, desc="Epoch"): training_steps = 0 self.model.train() for step, batch in enumerate(tqdm(dataloader, desc="Iteration")): batch = batch_to_device(batch, self.device) input_ids, segment_ids, input_masks, label_ids = batch loss = self.model(input_ids, segment_ids, input_masks, label_ids) if train_config.gradient_accumulation_steps > 1: loss = loss / train_config.gradient_accumulation_steps if train_config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), train_config.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), train_config.max_grad_norm) training_steps += 1 if (step + 1) % train_config.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if train_config.evaluation_steps > 0 and training_steps % train_config.evaluation_steps == 0: self._eval_during_training(train_config, epoch, training_steps) self.model.train() self._eval_during_training(train_config, epoch, -1)
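# WarmupLinearSchedule comes from the old pytorch-transformers package: the learning
# rate ramps linearly from 0 over warmup_steps, then decays linearly to 0 at t_total.
# To my understanding, the modern transformers equivalent is
# get_linear_schedule_with_warmup; a minimal runnable sketch:
import torch
from transformers import get_linear_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)
for _ in range(1000):
    optimizer.step()   # recent PyTorch expects the optimizer step first...
    scheduler.step()   # ...then the scheduler step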
def train(self): model = BertForSequenceClassification.from_pretrained( self.args.model_name_or_path, self.args, config=self.config) model.to(self.device) logger.info('Preparing data') data = DATABDCI( debug=False, data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/', data_process_output='/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/') train_examples = data.read_examples( os.path.join(self.data_process_output, 'train.csv')) train_features = data.convert_examples_to_features( train_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # reshuffle the training set on every pass train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=self.batch_size // self.gradient_accumulation_steps) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produces None grads that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': self.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) best_acc = 0 global_step = 0 model.train() tr_loss = 0 train_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(self.train_steps), total=self.train_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # tr_loss accumulates the raw loss; train_loss is the running average shown in the bar tr_loss += loss.item() train_loss = round( tr_loss * self.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if self.do_eval and (step + 1) % ( self.eval_steps * self.gradient_accumulation_steps) == 0: inference_labels = [] scores = [] gold_labels = [] inference_logits = [] eval_examples = data.read_examples( os.path.join(self.data_process_output, 'dev.csv')) eval_features = data.convert_examples_to_features( eval_examples, self.tokenizer, self.max_seq_length) ID1 = [x.sentence_ID1 for x in eval_examples] ID2 = [x.sentence_ID2 for x in eval_examples] all_input_ids = torch.tensor(data.select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids,
all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=self.batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 count = 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size] # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size] input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) # scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) # scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, # 'mrr':eval_mrr, 'loss': train_loss } output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
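# Several of these loops fold in gradient accumulation: the loss is divided by the
# accumulation factor and the optimizer only steps every `accum` batches. A minimal
# skeleton of the pattern (model, loader, optimizer and scheduler are assumed to exist
# as in the surrounding scripts). Note that some snippets above call scheduler.step()
# before optimizer.step(); PyTorch >= 1.1 warns about that order, so this sketch steps
# the optimizer first.
accum = 4
optimizer.zero_grad()
for step, (inputs, labels) in enumerate(loader):
    loss = model(inputs, labels=labels)[0]
    (loss / accum).backward()   # scale so accumulated grads average rather than sum
    if (step + 1) % accum == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()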
def multitask_train(self, dataloaders: List[DataLoader], losses: List[LossFunction], train_config: TrainConfig): """ Train the model with the given data and config with the given loss for each dataset Each dataloader is sampled in turn for one batch. We sample only as many batches from each dataloader as there are in the smallest one to make sure of equal training with each dataset. :param dataloaders: the data for the training :param losses: the losses for the dataloaders the losses still uses the configuration as given in sbert_config, so you cannot for example have two different SBERTLossFunction.SOFTMAX with different number of labels :param train_config: the configuration for the training """ if train_config.output_path is not None: os.makedirs(train_config.output_path, exist_ok=True) if os.listdir(train_config.output_path): raise ValueError("Output directory ({}) already exists and is not empty.".format( train_config.output_path)) self.save(train_config.output_path, save_config=True, save_model=False) self.best_score = -9999 min_batches = min([len(dataloader) for dataloader in dataloaders]) num_dataloaders = len(dataloaders) num_train_steps = int(num_dataloaders*min_batches / train_config.gradient_accumulation_steps * train_config.epochs) # Prepare optimizer param_optimizer = list(self.model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if train_config.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = AdamW(optimizer_grouped_parameters, lr=train_config.learning_rate, eps=train_config.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_config.warmup_steps, t_total=t_total) if train_config.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(self.model, optimizer, opt_level=train_config.fp16_opt_level) global_step = 0 for epoch in trange(train_config.epochs, desc="Epoch"): training_steps = 0 self.model.train() iterators = [iter(dataloader) for dataloader in dataloaders] for step in trange(num_dataloaders*min_batches, desc="Iteration"): idx = step % num_dataloaders batch = batch_to_device(next(iterators[idx]), self.device) input_ids, segment_ids, input_masks, label_ids = batch loss = self.model(input_ids, segment_ids, input_masks, label_ids, losses[idx]) if train_config.gradient_accumulation_steps > 1: loss = loss / train_config.gradient_accumulation_steps if train_config.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), train_config.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), train_config.max_grad_norm) training_steps += 1 if (step + 1) % train_config.gradient_accumulation_steps == 0: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if train_config.evaluation_steps > 0 and training_steps % train_config.evaluation_steps == 0: self._eval_during_training(train_config, epoch, training_steps) self.model.train() self._eval_during_training(train_config, epoch, -1)
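# multitask_train interleaves its dataloaders one batch at a time and truncates to the
# shortest loader so every dataset contributes the same number of batches. The same
# sampling order, factored out as a small generator (a sketch, not this class's API):
def round_robin(dataloaders):
    """Yield (task_idx, batch), alternating across loaders for len(shortest) rounds."""
    iterators = [iter(dl) for dl in dataloaders]
    for _ in range(min(len(dl) for dl in dataloaders)):
        for idx, it in enumerate(iterators):
            yield idx, next(it)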
if fold not in [test_fold, val_fold]] acc = defaultdict(lambda: None) for epoch in range(args.n_epochs): print("Epoch:", epoch + 1) model.train() train_loss = val_loss = test_loss = 0 for row in train_data: outputs = model(long_tensor(row["text_tokens"]).unsqueeze(0), labels=long_tensor( row["source_label"]).unsqueeze(0)) loss, logits = outputs[:2] train_loss += loss.item() optimizer.zero_grad() loss.backward() optimizer.step() print("train_loss:", train_loss) model.eval() with torch.no_grad(): accs = defaultdict(list) for row in val_data: outputs = model(long_tensor(row["text_tokens"]).unsqueeze(0), labels=long_tensor( row["source_label"]).unsqueeze(0)) loss, logits = outputs[:2] val_loss += loss.item()
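# The fragment above feeds rows one at a time via unsqueeze(0). If each row is a dict
# carrying a variable-length token-id list, a padding collate_fn lets the same loop run
# on real batches; pad id 0, the field names, and `train_data` as a list of such rows
# are assumptions carried over from the fragment.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate(rows):
    ids = [torch.tensor(r["text_tokens"], dtype=torch.long) for r in rows]
    input_ids = pad_sequence(ids, batch_first=True, padding_value=0)
    labels = torch.tensor([r["source_label"] for r in rows], dtype=torch.long)
    return input_ids, (input_ids != 0).long(), labels  # ids, attention mask, labels

loader = DataLoader(train_data, batch_size=16, shuffle=True, collate_fn=collate)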
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader() num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(self.model_name_or_path,self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids,input_mask,segment_ids,\ utterance_mask,domain_mask, \ slot_mask,hist_mask,\ label_value_start,label_value_end,\ label_domainslot = batch loss_tokenstart,loss_tokenend,loss_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, label_value_start=label_value_start, label_value_end = label_value_end, label_domainslot = label_domainslot ) loss = loss_tokenstart + loss_tokenend + loss_domainslot # loss = loss_domainslot tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['de.csv']: gold_value_start = [] gold_value_end = [] gold_domainslot = [] scores_value_start = [] scores_value_end = [] scores_domainslot = [] dialogueID = [x.guid for x in eval_examples] utterance_text = [x.text_eachturn for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss_tokens_start,eval_loss_tokens_end,eval_loss_domainslot = 0,0,0 eval_F1_tokens_start,eval_F1_tokens_end = 0,0 eval_F1_sentence_domainslot,eval_F1_tokens_domainslot = 0,0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids,input_mask, segment_ids,\ utterance_mask,domain_mask, \ slot_mask,hist_mask,\ 
label_value_start,label_value_end,\ label_domainslot in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) domain_mask = domain_mask.to(self.device) slot_mask = slot_mask.to(self.device) hist_mask = hist_mask.to(self.device) label_value_start = label_value_start.to(self.device) label_value_end = label_value_end.to(self.device) label_domainslot = label_domainslot.to(self.device) with torch.no_grad(): batch_eval_loss_value_start,batch_eval_loss_value_end,batch_eval_loss_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, label_value_start = label_value_start, label_value_end=label_value_end, label_domainslot=label_domainslot ) logits_value_start,logits_value_end,logits_domainslot = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask = utterance_mask, domain_mask = domain_mask, slot_mask = slot_mask, hist_mask = hist_mask, ) logits_value_start = logits_value_start.cpu().numpy() logits_value_end = logits_value_end.cpu().numpy() logits_domainslot = logits_domainslot.cpu().numpy() label_value_start = label_value_start.to('cpu').numpy() label_value_end = label_value_end.to('cpu').numpy() label_domainslot = label_domainslot.to('cpu').numpy() scores_value_start.append(logits_value_start) scores_value_end.append(logits_value_end) scores_domainslot.append(logits_domainslot) gold_value_start.append(label_value_start) gold_value_end.append(label_value_end) gold_domainslot.append(label_domainslot) eval_loss_tokens_start += batch_eval_loss_value_start.mean().item() eval_loss_tokens_end += batch_eval_loss_value_end.mean().item() eval_loss_domainslot += batch_eval_loss_domainslot.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_value_start = np.concatenate(gold_value_start,0) gold_value_end = np.concatenate(gold_value_end,0) gold_domainslot = np.concatenate(gold_domainslot,0) scores_value_start = np.concatenate(scores_value_start, 0) scores_value_end = np.concatenate(scores_value_end, 0) scores_domainslot = np.concatenate(scores_domainslot,0) model.train() eval_loss_tokens_start = eval_loss_tokens_start/nb_eval_steps eval_loss_tokens_end = eval_loss_tokens_end / nb_eval_steps eval_loss_domainslot = eval_loss_domainslot /nb_eval_steps # print(scores_domainslot.shape) # print(gold_labels_domainslot.shape) # print(scores_domainslot) # print(gold_labels_domainslot) # exit() # eval_accuracy_token_start = accuracyF1(scores_domain, gold_labels_domain,mode='domain') # eval_accuracy_token_end = accuracyF1(scores_dependcy, gold_labels_dependcy ,mode= 'dependcy') eval_F1_valuestart,eval_F1_valueend,F1_domainslot = compute_jointGoal_domainslot( dialogueID, utterance_text, scores_value_start, scores_value_end, scores_domainslot, gold_value_start, gold_value_end, gold_domainslot ) print( 'F1_domainslot',F1_domainslot, 'eval_F1_valuestart',eval_F1_valuestart, 'eval_F1_valueend', eval_F1_valueend, 'global_step',global_step, 'loss',train_loss ) result = { 'eval_loss_tokens_start':eval_loss_tokens_start, 'eval_loss_tokens_end': eval_loss_tokens_end, 'eval_loss_domainslot':eval_loss_domainslot, 'F1_domainslot': F1_domainslot, 'eval_F1_valuestart': eval_F1_valuestart, 'eval_F1_valueend': eval_F1_valueend, 'global_step': global_step, 'loss': train_loss} output_eval_file = 
os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_F1_valuestart > best_acc : print("=" * 80) print("Best jointGoal", eval_F1_valuestart) print("Saving Model......") # best_acc = eval_accuracy best_acc = eval_F1_valuestart # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
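# The save-if-better branch above (unwrap DataParallel, torch.save the state_dict,
# update the best score) recurs across these scripts; factored into a helper it reads
# as follows (a sketch, not the repo's own API):
import os
import torch

def save_if_best(model, metric, best, output_dir, name="pytorch_model.bin"):
    """Persist the model when `metric` improves; return the updated best value."""
    if metric <= best:
        return best
    to_save = model.module if hasattr(model, "module") else model  # unwrap DataParallel
    torch.save(to_save.state_dict(), os.path.join(output_dir, name))
    return metric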
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run testing.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs."
) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and (step + 1) % ( args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
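# The training loops above and below draw batches via `next(train_dataloader)` after
# wrapping the DataLoader in `cycle`. That helper is not shown in this file; a minimal
# sketch consistent with its usage (an infinite iterator that restarts the loader,
# and thus re-shuffles under a RandomSampler, at each epoch boundary) could be:
def cycle(dataloader):
    """Yield batches forever, restarting the DataLoader whenever it is exhausted."""
    while True:
        for batch in dataloader:
            yield batch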
def main():
    output_directory = "dofus-v2"
    num_train_epochs = 3
    train_batch_size = 4
    eval_batch_size = 2
    max_context_length = 512
    learning_rate = 6.25e-5
    weight_decay = 0.01
    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    global_step = 0

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Load tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.to(device)

    train_data_loader, eval_data_loader = get_data_loaders(
        train_batch_size, eval_batch_size,
        max_context_length=max_context_length, device=device)

    # Preparing the optimizer: no weight decay on biases and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    # Halve the learning rate after each epoch (previously this scheduler was
    # created but never assigned or stepped, so it had no effect)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

    # Training the model
    model.train()
    previous_loss = float("inf")
    for _ in trange(num_train_epochs, desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_data_loader, desc="Training")
        for step, batch_element in enumerate(tqdm_bar):
            try:
                losses = model(batch_element, labels=batch_element)
                loss = losses[0]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss, for display only
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                global_step += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, optimizer.param_groups[0]["lr"])
                if step % 1000 == 0:
                    save(model, tokenizer, output_directory)
                    # `writer` is assumed to be a module-level SummaryWriter
                    log_tensorboard(model, writer, global_step, exp_average_loss,
                                    tokenizer, device)
            except RuntimeError:
                print("There was a runtime error with batch:", batch_element)
        scheduler.step()
        previous_loss = evaluate(model, tokenizer, eval_data_loader, tr_loss,
                                 previous_loss, nb_tr_steps, global_step,
                                 output_directory)
        model.train()

    save(model, tokenizer, output_directory)
    # Final evaluation
    evaluate(model, tokenizer, eval_data_loader, tr_loss, previous_loss,
             nb_tr_steps, global_step, output_directory)
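# `save(model, tokenizer, output_directory)` is called above but not defined in this
# file. A plausible minimal implementation, assuming the standard Hugging Face
# `save_pretrained` API (the helper's name and exact behavior are assumptions):
def save(model, tokenizer, output_directory):
    """Persist model weights/config and tokenizer files so that the run can be
    resumed later with `GPT2LMHeadModel.from_pretrained(output_directory)`."""
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_directory)
    tokenizer.save_pretrained(output_directory)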
def fit(model, training_iter, eval_iter, num_train_steps, device, n_gpu, verbose=1):
    # ------------------ Result visualization ------------------------
    if args.local_rank in [-1, 0]:
        TIMESTAMP = "{0:%Y-%m-%dT%H-%M-%S/}".format(datetime.now())
        tb_writer = SummaryWriter('log/%s' % TIMESTAMP)

    # --------------------- Optimizer -------------------------
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    t_total = num_train_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    # alternatively: warmup_steps=int(t_total * args.warmup_proportion)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    # --------------------- Model initialization ----------------------
    # Move the model to the device before amp / DataParallel / DDP wrapping
    model.to(device)

    # --------------------- fp16 half precision on GPU -----------------------------
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank, find_unused_parameters=True)

    tr_loss, logging_loss = 0.0, 0.0

    # ------------------------ Training ------------------------------
    best_f1 = 0
    global_step = 0
    set_seed(args, n_gpu)  # Added here for reproducibility (even between Python 2 and 3)
    bar = tqdm(range(t_total), total=t_total)
    nb_tr_examples, nb_tr_steps = 0, 0
    for step in bar:
        model.train()
        batch = next(training_iter)
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  # XLM doesn't use segment_ids
                  'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                  'labels': batch[3]}
        encode = model(**inputs)
        encode = encode[0]  # take the prediction scores from the model output
        loss = model.loss_fn(encode, labels=inputs['labels'])
        if n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        else:
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        tr_loss += loss.item()
        train_loss = round(tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += inputs['input_ids'].size(0)
        nb_tr_steps += 1

        if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if args.local_rank in [-1, 0] and args.do_eval and \
                (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0:
            # ----------------------- Validation ----------------------------
            model.eval()
            y_predicts, y_labels = [], []
            eval_loss, eval_acc, eval_f1 = 0, 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for _, batch in enumerate(eval_iter):
                batch = tuple(t.to(device) for t in batch)
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          # XLM doesn't use segment_ids
                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                          'labels': batch[3]}
                with torch.no_grad():
                    encode = model(**inputs)
                    encode = encode[0]  # take the prediction scores from the model output
                    eval_los = model.loss_fn(encode, labels=inputs['labels'])
                    predicts = model.predict(encode)
                nb_eval_examples += inputs['input_ids'].size(0)
                nb_eval_steps += 1
                eval_loss += eval_los.mean().item()
                y_predicts.append(torch.from_numpy(predicts))
                # drop padded positions (label == -1)
                labels = inputs['labels'].view(1, -1)
                labels = labels[labels != -1]
                y_labels.append(labels)

            eval_loss = eval_loss / nb_eval_steps
            eval_predicted = torch.cat(y_predicts, dim=0).cpu().numpy()
            eval_labeled = torch.cat(y_labels, dim=0).cpu().numpy()
            eval_f1 = model.acc_rec_f1(eval_predicted, eval_labeled)

            logger.info(
                '\n\nglobal_step %d - train_loss: %.4f - eval_loss: %.4f - eval_f1: %.4f\n'
                % (global_step, train_loss, eval_loss, eval_f1))

            # Save the best model
            if eval_f1 > best_f1:
                best_f1 = eval_f1
                save_model(model, args.output_dir)

            if args.local_rank in [-1, 0]:
                tb_writer.add_scalar('train_loss', train_loss, step)
                tb_writer.add_scalar('eval_loss', eval_loss, step)
                tb_writer.add_scalar('eval_f1', eval_f1, step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)

    if args.local_rank in [-1, 0]:
        tb_writer.close()
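# For reference, the WarmupLinearSchedule used throughout these scripts (from
# pytorch-transformers) scales the base learning rate by a multiplier that ramps
# up linearly over `warmup_steps` and then decays linearly to zero at `t_total`.
# A standalone sketch of that multiplier (modulo minor version differences):
def warmup_linear_multiplier(step, warmup_steps, t_total):
    """LR multiplier in [0, 1]: linear warmup, then linear decay to zero."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))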
def train(self):
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader()
    num_train_optimization_steps = self.train_steps

    # Prepare model
    config = BertConfig.from_pretrained(self.model_name_or_path)
    model = BertForTokenClassification.from_pretrained(self.model_name_or_path, self.args, config=config)
    model.to(self.device)
    model.train()

    # Prepare optimizer: no weight decay on biases and LayerNorm weights
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': self.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", self.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps)

    best_acc = 0
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    train_dataloader = cycle(train_dataloader)

    for step in range(num_train_optimization_steps):
        batch = next(train_dataloader)
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_domain, label_dependcy = batch
        # The model returns one loss per head (domain and dependency)
        loss_domain, loss_dependcy = model(input_ids=input_ids,
                                           token_type_ids=segment_ids,
                                           attention_mask=input_mask,
                                           label_domain=label_domain,
                                           label_dependcy=label_dependcy)
        loss = loss_domain + loss_dependcy
        tr_loss += loss.item()
        train_loss = round(tr_loss / (nb_tr_steps + 1), 4)
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        loss.backward()
        if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

        if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            logger.info("***** Report result *****")
            logger.info("  %s = %s", 'global_step', str(global_step))
            logger.info("  %s = %s", 'train loss', str(train_loss))

        if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            for file in ['dev.csv']:
                gold_labels_domain = []
                gold_labels_dependcy = []
                scores_domain = []
                scores_dependcy = []

                logger.info("***** Running evaluation *****")
                logger.info("  Num examples = %d", len(eval_examples))
                logger.info("  Batch size = %d", self.eval_batch_size)

                model.eval()
                eval_loss_domain, eval_loss_dependcy = 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                for input_ids, input_mask, segment_ids, label_domain, label_dependcy in eval_dataloader:
                    input_ids = input_ids.to(self.device)
                    input_mask = input_mask.to(self.device)
                    segment_ids = segment_ids.to(self.device)
                    label_domain = label_domain.to(self.device)
                    label_dependcy = label_dependcy.to(self.device)

                    with torch.no_grad():
                        batch_eval_loss_domain, batch_eval_loss_dependcy = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            label_domain=label_domain,
                            label_dependcy=label_dependcy)
                        logits_domain, logits_dependcy = model(
                            input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask)

                    logits_domain = logits_domain.view(-1, self.num_labels_domain).detach().cpu().numpy()
                    logits_dependcy = logits_dependcy.view(-1, self.num_labels_dependcy).detach().cpu().numpy()
                    label_domain = label_domain.view(-1).to('cpu').numpy()
                    label_dependcy = label_dependcy.view(-1).to('cpu').numpy()
                    scores_domain.append(logits_domain)
                    scores_dependcy.append(logits_dependcy)
                    gold_labels_domain.append(label_domain)
                    gold_labels_dependcy.append(label_dependcy)
                    eval_loss_domain += batch_eval_loss_domain.mean().item()
                    eval_loss_dependcy += batch_eval_loss_dependcy.mean().item()
                    nb_eval_examples += input_ids.size(0)
                    nb_eval_steps += 1

                gold_labels_domain = np.concatenate(gold_labels_domain, 0)
                gold_labels_dependcy = np.concatenate(gold_labels_dependcy, 0)
                scores_domain = np.concatenate(scores_domain, 0)
                scores_dependcy = np.concatenate(scores_dependcy, 0)
                model.train()
                eval_loss_domain = eval_loss_domain / nb_eval_steps
                eval_loss_dependcy = eval_loss_dependcy / nb_eval_steps
                eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain, mode='domain')
                eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy, mode='dependcy')

                print('eval_F1_domain', eval_accuracy_domain,
                      'eval_F1_dependcy', eval_accuracy_dependcy,
                      'global_step', global_step,
                      'loss', train_loss)
                result = {'eval_loss_domain': eval_loss_domain,
                          'eval_loss_dependcy': eval_loss_dependcy,
                          'eval_F1_domain': eval_accuracy_domain,
                          'eval_F1_dependcy': eval_accuracy_dependcy,
                          'global_step': global_step,
                          'loss': train_loss}

                output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))
                    writer.write('*' * 80)
                    writer.write('\n')

                if eval_accuracy_domain > best_acc:
                    print("=" * 80)
                    print("Best F1", eval_accuracy_domain)
                    print("Saving Model......")
                    best_acc = eval_accuracy_domain
                    # Save a trained model (unwrap DataParallel/DDP first)
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    print("=" * 80)
                else:
                    print("=" * 80)
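# `accuracyF1(scores, gold, mode=...)` above is defined elsewhere. A minimal sketch
# of what such a metric typically computes (argmax over logits, then macro-averaged
# F1 via scikit-learn); the `mode` argument presumably selects the label set and is
# ignored in this simplified version:
import numpy as np
from sklearn.metrics import f1_score

def accuracy_f1_sketch(logits, gold_labels):
    """Macro-averaged F1 of argmax predictions against gold labels."""
    predictions = np.argmax(logits, axis=1)
    return f1_score(gold_labels, predictions, average='macro')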