def train_bert(config: PipeLineConfig):
    logging.basicConfig(level=logging.INFO)

    logging.info("Reading data...")
    input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(input_folder, "train.csv"))

    logging.info("Tokenizing...")
    with multiprocessing.Pool(processes=32) as pool:
        text_list = train.comment_text.tolist()
        sequences = pool.map(convert_line_uncased, text_list)

    logging.info("Building input tensors for training...")
    sequences = np.array(sequences)
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    logging.info("Building target tensor...")
    iden = train[IDENTITY_COLUMNS].fillna(0).values
    subgroup_target = np.hstack(
        [
            (iden >= 0.5).any(axis=1, keepdims=True).astype(int),
            iden,
            iden.max(axis=1, keepdims=True),
        ]
    )
    sub_target_weights = (
        ~train[IDENTITY_COLUMNS].isna().values.any(axis=1, keepdims=True)
    ).astype(int)

    weights = np.ones(len(train))
    weights += (iden >= 0.5).any(1)
    weights += (train["target"].values >= 0.5) & (iden < 0.5).any(1)
    weights += (train["target"].values < 0.5) & (iden >= 0.5).any(1)
    weights /= weights.mean()

    y_aux_train = train[AUX_TARGETS]
    y_train_torch = torch.tensor(
        np.hstack(
            [
                train.target.values[:, None],
                weights[:, None],
                y_aux_train,
                subgroup_target,
                sub_target_weights,
            ]
        )
    ).float()

    perfect_output = torch.tensor(
        np.hstack([train.target.values[:, None], y_aux_train, subgroup_target])
    ).float()

    logging.info("Seeding with seed %d ...", config.seed)
    seed_everything(config.seed)

    logging.info("Creating dataset...")
    dataset = data.TensorDataset(
        torch.from_numpy(sequences).long(), y_train_torch, torch.from_numpy(lengths)
    )
    train_loader = data.DataLoader(
        dataset, batch_size=BATCH_SIZE, collate_fn=clip_to_max_len, shuffle=True
    )

    logging.info("Creating a model...")
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=18
    )
    model.zero_grad()
    model = model.cuda()
    model.classifier.bias = nn.Parameter(perfect_bias(perfect_output.mean(0)).cuda())

    logs_file = f"./tb_logs/final_{config.expname}"

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if should_decay(n)],
            "weight_decay": config.decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if not should_decay(n)],
            "weight_decay": 0.00,
        },
    ]
    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.lr,
        warmup=config.warmup,
        t_total=config.epochs * len(train_loader) // ACCUM_STEPS,
    )

    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    model = model.train()

    writer = SummaryWriter(logs_file)
    agg = TensorboardAggregator(writer)
    custom_loss = prepare_loss(config)

    for _ in range(config.epochs):
        for j, (X, y) in enumerate(train_loader):
            X = X.cuda()
            y = y.cuda()

            y_pred = model(X, attention_mask=(X > 0))
            loss = custom_loss(y_pred, y)

            accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean()
            agg.log({"train_loss": loss.item(), "train_accuracy": accuracy.item()})

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (j + 1) % ACCUM_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

    torch.save(model.state_dict(), f"./models/final-pipe2-{config.expname}.bin")
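
# The function above references a few project-level helpers that are not shown in this
# snippet. The following are minimal sketches of what they are assumed to do (assumptions,
# not the original implementations): should_decay() excludes bias/LayerNorm parameters
# from weight decay, and clip_to_max_len() is the collate_fn that trims each batch to
# its longest non-padded sequence.
def should_decay(name: str) -> bool:
    # Assumed convention: no weight decay for bias and LayerNorm parameters.
    return not any(nd in name for nd in ("bias", "LayerNorm.bias", "LayerNorm.weight"))


def clip_to_max_len(batch):
    # Assumed collate_fn: batch is a list of (sequence, target, length) triples from the
    # TensorDataset; trim all sequences to the longest real length in the batch.
    sequences, targets, lengths = zip(*batch)
    max_len = max(int(l) for l in lengths)
    return torch.stack(sequences)[:, :max_len], torch.stack(targets)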
def train(self):
    if self.debug_mode:
        self.epochs = 1
    # Load dataloaders
    train_loader, valid_loader = self.create_dataloader()
    # Training setup
    self.seed_everything()
    lr = 2e-5
    accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
    # Convert the pre-trained TensorFlow BERT checkpoint to PyTorch
    if not os.path.exists(self.work_dir + 'pytorch_model.bin'):
        print("Convert pre-trained model")
        convert_tf_checkpoint_to_pytorch.convert_tf_checkpoint_to_pytorch(
            self.bert_model_path + 'bert_model.ckpt',
            self.bert_model_path + 'bert_config.json',
            self.work_dir + 'pytorch_model.bin')
        shutil.copyfile(self.bert_model_path + 'bert_config.json',
                        self.work_dir + 'bert_config.json')
    # Load the pre-trained model
    print("Load pre-trained model")
    model = BertNeuralNet.from_pretrained(self.work_dir, cache_dir=None)
    model.zero_grad()
    model = model.to(self.device)
    # Use a different weight_decay for different parameter groups
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    epoch_steps = int(self.train_len * 0.5 / self.base_batch_size / accumulation_steps)
    num_train_optimization_steps = int(self.epochs * epoch_steps)
    valid_every = math.floor(epoch_steps * accumulation_steps / 5)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=lr,
                         warmup=0.05,
                         t_total=num_train_optimization_steps)
    # Learning rate decay schedule (unused)
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
    # Start training
    print("Train")
    best_auc_score_1 = 0
    best_auc_score_2 = 0
    best_auc_score_3 = 0
    best_auc_score_4 = 0
    f_log = open("train_log.txt", "w")
    for epoch in range(self.epochs):
        model.train()
        optimizer.zero_grad()
        # Load each batch and train on it
        train_start_time = time.time()
        for i, batch_data in enumerate(train_loader):
            x_batch = batch_data[0]
            y_batch = batch_data[1]
            target_weight_batch = batch_data[2]
            aux_weight_batch = batch_data[3]
            identity_weight_batch = batch_data[4]
            np_weight_batch = batch_data[5]
            np_identity_weight_batch = batch_data[6]
            y_pred = model(x_batch.to(self.device),
                           attention_mask=(x_batch > 0).to(self.device),
                           labels=None)
            target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                y_pred, y_batch, epoch, target_weight_batch, aux_weight_batch,
                identity_weight_batch, np_weight_batch)
            loss = target_loss + aux_loss + identity_loss + np_loss
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i + 1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            # Validation
            if (i + 1) % valid_every == 0:
                model.eval()
                stage = int((i + 1) / valid_every)
                train_stage_duration = int((time.time() - train_start_time) / 60)
                valid_start_time = time.time()
                y_pred = np.zeros((len(self.train_df) - self.train_len))
                for j, valid_batch_data in enumerate(valid_loader):
                    x_batch = valid_batch_data[0]
                    batch_y_pred = self.sigmoid(
                        model(x_batch.to(self.device),
                              attention_mask=(x_batch > 0).to(self.device),
                              labels=None).detach().cpu().numpy())[:, 0]
                    y_pred[j * self.base_batch_size:(j + 1) * self.base_batch_size] = batch_y_pred
                # Compute the competition metric
                auc_score = self.evaluator.get_final_metric(y_pred)
                valid_duration = int((time.time() - valid_start_time) / 60)
                train_start_time = time.time()
                f_log.write(
                    "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                    % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                print(
                    "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                    % (epoch, stage, train_stage_duration, valid_duration, auc_score))
                # Keep the four best checkpoints seen so far
                if auc_score > best_auc_score_4:
                    state_dict = model.state_dict()
                    if auc_score > best_auc_score_1:
                        best_auc_score_1 = auc_score
                        torch.save(state_dict, "model1.bin")
                    elif auc_score > best_auc_score_2:
                        best_auc_score_2 = auc_score
                        torch.save(state_dict, "model2.bin")
                    elif auc_score > best_auc_score_3:
                        best_auc_score_3 = auc_score
                        torch.save(state_dict, "model3.bin")
                    else:
                        best_auc_score_4 = auc_score
                        torch.save(state_dict, "model4.bin")
                    with open("model_score.txt", "w") as f:
                        f.write("model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                                % (best_auc_score_1, best_auc_score_2,
                                   best_auc_score_3, best_auc_score_4))
                    print("model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                          % (best_auc_score_1, best_auc_score_2,
                             best_auc_score_3, best_auc_score_4))
                model.train()
    if self.last:
        state_dict = model.state_dict()
        torch.save(state_dict, "model_last.bin")
    # Free the training inputs and the model
    del train_loader, valid_loader, model, optimizer, param_optimizer, optimizer_grouped_parameters
    gc.collect()
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
model = model.train()

tq = tqdm_notebook(range(EPOCHS))
for epoch in tq:
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    avg_loss = 0.
    avg_accuracy = 0.
    lossf = None
    tk0 = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
    optimizer.zero_grad()  # Bug fix - thanks to @chinhuic
    for i, (x_batch, y_batch) in tk0:
        # optimizer.zero_grad()
        y_pred = model(x_batch.to(device),
                       attention_mask=(x_batch > 0).to(device),
                       labels=None)
        loss = F.binary_cross_entropy_with_logits(y_pred, y_batch.to(device))
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (i + 1) % accumulation_steps == 0:  # Wait for several backward steps
            optimizer.step()                   # Now we can do an optimizer step
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98 * lossf + 0.02 * loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss=lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean(
            ((torch.sigmoid(y_pred[:, 0]) > 0.5) == (y_batch[:, 0] > 0.5).to(device)).to(torch.float)
        ).item() / len(train_loader)
    tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)

torch.save(model.state_dict(), output_model_file)
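
# Gradient accumulation above only steps the optimizer every `accumulation_steps`
# mini-batches, so the effective batch size is batch_size * accumulation_steps
# (e.g. 32 * 2 = 64 examples per optimizer step; the numbers are illustrative, not from
# the source). For an exact match with large-batch training the per-batch loss is often
# divided by accumulation_steps before backward(); this loop sums the raw losses instead,
# which only rescales the accumulated gradient magnitude.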
        # break
        batch = tuple(t.to(device) for t in batch)
        X, S, X_MASK, X_SEG = batch
        pred_s = subject_model(X, X_SEG, X_MASK)
        # Compute the loss only over real (non-padded) token positions
        active_loss = X_MASK.view(-1) == 1
        loss = loss_func(pred_s.view(-1, num_class)[active_loss],
                         S.view(-1)[active_loss])
        if n_gpu > 1:
            loss = loss.mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        tr_total_loss += loss.item()
        if batch_idx % 100 == 0:
            # divide by (batch_idx + 1) to avoid division by zero on the first batch
            logger.info(
                f'Epoch:{epoch} - batch:{batch_idx}/{train_D.steps} - loss: {tr_total_loss / (batch_idx + 1):.8f}')

    # After the batch loop: evaluate the subject model on the dev set
    subject_model.eval()
    A, B, C = 1e-10, 1e-10, 1e-10
    err_dict = defaultdict(list)
    cat_dict = defaultdict(lambda: 1e-10)
    for eval_idx, d in enumerate(dev_data):
        tt, ll = d
        R = extract_items(tt)
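
# In this evaluation pattern A, B and C are usually running counts, initialised to 1e-10
# to avoid division by zero: A = correctly extracted items, B = total predicted items,
# C = total gold items. The final metrics are then computed with the usual formulas
# (assumed convention, not shown in the snippet):
#   precision = A / B
#   recall    = A / C
#   f1        = 2 * A / (B + C)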
class MTDNNModel(object):
    def __init__(self, opt, state_dict=None, num_train_step=-1):
        self.config = opt
        self.updates = state_dict['updates'] if state_dict and 'updates' in state_dict else 0
        self.train_loss = AverageMeter()
        self.network = SANBertNetwork(opt)
        if state_dict:
            new_state = set(self.network.state_dict().keys())
            for k in list(state_dict['state'].keys()):
                if k not in new_state:
                    del state_dict['state'][k]
            for k, v in list(self.network.state_dict().items()):
                if k not in state_dict['state']:
                    state_dict['state'][k] = v
            self.network.load_state_dict(state_dict['state'])
        self.mnetwork = nn.DataParallel(self.network) if opt['multi_gpu_on'] else self.network
        self.total_param = sum(
            p.nelement() for p in self.network.parameters() if p.requires_grad)

        # apply weight decay only to non-bias / non-LayerNorm parameters
        no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {
                'params': [p for n, p in self.network.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in self.network.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            },
        ]

        # note that Adamax is modified based on the BERT code
        if opt['optimizer'] == 'sgd':
            self.optimizer = optim.SGD(optimizer_parameters,
                                       opt['learning_rate'],
                                       weight_decay=opt['weight_decay'])
        elif opt['optimizer'] == 'adamax':
            self.optimizer = Adamax(optimizer_parameters,
                                    opt['learning_rate'],
                                    warmup=opt['warmup'],
                                    t_total=num_train_step,
                                    max_grad_norm=opt['grad_clipping'],
                                    schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        elif opt['optimizer'] == 'adadelta':
            self.optimizer = optim.Adadelta(optimizer_parameters,
                                            opt['learning_rate'],
                                            rho=0.95)
        elif opt['optimizer'] == 'adam':
            self.optimizer = Adam(optimizer_parameters,
                                  lr=opt['learning_rate'],
                                  warmup=opt['warmup'],
                                  t_total=num_train_step,
                                  max_grad_norm=opt['grad_clipping'],
                                  schedule=opt['warmup_schedule'])
            if opt.get('have_lr_scheduler', False):
                opt['have_lr_scheduler'] = False
        else:
            raise RuntimeError('Unsupported optimizer: %s' % opt['optimizer'])

        if state_dict and 'optimizer' in state_dict:
            self.optimizer.load_state_dict(state_dict['optimizer'])

        if opt.get('have_lr_scheduler', False):
            if opt.get('scheduler_type', 'rop') == 'rop':
                self.scheduler = ReduceLROnPlateau(self.optimizer,
                                                   mode='max',
                                                   factor=opt['lr_gamma'],
                                                   patience=3)
            elif opt.get('scheduler_type', 'rop') == 'exp':
                self.scheduler = ExponentialLR(self.optimizer,
                                               gamma=opt.get('lr_gamma', 0.95))
            else:
                milestones = [
                    int(step) for step in opt.get('multi_step_lr', '10,20,30').split(',')
                ]
                self.scheduler = MultiStepLR(self.optimizer,
                                             milestones=milestones,
                                             gamma=opt.get('lr_gamma'))
        else:
            self.scheduler = None

        self.ema = None
        if opt['ema_opt'] > 0:
            self.ema = EMA(self.config['ema_gamma'], self.network)
        self.para_swapped = False

    def setup_ema(self):
        if self.config['ema_opt']:
            self.ema.setup()

    def update_ema(self):
        if self.config['ema_opt']:
            self.ema.update()

    def eval(self):
        if self.config['ema_opt']:
            self.ema.swap_parameters()
            self.para_swapped = True

    def train(self):
        if self.para_swapped:
            self.ema.swap_parameters()
            self.para_swapped = False

    def update(self, batch_meta, batch_data):
        self.network.train()
        labels = batch_data[batch_meta['label']]
        if batch_meta['pairwise']:
            labels = labels.contiguous().view(-1, batch_meta['pairwise_size'])[:, 0]
        if self.config['cuda']:
            y = Variable(labels.cuda(non_blocking=True), requires_grad=False)
        else:
            y = Variable(labels, requires_grad=False)
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        logits = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            logits = logits.view(-1, batch_meta['pairwise_size'])
        if self.config.get('weighted_on', False):
            if self.config['cuda']:
                weight = Variable(batch_data[batch_meta['factor']].cuda(non_blocking=True))
            else:
                weight = Variable(batch_data[batch_meta['factor']])
            if task_type > 0:
                loss = torch.mean(F.mse_loss(logits.squeeze(), y, reduce=False) * weight)
            else:
                loss = torch.mean(F.cross_entropy(logits, y, reduce=False) * weight)
        else:
            if task_type > 0:
                loss = F.mse_loss(logits.squeeze(), y)
            else:
                loss = F.cross_entropy(logits, y)
        self.train_loss.update(loss.item(), logits.size(0))
        self.optimizer.zero_grad()
        loss.backward()
        if self.config['global_grad_clipping'] > 0:
            torch.nn.utils.clip_grad_norm_(self.network.parameters(),
                                           self.config['global_grad_clipping'])
        self.optimizer.step()
        self.updates += 1
        self.update_ema()

    def predict(self, batch_meta, batch_data):
        self.network.eval()
        task_id = batch_meta['task_id']
        task_type = batch_meta['task_type']
        inputs = batch_data[:batch_meta['input_len']]
        if len(inputs) == 3:
            inputs.append(None)
            inputs.append(None)
        inputs.append(task_id)
        score = self.mnetwork(*inputs)
        if batch_meta['pairwise']:
            score = score.contiguous().view(-1, batch_meta['pairwise_size'])
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.zeros(score.shape, dtype=int)
            positive = np.argmax(score, axis=1)
            for idx, pos in enumerate(positive):
                predict[idx, pos] = 1
            predict = predict.reshape(-1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['true_label']
        else:
            if task_type < 1:
                score = F.softmax(score, dim=1)
            score = score.data.cpu()
            score = score.numpy()
            predict = np.argmax(score, axis=1).tolist()
            score = score.reshape(-1).tolist()
            return score, predict, batch_meta['label']

    def save(self, filename):
        network_state = dict([(k, v.cpu()) for k, v in self.network.state_dict().items()])
        ema_state = dict([(k, v.cpu()) for k, v in self.ema.model.state_dict().items()
                          ]) if self.ema is not None else dict()
        params = {
            'state': network_state,
            'optimizer': self.optimizer.state_dict(),
            'ema': ema_state,
            'config': self.config,
        }
        torch.save(params, filename)
        logger.info('model saved to {}'.format(filename))

    def cuda(self):
        self.network.cuda()
        if self.config['ema_opt']:
            self.ema.cuda()
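
# MTDNNModel delegates weight averaging to an EMA helper (setup/update/swap_parameters/
# cuda and a .model attribute). The class below is a hypothetical minimal sketch that
# matches those calls; the real MT-DNN implementation may differ.
import copy

class EMA:
    def __init__(self, gamma, network):
        self.gamma = gamma                   # decay factor for the running average
        self.network = network               # live network being trained
        self.model = copy.deepcopy(network)  # shadow copy holding the averaged weights

    def setup(self):
        # initialise the shadow weights from the current network
        self.model.load_state_dict(self.network.state_dict())

    def update(self):
        # shadow = gamma * shadow + (1 - gamma) * current
        with torch.no_grad():
            for s, p in zip(self.model.parameters(), self.network.parameters()):
                s.mul_(self.gamma).add_(p, alpha=1.0 - self.gamma)

    def swap_parameters(self):
        # exchange live and averaged weights (used around evaluation)
        for s, p in zip(self.model.parameters(), self.network.parameters()):
            tmp = p.data.clone()
            p.data.copy_(s.data)
            s.data.copy_(tmp)

    def cuda(self):
        self.model.cuda()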
model.train()
classifier.train()
total_loss = 0
for i, batch in enumerate(train_dataloader):
    data, mask = tensorized(batch[:, 0], vocab)
    label = torch.tensor(list(batch[:, 1])).to(DEVICE)
    data, mask = data.to(DEVICE), mask.to(DEVICE)
    output = model(data, mask)
    logit, loss = classifier(output, label)
    # average over the batch and scale for gradient accumulation
    loss = loss.mean() / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optim.step()
        c_optim.step()
        optim.zero_grad()
        c_optim.zero_grad()
    total_loss += loss.item() * accumulation_steps

model.eval()
classifier.eval()
with torch.no_grad():
    valid_loss = 0
    preds, labels = [], []
    for i, batch in enumerate(valid_dataloader):
        data, mask = tensorized(batch[:, 0], vocab)
        label = torch.tensor(list(batch[:, 1])).to(DEVICE)
        data, mask = data.to(DEVICE), mask.to(DEVICE)
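
# The loop above assumes a `tensorized` helper that turns a batch of texts into padded
# id tensors plus a mask. A hypothetical sketch, assuming the texts are already token
# lists and `vocab` is a dict-like mapping with '[PAD]' and '[UNK]' entries:
def tensorized(texts, vocab):
    pad_id = vocab['[PAD]']
    ids = [[vocab.get(tok, vocab['[UNK]']) for tok in text] for text in texts]
    max_len = max(len(seq) for seq in ids)
    data = torch.tensor([seq + [pad_id] * (max_len - len(seq)) for seq in ids],
                        dtype=torch.long)
    mask = (data != pad_id).long()  # 1 for real tokens, 0 for padding
    return data, mask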
def train_and_validate(model, train_loader, eval_loader, tokenizer, processor,
                       max_eps, lr, batch_size, num_train_examples, warmup,
                       print_every=10, use_bert_adam=False, log_training_info=True):
    torch.cuda.empty_cache()
    tr_loss = 0
    nb_tr_steps = 1
    criterion = nn.CrossEntropyLoss()

    if use_bert_adam:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0
            },
        ]
        t_total = int((float(num_train_examples) / batch_size) * max_eps)
        opti = BertAdam(optimizer_grouped_parameters,
                        lr=lr,
                        warmup=warmup,
                        t_total=t_total)
    else:
        opti = optim.Adam(model.parameters(), lr=lr)

    categories = set(processor.categories.keys())

    # training
    if log_training_info:
        print("***** Running training *****")
        print(f" Epochs = {max_eps}\n")
        print(f" Num examples = {num_train_examples}")
        print(f" Learning rate = {lr}")
        print(f" Batch size = {batch_size}")
        print(f" Categories = {categories if categories != CodahProcessor.get_all_categories() else 'all'}\n")

    model.train()
    for ep in range(max_eps):
        tr_loss = 0
        for step, batch in enumerate(train_loader):
            # clear gradients
            model.zero_grad()
            # reshape and reduce the second dimension
            # pull label from training data, set aside for softmax
            n_batches = batch[0].shape[0]
            batch = [ids.view(ids.shape[0] * 4, -1) for ids in batch]
            # feedforward and loss calculation
            # batch = tuple(t.cuda() for t in batch)
            input_ids, input_mask, segment_ids, word_ids, word_lens, label_ids, _ = batch
            logits = model.forward(
                input_ids.cuda(), segment_ids.cuda(), input_mask.cuda(),
                word_ids.cuda(), word_lens.cuda())  # label_ids removed to skip softmax in model
            logits = logits.view(-1, 4)  # reshape to (:, 4)
            loss = criterion(logits, label_ids.view(n_batches, 4)[:, 0].cuda())
            loss.backward()
            tr_loss += loss.item()
            nb_tr_steps += 1
            # optimization step
            opti.step()
            if (step + 1) % print_every == 0 and log_training_info:
                acc = accuracy(logits, label_ids.view(n_batches, 4)[:, 0].cuda())
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}"
                      .format(step + 1, ep + 1, loss.item(), acc))

    # evaluation
    # eval_examples = processor.get_dev_examples()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    eval_category_acc = {key: 0 for key in categories}
    nb_eval_category_steps = {key: 0 for key in categories}
    model.eval()
    for input_ids, input_mask, segment_ids, word_ids, word_lens, label_ids, category_ids in eval_loader:
        input_ids = input_ids.view(input_ids.shape[0] * 4, -1).cuda()
        input_mask = input_mask.view(input_mask.shape[0] * 4, -1).cuda()
        segment_ids = segment_ids.view(segment_ids.shape[0] * 4, -1).cuda()
        word_ids = word_ids.view(word_ids.shape[0] * 4, -1).cuda()
        word_lens = word_lens.view(word_lens.shape[0] * 4, -1).cuda()
        label_ids = label_ids[:, 0].cuda()
        category_ids = category_ids[:, 0]

        with torch.no_grad():
            logits = model.forward(input_ids, segment_ids, input_mask,
                                   word_ids, word_lens).view(-1, 4)
            tmp_eval_loss = criterion(logits, label_ids)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = np_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        eval_category_acc[processor.id_to_category[category_ids[0]]] += tmp_eval_accuracy
        nb_eval_category_steps[processor.id_to_category[category_ids[0]]] += 1

        nb_eval_examples += label_ids.shape[0]
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    result = {
        'eval_loss': eval_loss,
        'eval_accuracy': eval_accuracy,
        'tr_loss': tr_loss / nb_tr_steps
    }

    print("\n***** Eval results *****")
    for key in sorted(result.keys()):
        print(f"{key} = {str(result[key])}")
    print("\nresults by question category")
    for key in categories:
        eval_category_acc[key] /= nb_eval_category_steps[key]
        print(f"{key} = {eval_category_acc[key]}")

    return result
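
# The loop above relies on two accuracy helpers that are not shown. Sketches of what they
# are assumed to compute, based on how their results are aggregated: `accuracy` returns a
# per-batch fraction for logging, while `np_accuracy` returns the number of correct
# predictions (it is summed and later divided by nb_eval_examples).
def accuracy(logits, labels):
    # logits: (batch, 4) tensor, labels: (batch,) tensor of gold answer indices
    return (logits.argmax(dim=1) == labels).float().mean().item()


def np_accuracy(logits, labels):
    # logits: (batch, 4) numpy array, labels: (batch,) numpy array of gold answer indices
    return float((np.argmax(logits, axis=1) == labels).sum())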
def train(args):
    # label_name = ['not related or not informative', 'other useful information',
    #               'donations and volunteering', 'affected individuals',
    #               'sympathy and support', 'infrastructure and utilities damage',
    #               'caution and advice']
    label_name = ['Premise', 'Claim', 'None', 'MajorClaim']
    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    prefix = args['MODEL'] + '_' + args['BERT_CONFIG']
    bert_size = args['BERT_CONFIG'].split('-')[1]

    start_time = time.time()
    print('Importing data...', file=sys.stderr)
    df_train = pd.read_csv(args['--train'], index_col=0)
    df_val = pd.read_csv(args['--dev'], index_col=0)
    train_label = dict(df_train.InformationType_label.value_counts())
    label_max = float(max(train_label.values()))
    train_label_weight = torch.tensor(
        [label_max / train_label[i] for i in range(len(train_label))], device=device)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    start_time = time.time()
    print('Set up model...', file=sys.stderr)
    if args['MODEL'] == 'default':
        model = DefaultModel(args['BERT_CONFIG'], device, len(label_name))
        optimizer = BertAdam([{'params': model.bert.bert.parameters()},
                              {'params': model.bert.classifier.parameters(),
                               'lr': float(args['--lr'])}],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'nonlinear':
        model = NonlinearModel(args['BERT_CONFIG'], device, len(label_name),
                               float(args['--dropout']))
        optimizer = BertAdam([{'params': model.bert.parameters()},
                              {'params': model.linear1.parameters(), 'lr': float(args['--lr'])},
                              {'params': model.linear2.parameters(), 'lr': float(args['--lr'])},
                              {'params': model.linear3.parameters(), 'lr': float(args['--lr'])}],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'lstm':
        model = CustomBertLSTMModel(args['BERT_CONFIG'], device,
                                    float(args['--dropout']), len(label_name),
                                    lstm_hidden_size=int(args['--hidden-size']))
        optimizer = BertAdam([{'params': model.bert.parameters()},
                              {'params': model.lstm.parameters(), 'lr': float(args['--lr'])},
                              {'params': model.hidden_to_softmax.parameters(),
                               'lr': float(args['--lr'])}],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    elif args['MODEL'] == 'cnn':
        model = CustomBertConvModel(args['BERT_CONFIG'], device,
                                    float(args['--dropout']), len(label_name),
                                    out_channel=int(args['--out-channel']))
        optimizer = BertAdam([{'params': model.bert.parameters()},
                              {'params': model.conv.parameters(), 'lr': float(args['--lr'])},
                              {'params': model.hidden_to_softmax.parameters(),
                               'lr': float(args['--lr'])}],
                             lr=float(args['--lr-bert']),
                             max_grad_norm=float(args['--clip-grad']))
    else:
        print('please input a valid model')
        exit(0)

    model = model.to(device)
    print('Use device: %s' % device, file=sys.stderr)
    print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
    print('-' * 80, file=sys.stderr)

    model.train()
    cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight, reduction='mean')
    torch.save(cn_loss, 'loss_func')  # for later testing

    train_batch_size = int(args['--batch-size'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = prefix + '_model.bin'

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = 0
    cum_examples = report_examples = epoch = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('Begin Maximum Likelihood training...')

    while True:
        epoch += 1
        for sents, targets in batch_iter(df_train, batch_size=train_batch_size,
                                         shuffle=True, bert=bert_size):  # for each epoch
            train_iter += 1
            optimizer.zero_grad()
            batch_size = len(sents)
            pre_softmax = model(sents)
            loss = cn_loss(pre_softmax,
                           torch.tensor(targets, dtype=torch.long, device=device))
            loss.backward()
            optimizer.step()

            batch_losses_val = loss.item() * batch_size
            report_loss += batch_losses_val
            cum_loss += batch_losses_val
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, '
                      'cum. examples %d, speed %.2f examples/sec, '
                      'time elapsed %.2f sec' %
                      (epoch, train_iter, report_loss / report_examples, cum_examples,
                       report_examples / (time.time() - train_time),
                       time.time() - begin_time),
                      file=sys.stderr)
                train_time = time.time()
                report_loss = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' %
                      (epoch, train_iter, cum_loss / cum_examples, cum_examples),
                      file=sys.stderr)
                cum_loss = cum_examples = 0.
                print('begin validation ...', file=sys.stderr)
                validation_loss = validation(
                    model, df_val, bert_size, cn_loss, device)  # dev batch size can be a bit larger
                print('validation: iter %d, loss %f' % (train_iter, validation_loss),
                      file=sys.stderr)

                is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
                hist_valid_scores.append(validation_loss)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)
                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)
                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)
                        # decay lr, and restore from previously best checkpoint
                        print('load previously best model and decay learning rate to %f%%' %
                              (float(args['--lr-decay']) * 100), file=sys.stderr)
                        # load model
                        params = torch.load(model_save_path,
                                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)
                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))
                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] *= float(args['--lr-decay'])
                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    exit(0)
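
# The training loop above calls a `validation` helper that is not shown. A minimal sketch
# under the assumption that it averages the weighted cross-entropy loss over the dev set
# using the same batch_iter interface as training:
def validation(model, df_val, bert_size, loss_func, device, batch_size=32):
    was_training = model.training
    model.eval()
    total_loss, total_examples = 0.0, 0
    with torch.no_grad():
        for sents, targets in batch_iter(df_val, batch_size=batch_size,
                                         shuffle=False, bert=bert_size):
            pre_softmax = model(sents)
            loss = loss_func(pre_softmax,
                             torch.tensor(targets, dtype=torch.long, device=device))
            total_loss += loss.item() * len(sents)
            total_examples += len(sents)
    if was_training:
        model.train()
    return total_loss / total_examples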
def single_train(config):
    start_time = time.asctime(time.localtime(time.time()))
    print(start_time)
    x_train = []
    y_train = []
    head_train = []
    relations = [
        "unknown", "Create", "Use", "Near", "Social", "Located", "Ownership",
        "General-Special", "Family", "Part-Whole"
    ]
    with open(config.train_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            # label = config.label_list.index(line.split(" ")[0].strip("__label__"))
            # sentence = "".join(line.lower().strip("\n").split(" ")[1:])
            _, head, tail, label, sentence = line.split("\t")
            if sentence:
                head_pos = (re.search(r'\[E11\]', sentence).span()[1],
                            re.search(r'\[E12\]', sentence).span()[0])
                # tail_pos = (re.search(r'\[E21\]', line).span()[1], re.search(r'\[E22\]', line).span()[0])
                sentence = sentence.replace('[E21]', '')
                sentence = sentence.replace('[E22]', '')
                x_train.append(sentence)
                y_train.append(relations.index(label))
                head_train.append(head_pos)
    print('handle data over.')
    torch_dataset = ListDataset(x=x_train, y=y_train, head=head_train)
    loader = Data.DataLoader(
        dataset=torch_dataset,
        batch_size=config.batch_size,  # batch size
        shuffle=True,                  # shuffle the sample order
        num_workers=4,                 # number of worker processes for data loading
    )
    # model = Model(config).to(config.device)
    model = SingleModel(config).to(config.device)
    optimizer = BertAdam(model.parameters(),
                         lr=config.lr,
                         warmup=0.05,
                         t_total=len(torch_dataset) * config.num_epoches)
    loss_func = torch.nn.CrossEntropyLoss()
    loss_li = []
    print_loss = 0
    for epoch in range(config.num_epoches):
        model.train()
        for step, (batch_texts, batch_span, batch_head_pos) in enumerate(loader):
            max_len = max([len(i) for i in batch_texts])
            x = config.tokenizer.batch_encode_plus(batch_texts,
                                                   add_special_tokens=True,
                                                   return_tensors="pt",
                                                   max_length=max_len,
                                                   pad_to_max_length=True)
            x["input_ids"] = x["input_ids"].to(config.device)
            # Build the attention mask from token_type_ids (note: batch_encode_plus already
            # returns an "attention_mask"; this recomputes it as 1 - token_type_ids).
            x["attention_mask"] = torch.abs(
                torch.ones(x["token_type_ids"].size(), dtype=torch.long).to(config.device)
                - x["token_type_ids"].to(config.device))
            x["token_type_ids"] = x["token_type_ids"].to(config.device)
            out = model(input_ids=x["input_ids"],
                        attention_mask=x["attention_mask"],
                        token_type_ids=x["token_type_ids"],
                        batch_head_pos=batch_head_pos)
            # print(loss)
            # print(torch.argmax(start[0]), torch.argmax(end[0]))
            optimizer.zero_grad()
            loss = loss_func(out, batch_span.to(config.device)).to(config.device)
            print_loss += loss.item()  # accumulate a float, not the graph-carrying tensor
            loss.backward()
            optimizer.step()
            if (step + 1) % 10 == 0:
                print("epoch:", epoch, "step:", step, "loss", print_loss / 10)
                # print(config.tokenizer.decode(x["input_ids"][1]))
                # print(x["input_ids"][1])
                # print(x["attention_mask"][1])
                # print(x["token_type_ids"][1])
                # print(batch_question_doc[0][0])
                # print(torch.argmax(start[0]), torch.argmax(end[0]))
                # print(config.tokenizer.decode(x["input_ids"][0][torch.argmax(start[0]):torch.argmax(end[0])]))
                # print('real', batch_span[0][0], batch_span[1][0])
                # print(config.tokenizer.decode(x["input_ids"][0][batch_span[0][0]: batch_span[1][0]]))
                loss_li.append(print_loss / 10)
                print_loss = 0
    model_path = '/usr/tdq/models/re/aliBert-Sanwen-10'
    torch.save(model, model_path)
    end_time = time.asctime(time.localtime(time.time()))
    print("start time:{}, end time:{}".format(start_time, end_time))
    return model_path
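
# single_train() wraps its lists in a ListDataset that is not shown above. A hypothetical
# minimal sketch: a thin Dataset over parallel lists of sentences, relation labels and
# head-entity spans, relying on the default collate to batch them.
import torch.utils.data as Data

class ListDataset(Data.Dataset):
    def __init__(self, x, y, head):
        self.x, self.y, self.head = x, y, head

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        # returns (sentence text, relation label id, (head start, head end))
        return self.x[idx], self.y[idx], self.head[idx]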