def train(args, train_dataset, model):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * 0.1, t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        keep_batchnorm_fp32 = False if args.fp16_opt_level == "O2" else True
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level,
                                          keep_batchnorm_fp32=keep_batchnorm_fp32)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = DDP(model, message_size=250000000,
                    gradient_predivide_factor=torch.distributed.get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs = 0
    model.zero_grad()
    model.train()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Train(XX Epoch) Step(X/X) (loss=X.X)",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)  # multi-gpu does scattering it-self
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            outputs = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
            loss = outputs  # this model returns the loss tensor directly
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule (after optimizer.step() per PyTorch >= 1.1)
                optimizer.zero_grad()
                global_step += 1
                epoch_iterator.set_description(
                    "Train(%d Epoch) Step(%d / %d) (loss=%5.5f)" % (epoch, global_step, t_total, loss.item()))

        if args.local_rank in [-1, 0]:
            model_checkpoint = 'korquad_{0}_{1}_{2}_{3}.bin'.format(
                args.learning_rate, args.train_batch_size, epochs, int(args.num_train_epochs))
            logger.info(model_checkpoint)
            output_model_file = os.path.join(args.output_dir, model_checkpoint)
            if args.n_gpu > 1 or args.local_rank != -1:
                logger.info("** ** * Saving file * ** ** (module)")
                torch.save(model.module.state_dict(), output_model_file)
            else:
                logger.info("** ** * Saving file * ** **")
                torch.save(model.state_dict(), output_model_file)
        epochs += 1
    logger.info("Training End!!!")
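# A minimal sketch (not part of the original script) of the update ordering used above:
# since PyTorch 1.1, optimizer.step() must precede scheduler.step(), otherwise the first
# value of the learning-rate schedule is skipped. All names here are illustrative.
import torch

def _step_order_sketch(model, loss, optimizer, scheduler, max_grad_norm=1.0):
    loss.backward()                                                     # accumulate gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)   # clip before stepping
    optimizer.step()                                                    # update weights first
    scheduler.step()                                                    # then advance the LR schedule
    optimizer.zero_grad()                                               # clear for the next step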
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None,
                   output_dir=None, max_seq_length=80, do_train=False, do_eval=False,
                   do_lower_case=False, train_batch_size=24, eval_batch_size=8,
                   learning_rate=2e-5, num_train_epochs=15, warmup_proportion=0.1,
                   no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1,
                   optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""):
    processors = {
        "mrpc": MrpcProcessor,
        "stance": StanceProcessor,
        "neg": NegProcessor,
        "tri": TriProcessor
    }

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if fp16:
            logger.info("16-bits training currently not supported in distributed training")
            fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1))

    if gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            gradient_accumulation_steps))
    train_batch_size = int(train_batch_size / gradient_accumulation_steps)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    if not do_train and not do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if do_train:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir, exist_ok=True)

    task_name = task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()

    # tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_examples = None
    num_train_steps = None
    if do_train:
        train_df = processor.get_train_df(data_dir)
        test_df = processor.get_test_df(data_dir)
        dev_df = processor.get_dev_df(data_dir)
        new_train_df = generate_opp_pers_dataset(train_df)
        new_train_df.to_csv(os.path.join(data_dir, "tri_train.tsv"), sep='\t', index=False)
        new_test_df = generate_opp_pers_dataset_with_naive(test_df)
        new_test_df.to_csv(os.path.join(data_dir, "tri_test.tsv"), sep='\t', index=False)
        new_dev_df = generate_opp_pers_dataset_with_naive(dev_df)
        new_dev_df.to_csv(os.path.join(data_dir, "tri_dev.tsv"), sep='\t', index=False)
        train_examples = processor.get_train_examples(data_dir)
        num_train_steps = int(len(train_examples) / train_batch_size /
                              gradient_accumulation_steps * num_train_epochs)

    # Prepare model
    model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2)
    model.to(device)
    if fp16:
        model.half()
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank],
                                                          output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    t_total = num_train_steps
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if do_train:
        optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate,
                             warmup=warmup_proportion, t_total=t_total)

    global_step = 0
    if do_train:
        claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("claims features done")
        train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer)
        logger.info("perspective features done")
        opposite_perspective_features = convert_triopp_pers_to_features(train_examples, label_list,
                                                                        max_seq_length, tokenizer)
        logger.info("opp perspective features done")
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        pers_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features], dtype=torch.long)

        train_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids,
                                   claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids,
                                   opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)
        if local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

        model.train()
        for _ in trange(int(num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            process_bar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(process_bar):
                batch = tuple(t.to(device) for t in batch)
                (input_ids, input_mask, segment_ids, label_ids,
                 claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids,
                 opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids) = batch
                out_results = model(input_ids=input_ids, token_type_ids=segment_ids,
                                    attention_mask=input_mask, labels=label_ids,
                                    input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids,
                                    attention_mask2=claim_input_mask, labels2=claim_label_ids,
                                    input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids,
                                    attention_mask3=opp_input_mask, labels3=opp_label_ids)
                loss = out_results
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if fp16 and loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * loss_scale
                if gradient_accumulation_steps > 1:
                    loss = loss / gradient_accumulation_steps
                process_bar.set_description("Loss: %0.8f" % (loss.sum().item()))
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % gradient_accumulation_steps == 0:
                    if fp16 or optimize_on_cpu:
                        if fp16 and loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                if param.grad is not None:
                                    param.grad.data = param.grad.data / loss_scale
                        is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(), test_nan=True)
                        if is_nan:
                            logger.info("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                            loss_scale = loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
            print("\nLoss: {}\n".format(tr_loss / nb_tr_steps))
        torch.save(model.state_dict(),
                   os.path.join(output_dir,
                                "with_naive_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch15.pth"))

    if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
        test_df = processor.get_test_df(data_dir)
        train_df = processor.get_train_df(data_dir)
        dev_df = processor.get_dev_df(data_dir)
        eval_examples = processor.get_test_examples(data_dir)
        claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        eval_features = convert_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer)
        opposite_eval_features = convert_triopp_pers_to_features(eval_examples, label_list,
                                                                 max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", eval_batch_size)
        pers_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        pers_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        pers_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        pers_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long)
        claims_input_mask = torch.tensor([f.input_mask for f in claim_features], dtype=torch.long)
        claims_segment_ids = torch.tensor([f.segment_ids for f in claim_features], dtype=torch.long)
        claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long)
        opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_eval_features], dtype=torch.long)
        opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_eval_features], dtype=torch.long)
        opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_eval_features], dtype=torch.long)

        eval_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids,
                                  claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids,
                                  opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

        model_state_dict = torch.load(saved_model)
        model = BertForConsistencyCueClassification.from_pretrained('bert-base-uncased', num_labels=2,
                                                                    state_dict=model_state_dict)
        model.to(device)
        model.eval()

        eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0
        distance_eval_tp, distance_eval_pred_c, distance_eval_gold_c = 0, 0, 0
        eval_loss, eval_accuracy, eval_macro_p, eval_macro_r = 0, 0, 0, 0
        distance_accuracy, distance_eval_macro_p, distance_eval_macro_r = 0, 0, 0
        raw_score = []
        predicted_labels = []
        distance_labels = []
        predicted_prob = []
        gold_labels = []
        nb_eval_steps, nb_eval_examples = 0, 0

        for (input_ids, input_mask, segment_ids, label_ids,
             claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids,
             opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids) in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            claim_input_ids = claim_input_ids.to(device)
            claim_input_mask = claim_input_mask.to(device)
            claim_segment_ids = claim_segment_ids.to(device)
            claim_label_ids = claim_label_ids.to(device)
            opp_input_ids = opp_input_ids.to(device)
            opp_input_mask = opp_input_mask.to(device)
            opp_segment_ids = opp_segment_ids.to(device)
            opp_label_ids = opp_label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids,
                                      attention_mask=input_mask, labels=label_ids,
                                      input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids,
                                      attention_mask2=claim_input_mask, labels2=claim_label_ids,
                                      input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids,
                                      attention_mask3=opp_input_mask, labels3=opp_label_ids)
                # A single label-free forward pass returns (classification logits, distance logits)
                outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids,
                                attention_mask2=claim_input_mask,
                                input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids,
                                attention_mask3=opp_input_mask)
                logits, distance_logits = outputs[0], outputs[1]

            logits = logits.detach().cpu().numpy()
            distance_logits = distance_logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            distance_eval_accuracy = accuracy(distance_logits, label_ids)
            tmp_predicted = np.argmax(logits, axis=1)
            distance_predicted = np.argmax(distance_logits, axis=1)
            predicted_labels.extend(tmp_predicted.tolist())
            distance_labels.extend(distance_predicted.tolist())
            gold_labels.extend(label_ids.tolist())

            # Micro F1 (aggregated tp, fp, fn counts across all examples)
            tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount(logits, label_ids)
            eval_tp += tmp_tp
            eval_pred_c += tmp_pred_c
            eval_gold_c += tmp_gold_c
            distance_tp, distance_pred_c, distance_gold_c = tp_pcount_gcount(distance_logits, label_ids)
            distance_eval_tp += distance_tp
            distance_eval_pred_c += distance_pred_c
            distance_eval_gold_c += distance_gold_c
            pred_label = np.argmax(logits, axis=1)
            distance_label = np.argmax(distance_logits, axis=1)
            raw_score += zip(logits, distance_logits, pred_label, distance_label, label_ids)

            # Macro F1 (averaged P, R across mini batches)
            tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids)
            eval_macro_p += tmp_eval_p
            eval_macro_r += tmp_eval_r
            distance_eval_p, distance_eval_r, distance_eval_f1 = p_r_f1(distance_logits, label_ids)
            distance_eval_macro_p += distance_eval_p
            distance_eval_macro_r += distance_eval_r

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            distance_accuracy += distance_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Micro F1 (aggregated tp, fp, fn counts across all examples)
        eval_micro_p = eval_tp / eval_pred_c
        eval_micro_r = eval_tp / eval_gold_c
        eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r)
        distance_eval_micro_p = distance_eval_tp / distance_eval_pred_c
        distance_eval_micro_r = distance_eval_tp / distance_eval_gold_c
        distance_eval_micro_f1 = (2 * distance_eval_micro_p * distance_eval_micro_r /
                                  (distance_eval_micro_p + distance_eval_micro_r))

        # Macro F1 (averaged P, R across mini batches)
        eval_macro_p = eval_macro_p / nb_eval_steps
        eval_macro_r = eval_macro_r / nb_eval_steps
        eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r)
        distance_eval_macro_p = distance_eval_macro_p / nb_eval_steps
        distance_eval_macro_r = distance_eval_macro_r / nb_eval_steps
        distance_eval_macro_f1 = (2 * distance_eval_macro_p * distance_eval_macro_r /
                                  (distance_eval_macro_p + distance_eval_macro_r))
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        distance_accuracy = distance_accuracy / nb_eval_examples
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'eval_micro_p': eval_micro_p,
            'eval_micro_r': eval_micro_r,
            'eval_micro_f1': eval_micro_f1,
            'eval_macro_p': eval_macro_p,
            'eval_macro_r': eval_macro_r,
            'eval_macro_f1': eval_macro_f1,
            'distance_accuracy': distance_accuracy,
            'distance_eval_micro_p': distance_eval_micro_p,
            'distance_eval_micro_r': distance_eval_micro_r,
            'distance_eval_micro_f1': distance_eval_micro_f1,
            'distance_eval_macro_p': distance_eval_macro_p,
            'distance_eval_macro_r': distance_eval_macro_r,
            'distance_eval_macro_f1': distance_eval_macro_f1
        }
        output_eval_file = os.path.join(
            output_dir,
            "time_pers_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch25_eval_results.txt")
        output_raw_score = os.path.join(
            output_dir,
            "time_pers_distance_concat_margin1_costriplet_cos_siamese_bs24_lr2e_5_epoch25_raw_score.csv")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
        with open(output_raw_score, 'w') as fout:
            fields = ["undermine_score", "support_score", "cp_distance", "cop_distance",
                      "predict_label", "distance_label", "gold"]
            writer = csv.DictWriter(fout, fieldnames=fields)
            writer.writeheader()
            for score, distance, pred, distance_pred, gold in raw_score:
                writer.writerow({
                    "undermine_score": str(score[0]),
                    "support_score": str(score[1]),
                    "cp_distance": str(distance[0]),
                    "cop_distance": str(distance[1]),
                    "predict_label": str(pred),
                    "distance_label": str(distance_pred),
                    "gold": str(gold)
                })
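# Hedged sketch of the micro- vs. macro-F1 bookkeeping used in the evaluation above.
# tp_pcount_gcount / p_r_f1 are project helpers not shown here; this standalone version
# assumes binary labels with class 1 as the positive class and mirrors the aggregation.
import numpy as np

def micro_macro_f1(batch_logits, batch_labels):
    tp = pred_c = gold_c = 0          # micro: pool counts over all batches
    macro_p = macro_r = steps = 0.0   # macro: average P/R over batches
    for logits, labels in zip(batch_logits, batch_labels):
        preds = np.argmax(logits, axis=1)
        hits = int(np.sum((preds == 1) & (labels == 1)))
        tp += hits
        pred_c += int(np.sum(preds == 1))
        gold_c += int(np.sum(labels == 1))
        macro_p += hits / max(np.sum(preds == 1), 1)
        macro_r += hits / max(np.sum(labels == 1), 1)
        steps += 1
    micro_p, micro_r = tp / pred_c, tp / gold_c
    micro_f1 = 2 * micro_p * micro_r / (micro_p + micro_r)
    macro_p, macro_r = macro_p / steps, macro_r / steps
    macro_f1 = 2 * macro_p * macro_r / (macro_p + macro_r)
    return micro_f1, macro_f1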
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter("./runs2/distilbert_align/")

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    # warmup_steps and save_steps are given as fractions of the total step count
    warm_up_steps = int(args.warmup_steps * t_total)
    save_steps = int(args.save_steps * t_total)

    # Prepare optimizer and schedule (linear warmup and decay). Newly initialized
    # classifier/gating parameters get a higher learning rate (2e-3) than the
    # pretrained encoder parameters (args.learning_rate).
    no_decay = ['bias', 'LayerNorm.weight']
    a = []  # new parameters, no weight decay
    b = []  # new parameters, with weight decay
    c = []  # pretrained parameters, no weight decay
    d = []  # pretrained parameters, with weight decay
    for n, p in model.named_parameters():
        if 'classifier' in n or 'linear_r' in n or 'linear_g' in n:
            if any(nd in n for nd in no_decay):
                a.append(p)
            else:
                b.append(p)
        else:
            if any(nd in n for nd in no_decay):
                c.append(p)
            else:
                d.append(p)
    optimizer_grouped_parameters = [
        {"params": a, "weight_decay": 0, "lr": 2e-3},
        {"params": b, "weight_decay": args.weight_decay, "lr": 2e-3},
        {"params": c, "weight_decay": 0},
        {"params": d, "weight_decay": args.weight_decay},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warm_up_steps, t_total=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'align_mask': batch[2],
                'labels': batch[4]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr_n', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('lr_o', scheduler.get_lr()[2], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    if args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step
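# A minimal sketch (illustrative, not from the original repo) of the discriminative
# learning-rate trick used above: newly initialized head parameters carry their own
# "lr" key in the parameter group, while pretrained weights fall back to the optimizer
# default. head_keywords/head_lr/base_lr are assumed names for this sketch.
from torch.optim import AdamW  # torch >= 1.2; transformers also ships its own AdamW

def build_param_groups(model, head_keywords=('classifier',), head_lr=2e-3,
                       base_lr=5e-5, weight_decay=0.01):
    head, base = [], []
    for name, param in model.named_parameters():
        (head if any(k in name for k in head_keywords) else base).append(param)
    return AdamW([
        {'params': head, 'lr': head_lr, 'weight_decay': weight_decay},  # new layers: large lr
        {'params': base, 'weight_decay': weight_decay},                 # encoder: default lr
    ], lr=base_lr)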
def provider(
        data_folder,
        df_path,
        phase,
        fold=0,
        mean=None,
        std=None,
        batch_size=8,
        num_workers=4,
):
    '''Returns dataloader for the model training'''
    df = pd.read_csv(df_path)
    # some preprocessing
    # https://www.kaggle.com/amanooo/defect-detection-starter-u-net
    df['ImageId'], df['ClassId'] = zip(*df['ImageId_ClassId'].str.split('_'))
    df['ClassId'] = df['ClassId'].astype(int)
    df = df.pivot(index='ImageId', columns='ClassId', values='EncodedPixels')
    df['defects'] = df.count(axis=1)

    # TODO: split. Random 5-fold split instead of a stratified one:
    # train_df, val_df = train_test_split(df, test_size=0.1, stratify=df["defects"])
    num_data = df.shape[0]
    df['group'] = np.random.randint(5, size=num_data)
    train_df = df[df['group'] != fold]
    val_df = df[df['group'] == fold]
    # added by zhang ge: when training, rows with no RLE data could be dropped
    # via train_df[train_df['defects'] > 0]
    df = train_df if phase == "train" else val_df
    image_dataset = TrainvalDataset(df, data_folder, mean, std, phase)

    # define sampler; weights of each class: [2.2, 14, 52, 2.5, 16]
    resample = False  # TODO: SET RESAMPLE
    if phase == 'train' and resample:
        print("Defining Sampler......")
        class_weights = torch.Tensor([2.2, 14, 52, 2.5, 16]).cuda()  # to gpu
        # class_weights = torch.sqrt(class_weights)
        print(class_weights)
        size = len(image_dataset)
        sample_targets = torch.zeros(size, dtype=torch.int64).cuda()  # to gpu
        for idx in tqdm(range(size), total=size):
            _, mask = make_mask(idx, df)  # [256, 1600, 4]
            sum_cls = mask.sum(0).sum(0)
            if sum_cls.sum() == 0:  # bg
                sample_targets[idx] = 0
            else:  # fg
                # selection order: class 2, 4, 1, 3 -> sum_cls indices 1, 3, 0, 2
                if sum_cls[1] != 0:
                    sample_targets[idx] = 2
                elif sum_cls[3] != 0:
                    sample_targets[idx] = 4
                elif sum_cls[0] != 0:
                    sample_targets[idx] = 1
                elif sum_cls[2] != 0:
                    sample_targets[idx] = 3
        sample_weights = class_weights[sample_targets]
        assert sample_weights.shape[0] == size
        sampler = WeightedRandomSampler(weights=sample_weights, num_samples=size)
    elif phase == 'train' and not resample:
        sampler = RandomSampler(image_dataset)
    else:  # phase != 'train'
        sampler = SequentialSampler(image_dataset)

    dataloader = DataLoader(
        image_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=False,
    )
    return dataloader
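# Hedged sketch of the WeightedRandomSampler pattern used in provider(): each sample is
# drawn with probability proportional to its class weight, so rare defect classes are
# seen more often. The example weights below are illustrative.
import torch
from torch.utils.data import WeightedRandomSampler

def make_weighted_sampler(sample_targets, class_weights):
    # sample_targets: LongTensor of per-sample class ids, e.g. tensor([0, 2, 1, ...])
    # class_weights: FloatTensor of per-class weights, e.g. tensor([2.2, 14, 52, 2.5, 16])
    sample_weights = class_weights[sample_targets]  # per-sample weight lookup
    return WeightedRandomSampler(weights=sample_weights,
                                 num_samples=len(sample_weights),
                                 replacement=True)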
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    parser.add_argument("--output", default=None, type=str, required=True,
                        help="Path to processed datasets and export files")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default="ner", type=str,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    args.data_dir = os.path.join(args.output, 'nerdata')
    args.output_dir = os.path.join(args.output, 'nermodel')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                         .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps
        ) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       cache_dir=cache_dir,
                                                       num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(args.output_dir, "model_config.json"), "w"))
    else:
        # Load a trained model and config that you have fine-tuned
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)
            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()
            for i, mask in enumerate(input_mask):
                temp_1 = []
                temp_2 = []
                for j, m in enumerate(mask):
                    if j == 0:
                        continue  # skip [CLS]
                    if m:
                        if label_map[label_ids[i][j]] != "X":
                            temp_1.append(label_map[label_ids[i][j]])
                            temp_2.append(label_map[logits[i][j]])
                    else:
                        # drop the trailing [SEP] token and stop at the padding boundary
                        temp_1.pop()
                        temp_2.pop()
                        break
                y_true.append(temp_1)
                y_pred.append(temp_2)
        report = classification_report(y_true, y_pred, digits=4)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
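# For reference, a sketch of the warmup_linear() schedule assumed by the fp16 branch in
# main() above; this mirrors the helper shipped with pytorch_pretrained_bert, where x is
# the fraction of training completed. Treat it as a reference, not the imported code.
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup  # linear ramp-up during the warmup fraction
    return 1.0 - x         # then linear decay to zero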
def main(config):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_dataset = CNN_LSTM_Triplet_Dataset(config)
    print(len(train_dataset))
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset,
                              batch_size=config["model"]["batch_size"],
                              sampler=train_sampler)

    model = CNN_LSTM(config).double().to(device=device)
    # if True:
    #     model.load_state_dict(torch.load("./best_model_eval/model_acc_83.3276.pth"))

    # optimizer = torch.optim.Adam(lr=1e-5, betas=(0.9, 0.98), eps=1e-9)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
    optimizer = torch.optim.Adam(lr=1e-2, betas=(0.9, 0.98), eps=1e-9,
                                 params=optimizer_grouped_parameters)
    loss_fn = nn.TripletMarginLoss()

    acc_dev_previous = 0
    loss_dev_previous = 1000
    loss_train = 1000
    # dev_loader and dev_dataset are assumed to be defined at module scope
    for epoch in range(1, config["model"]["epoch"] + 1):
        print(f"Training epoch {epoch}")
        loss_train = train(train_loader, model, optimizer, loss_fn, loss_train, epoch, device)
        print("Evaluate model.............")
        acc_dev, loss_dev = evaluate(dev_loader, model, loss_fn, epoch, len(dev_dataset), device)
        print(f"Accuracy score: {acc_dev:.4f} at epoch {epoch}")
        print(f"Loss Dev: {loss_dev:.4f} at epoch {epoch}")
        print("=" * 15, f"END EPOCH {epoch}", "=" * 15)
        if acc_dev > acc_dev_previous:
            acc_dev_previous = acc_dev
            torch.save(model.state_dict(),
                       f"./best_model_eval/model_acc_{round(acc_dev, 4)}.pth")
        if loss_dev < loss_dev_previous:
            loss_dev_previous = loss_dev
            torch.save(model.state_dict(),
                       f"./best_model_eval/model_loss_{round(loss_dev, 4)}.pth")
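# A small, self-contained sketch of nn.TripletMarginLoss as used in main() above: the
# loss pushes the anchor closer to the positive than to the negative by a margin.
# Shapes, margin, and the demo function name are illustrative.
import torch
import torch.nn as nn

def _triplet_loss_demo():
    loss_fn = nn.TripletMarginLoss(margin=1.0)
    anchor = torch.randn(8, 128)    # embeddings from an encoder, e.g. CNN_LSTM
    positive = torch.randn(8, 128)  # same-class examples
    negative = torch.randn(8, 128)  # different-class examples
    return loss_fn(anchor, positive, negative)  # scalar loss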
def probe(args: Namespace, probing_model: BaseDecoder, tokenizer: AutoTokenizer,
          dataset_args, layer: int, vocab):
    write_to_execution_log(100 * '+' + '\t Probing layer: {} \t'.format(layer) + 100 * '+',
                           append_newlines=True, path=args.execution_log)
    print('################################## Layer: {} #########################################'.format(layer))
    layer_data = {}
    google_re_metrices = []
    trex_metrices = []
    my_collate = functools.partial(probing_model.cloze_collate, tokenizer=tokenizer)
    print('$$$$$$$$$$$$$$$$$$$$$$$ Probing model of type: {} $$$$$$$$$$$$$$$$$$$$$$$$'.format(args.model_type))
    for ele in dataset_args:
        ds_name, relation_args_list = ele
        layer_data[ds_name] = {}
        layer_data[ds_name]['means'] = []
        print('***************** {} **********************'.format(ds_name))
        for relation_args in relation_args_list:
            relation_args = DotMap(relation_args)
            args.relation_args = relation_args
            print('---------------- {} ----------------------'.format(args.relation_args.template))
            print(stringify_dotmap(args.relation_args))
            layer_data[ds_name][args.relation_args.relation] = []
            dataset = ClozeDataset(probing_model, tokenizer, args, vocab,
                                   tokenizer.model_max_length, output_debug_info=False)
            # Create dataloader
            sampler = RandomSampler(dataset)
            dataloader = DataLoader(dataset, sampler=sampler,
                                    batch_size=args.probing_batch_size, collate_fn=my_collate)
            metrics_elements = []
            for _, batch in enumerate(tqdm(dataloader)):
                metrics_elements_from_batch = probing_model.probe(batch, layer=layer,
                                                                  relation_args=relation_args)
                metrics_elements.extend(metrics_elements_from_batch)
                gc.collect()
            print('Number metrics elements: {}'.format(len(metrics_elements)))
            aggregated_metrics = aggregate_metrics_elements(metrics_elements)
            print('Aggregated: {}'.format(aggregated_metrics['P_AT_1']))
            if ds_name == 'Google_RE':
                google_re_metrices.append(aggregated_metrics)
            elif ds_name == 'TREx':
                trex_metrices.append(aggregated_metrics)
            layer_data[ds_name][args.relation_args.relation].append(aggregated_metrics)
            write_to_execution_log(ds_name + ': ' + args.relation_args.relation + '\t' +
                                   str(aggregated_metrics['P_AT_1']),
                                   append_newlines=True, path=args.execution_log)
    # Write results to logfile
    if len(google_re_metrices) > 0:
        write_to_execution_log('\n\nGoogle_RE: ' + mean_precisions(google_re_metrices),
                               append_newlines=True, path=args.execution_log)
        layer_data['Google_RE']['means'].append(mean_precisions(google_re_metrices))
    if len(trex_metrices) > 0:
        write_to_execution_log('Trex: ' + mean_precisions(trex_metrices),
                               append_newlines=True, path=args.execution_log)
        layer_data['TREx']['means'].append(mean_precisions(trex_metrices))
    write_to_execution_log(220 * '-', append_newlines=True, path=args.execution_log)
    if args.use_wandb_logging:
        wandb.init(name=args.wandb_run_name, project=args.wandb_project_name,
                   settings=wandb.Settings(start_method='thread'))
        wandb_log_metrics(layer_data, layer)
    return layer_data
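# Hedged sketch of the precision-at-1 ('P_AT_1') statistic aggregated in probe() above:
# for each cloze query, check whether the top-ranked vocabulary prediction equals the
# gold token. The function name and array layout are assumptions for illustration.
import numpy as np

def precision_at_1(pred_scores, gold_ids):
    # pred_scores: (num_queries, vocab_size) array of scores; gold_ids: (num_queries,)
    top1 = np.argmax(pred_scores, axis=1)
    return float(np.mean(top1 == gold_ids))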
def run(datasets, seed=42): ## META VARIABLES random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False epochs = 20 batch_size = 32 X_train, mask_train, y_train = load_data(datasets['train'], num_sample=int(625*0.7)) X_val, mask_val, y_val = load_data(datasets['val'], num_sample=int(625*0.1)) X_test, mask_test, y_test = load_data(datasets['test'], num_sample=int(625*0.2)) train_dataset = TensorDataset(X_train, mask_train, y_train) val_dataset = TensorDataset(X_val, mask_val, y_val) test_dataset = TensorDataset(X_test, mask_test, y_test) # Dataloading train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size) validation_dataloader = DataLoader(val_dataset, sampler=RandomSampler(val_dataset), batch_size=batch_size) prediction_sampler = SequentialSampler(test_dataset) test_dataloader = DataLoader(test_dataset, sampler=prediction_sampler, batch_size=batch_size) model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels = 3).to(device) optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-5) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs) total_t0 = time.time() best_state_dict = None best_val = 0 for epoch_i in range(epochs): # ======================================== # Training # ======================================== print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() total_train_loss = 0 model.train() predictions_train = np.array([]) true_label_train = np.array([]) for step, batch in enumerate(train_dataloader): # Progress update every 50 batches. if step % 50 == 0 and not step == 0: elapsed = format_time(time.time() - t0) print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) model.zero_grad() loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) total_train_loss += loss.item() * b_labels.shape[0] loss.backward() optimizer.step() scheduler.step() gc.collect() logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions_train = np.append(predictions_train, np.argmax(logits, axis=1).flatten()) true_label_train = np.append(true_label_train, label_ids) # Calculate the average loss over all of the batches. 
accuracy_train = np.sum(predictions_train == true_label_train) / true_label_train.shape[0] f1_macro_train = f1_score(true_label_train, predictions_train, average='macro') f1_micro_train = f1_score(true_label_train, predictions_train, average='micro') print("\n Training Accuracy: {0:.2f}".format(accuracy_train)) print(" Training F1-MACRO: {0:.2f}".format(f1_macro_train)) print(" Training F1-MICRO: {0:.2f}".format(f1_micro_train)) avg_train_loss = total_train_loss / true_label_train.shape[0] training_time = format_time(time.time() - t0) print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epcoh took: {:}".format(training_time)) # ======================================== # Validation # ======================================== print("\nRunning Validation...") t0 = time.time() model.eval() total_val_loss = 0 predictions_val = np.array([]) true_label_val = np.array([]) for batch in validation_dataloader: b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) with torch.no_grad(): (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) total_val_loss += loss.item() * b_labels.shape[0] logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions_val = np.append(predictions_val, np.argmax(logits, axis=1).flatten()) true_label_val = np.append(true_label_val, label_ids) accuracy_val = np.sum(predictions_val == true_label_val) / true_label_val.shape[0] f1_macro_val = f1_score(true_label_val, predictions_val, average='macro') f1_micro_val = f1_score(true_label_val, predictions_val, average='micro') print(" Accuracy: {0:.2f}".format(accuracy_val)) print(" F1-MACRO: {0:.2f}".format(f1_macro_val)) print(" F1-MICRO: {0:.2f}".format(f1_micro_val)) performance_metric = f1_macro_val if performance_metric > best_val: print("Best Model Updated.") best_val = performance_metric best_state_dict = model.state_dict() avg_val_loss = total_val_loss / true_label_val.shape[0] validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) print("\nTraining complete!") print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0))) # ======================================== # Test # ======================================== model.load_state_dict(best_state_dict) model.eval() predictions_test = np.array([]) true_label_test = np.array([]) for batch in test_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = outputs[0] logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions_test = np.append(predictions_test, np.argmax(logits, axis=1).flatten()) true_label_test = np.append(true_label_test, label_ids) best_accr = np.sum(predictions_test == true_label_test) / true_label_test.shape[0] best_macro_f1 = f1_score(true_label_test, predictions_test, average='macro') best_micro_f1 = f1_score(true_label_test, predictions_test, average='micro') best_confusion_matrix = confusion_matrix(true_label_test, predictions_test) print(" Test Accuracy: {0:.2f}".format(best_accr)) print(" Test F1-MACRO: {0:.2f}".format(best_macro_f1)) print(" Test F1-MICRO: {0:.2f}".format(best_micro_f1)) # ======================================== # Dummy Test # ======================================== X_train = 
X_train.detach().cpu().numpy() X_test = X_test.detach().cpu().numpy() y_train = y_train.detach().cpu().numpy().squeeze(1) y_test = y_test.detach().cpu().numpy().squeeze(1) dummy_clf = DummyClassifier(strategy="uniform") dummy_clf.fit(X_train, y_train) predictions_dummy = dummy_clf.predict(X_test) dummy_accr = np.sum(predictions_dummy == y_test) / y_test.shape[0] dummy_macro_f1 = f1_score(y_test, predictions_dummy, average='macro') dummy_micro_f1 = f1_score(y_test, predictions_dummy, average='micro') print(" Dummy Accuracy: {0:.2f}".format(dummy_accr)) print(" Dummy F1-MACRO: {0:.2f}".format(dummy_macro_f1)) print(" Dummy F1-MICRO: {0:.2f}".format(dummy_micro_f1)) # torch.save(best_state_dict, './output/best_model_kaushik_sample.pt') return { 'seed': seed, 'best_accr': best_accr, 'best_macro_f1': best_macro_f1, 'best_micro_f1': best_micro_f1, 'dummy_accr': dummy_accr, 'dummy_macro_f1': dummy_macro_f1, 'dummy_micro_f1': dummy_micro_f1 }
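# run() above calls a format_time() helper that is not defined in this file.
# A minimal sketch of the assumed behavior (hypothetical; the original helper
# may differ):
import datetime

def format_time(elapsed):
    """Round an elapsed duration in seconds and render it as h:mm:ss."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))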
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) def collate(examples): model.eval() examples_src, examples_tgt, examples_srctgt, examples_tgtsrc, langid_srctgt, langid_tgtsrc, psi_examples_srctgt, psi_labels = [], [], [], [], [], [], [], [] src_len = tgt_len = 0 bpe2word_map_src, bpe2word_map_tgt = [], [] for example in examples: end_id = example[0][0][-1].view(-1) src_id = example[0][0][:args.block_size] src_id = torch.cat([src_id[:-1], end_id]) tgt_id = example[1][0][:args.block_size] tgt_id = torch.cat([tgt_id[:-1], end_id]) half_block_size = int(args.block_size / 2) half_src_id = example[0][0][:half_block_size] half_src_id = torch.cat([half_src_id[:-1], end_id]) half_tgt_id = example[1][0][:half_block_size] half_tgt_id = torch.cat([half_tgt_id[:-1], end_id]) examples_src.append(src_id) examples_tgt.append(tgt_id) src_len = max(src_len, len(src_id)) tgt_len = max(tgt_len, len(tgt_id)) srctgt = torch.cat([half_src_id, half_tgt_id]) langid = torch.cat([ torch.ones_like(half_src_id), torch.ones_like(half_tgt_id) * 2 ]) examples_srctgt.append(srctgt) langid_srctgt.append(langid) tgtsrc = torch.cat([half_tgt_id, half_src_id]) langid = torch.cat([ torch.ones_like(half_tgt_id), torch.ones_like(half_src_id) * 2 ]) examples_tgtsrc.append(tgtsrc) langid_tgtsrc.append(langid) # [neg, neg] pair neg_half_src_id = example[-2][0][:half_block_size] neg_half_src_id = torch.cat([neg_half_src_id[:-1], end_id]) neg_half_tgt_id = example[-1][0][:half_block_size] neg_half_tgt_id = torch.cat([neg_half_tgt_id[:-1], end_id]) if random.random() > 0.5: neg_srctgt = torch.cat([neg_half_src_id, neg_half_tgt_id]) else: neg_srctgt = torch.cat([neg_half_tgt_id, neg_half_src_id]) psi_examples_srctgt.append(neg_srctgt) psi_labels.append(1) # [pos, neg] pair rd = random.random() if rd > 0.75: neg_srctgt = torch.cat([half_src_id, neg_half_tgt_id]) elif rd > 0.5: neg_srctgt = torch.cat([neg_half_src_id, half_tgt_id]) elif rd > 0.25: neg_srctgt = torch.cat([half_tgt_id, neg_half_src_id]) else: neg_srctgt = torch.cat([neg_half_tgt_id, half_src_id]) psi_examples_srctgt.append(neg_srctgt) psi_labels.append(0) bpe2word_map_src.append(example[2]) bpe2word_map_tgt.append(example[3]) examples_src = pad_sequence(examples_src, batch_first=True, padding_value=tokenizer.pad_token_id) examples_tgt = pad_sequence(examples_tgt, batch_first=True, padding_value=tokenizer.pad_token_id) examples_srctgt = pad_sequence(examples_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) langid_srctgt = pad_sequence(langid_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) examples_tgtsrc = pad_sequence(examples_tgtsrc, batch_first=True, padding_value=tokenizer.pad_token_id) langid_tgtsrc = pad_sequence(langid_tgtsrc, batch_first=True, padding_value=tokenizer.pad_token_id) psi_examples_srctgt = pad_sequence( psi_examples_srctgt, batch_first=True, padding_value=tokenizer.pad_token_id) psi_labels = torch.tensor(psi_labels) guides = model.get_aligned_word( examples_src, examples_tgt, bpe2word_map_src, bpe2word_map_tgt, args.device, src_len, tgt_len, align_layer=args.align_layer, extraction=args.extraction, softmax_threshold=args.softmax_threshold) return examples_src, examples_tgt, guides, examples_srctgt, langid_srctgt, examples_tgtsrc, langid_tgtsrc, psi_examples_srctgt, psi_labels train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else 
DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.max_steps > 0 and args.max_steps < t_total: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if (not (any(nd in n for nd in no_decay))) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if ((any(nd in n for nd in no_decay))) ], "weight_decay": 0.0 }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 # Check if continuing training from a checkpoint tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() set_seed(args) # Added here for reproducibility def backward_loss(loss, tot_loss): if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tot_loss += loss.item() if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() return tot_loss tqdm_iterator = trange(int(t_total), desc="Iteration", disable=args.local_rank not in [-1, 0]) for _ in range(int(args.num_train_epochs)): for step, batch in enumerate(train_dataloader): model.train() if args.train_so or args.train_co: inputs_src, inputs_tgt = batch[0].clone(), batch[1].clone() inputs_src, inputs_tgt = inputs_src.to( args.device), inputs_tgt.to(args.device) attention_mask_src, attention_mask_tgt = (inputs_src != 0), (inputs_tgt != 0) guide = batch[2].to(args.device) loss = model(inputs_src=inputs_src, inputs_tgt=inputs_tgt, attention_mask_src=attention_mask_src, attention_mask_tgt=attention_mask_tgt, guide=guide, align_layer=args.align_layer, extraction=args.extraction, softmax_threshold=args.softmax_threshold, train_so=args.train_so, train_co=args.train_co) tr_loss = backward_loss(loss, tr_loss) if args.train_mlm: inputs_src, labels_src = mask_tokens(batch[0], tokenizer, args) inputs_tgt, labels_tgt = mask_tokens(batch[1], tokenizer, args) inputs_src, inputs_tgt = inputs_src.to( args.device), inputs_tgt.to(args.device) labels_src, labels_tgt = labels_src.to( args.device), labels_tgt.to(args.device) loss = model(inputs_src=inputs_src, labels_src=labels_src) tr_loss = backward_loss(loss, tr_loss) loss = model(inputs_src=inputs_tgt, labels_src=labels_tgt) tr_loss = backward_loss(loss, tr_loss) if args.train_tlm: rand_ids = [0, 1] if not args.train_tlm_full: rand_ids = [int(random.random() > 0.5)] for rand_id in rand_ids: select_srctgt = batch[int(3 + rand_id * 2)] select_langid = batch[int(4 + rand_id * 2)] for lang_id in [1, 2]: inputs_srctgt, labels_srctgt = mask_tokens( select_srctgt, tokenizer, args, select_langid, lang_id) inputs_srctgt, labels_srctgt = inputs_srctgt.to( args.device), labels_srctgt.to(args.device) loss = model(inputs_src=inputs_srctgt, labels_src=labels_srctgt) tr_loss = backward_loss(loss, tr_loss) if args.train_psi: loss = model(inputs_src=batch[7].to(args.device), labels_psi=batch[8].to(args.device), align_layer=args.align_layer + 1) tr_loss = backward_loss(loss, tr_loss) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 tqdm_iterator.update() if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: logger.info( " Step %s. 
Training loss = %s", str(global_step), str((tr_loss - logging_loss) / args.logging_steps)) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if global_step > t_total: break if global_step > t_total: break return global_step, tr_loss / global_step
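# backward_loss() in the train() above folds multi-GPU loss averaging,
# gradient-accumulation scaling, and optional fp16 into one helper. Stripped
# of those options, the core update pattern the loop relies on looks like the
# sketch below (illustrative only; accumulate_and_step is not part of the
# original script):
import torch

def accumulate_and_step(model, optimizer, scheduler, loss, step,
                        accumulation_steps, max_grad_norm):
    # Scale so gradients from accumulated micro-batches average rather than sum.
    (loss / accumulation_steps).backward()
    if (step + 1) % accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # advance the LR schedule once per effective batch
        model.zero_grad()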
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--model_name", default='GBert-predict', type=str, required=False, help="model name") parser.add_argument("--data_dir", default='./data/data_v1', type=str, required=False, help="The input data dir.") parser.add_argument("--pretrain_dir", default='./saved/GBert-pretraining', type=str, required=False, help="pretraining model") parser.add_argument("--train_file", default='data-multi-visit.pkl', type=str, required=False, help="training data file.") parser.add_argument( "--output_dir", default='./saved/', type=str, required=False, help="The output directory where the model checkpoints will be written." ) # Other parameters parser.add_argument("--use_pretrain", default=True, action='store_true', help="is use pretrain") parser.add_argument("--graph", default=True, action='store_true', help="if use ontology embedding") parser.add_argument("--therhold", default=0.25, type=float, help="therhold.") parser.add_argument( "--max_seq_length", default=55, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run on the dev set.") # parser.add_argument("--do_test", # default=True, # action='store_true', # help="Whether to run on the test set.") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=50.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=1203, help="random seed for initialization") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") args = parser.parse_args() args.output_dir = os.path.join(args.output_dir, args.model_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) device = torch.device( "cuda:1" if torch.cuda.is_available() and not args.no_cuda else "cpu") if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError( # "Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) print("Loading Dataset") tokenizer, (train_dataset, eval_dataset) = load_dataset(args) train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=1) eval_dataloader = DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=1) print('Loading Model: ' + args.model_name) # config = BertConfig(vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx), side_len=train_dataset.side_len) # config.graph = args.graph # model = SeperateBertTransModel(config, tokenizer.dx_voc, tokenizer.rx_voc) if args.use_pretrain: logger.info("Use Pretraining model") model = GBERT_Predict.from_pretrained(args.pretrain_dir, tokenizer=tokenizer) else: config = BertConfig( vocab_size_or_config_json_file=len(tokenizer.vocab.word2idx)) config.graph = args.graph model = GBERT_Predict(config, tokenizer) logger.info('# of model parameters: ' + str(get_n_params(model))) model.to(device) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self rx_output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # Prepare optimizer # num_train_optimization_steps = int( # len(train_dataset) / args.train_batch_size) * args.num_train_epochs # param_optimizer = list(model.named_parameters()) # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # optimizer_grouped_parameters = [ # {'params': [p for n, p in param_optimizer if not any( # nd in n for nd in no_decay)], 'weight_decay': 0.01}, # {'params': [p for n, p in param_optimizer if any( # nd in n for nd in no_decay)], 'weight_decay': 0.0} # ] # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=num_train_optimization_steps) optimizer = Adam(model.parameters(), lr=args.learning_rate) global_step = 0 if args.do_train: writer = SummaryWriter(args.output_dir) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", 1) dx_acc_best, rx_acc_best = 0, 0 acc_name = 'prauc' dx_history = {'prauc': []} rx_history = {'prauc': []} for epoch in trange(int(args.num_train_epochs), desc="Epoch"): print('') tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 prog_iter = tqdm(train_dataloader, leave=False, desc='Training') model.train() for _, batch in enumerate(prog_iter): batch = tuple(t.to(device) for t in batch) input_ids, dx_labels, rx_labels = batch input_ids, dx_labels, rx_labels = input_ids.squeeze( dim=0), dx_labels.squeeze(dim=0), rx_labels.squeeze(dim=0) loss, rx_logits = model(input_ids, dx_labels=dx_labels, rx_labels=rx_labels, epoch=global_step) loss.backward() tr_loss += loss.item() nb_tr_examples += 1 nb_tr_steps += 1 # Display loss prog_iter.set_postfix(loss='%.4f' % (tr_loss / nb_tr_steps)) optimizer.step() optimizer.zero_grad() writer.add_scalar('train/loss', tr_loss / nb_tr_steps, global_step) 
global_step += 1 if args.do_eval: print('') logger.info("***** Running eval *****") model.eval() dx_y_preds = [] dx_y_trues = [] rx_y_preds = [] rx_y_trues = [] for eval_input in tqdm(eval_dataloader, desc="Evaluating"): eval_input = tuple(t.to(device) for t in eval_input) input_ids, dx_labels, rx_labels = eval_input input_ids, dx_labels, rx_labels = input_ids.squeeze( dim=0), dx_labels.squeeze(dim=0), rx_labels.squeeze( dim=0) with torch.no_grad(): loss, rx_logits = model(input_ids, dx_labels=dx_labels, rx_labels=rx_labels) rx_y_preds.append(t2n(torch.sigmoid(rx_logits))) rx_y_trues.append(t2n(rx_labels)) # dx_y_preds.append(t2n(torch.sigmoid(dx_logits))) # dx_y_trues.append( # t2n(dx_labels.view(-1, len(tokenizer.dx_voc.word2idx)))) # rx_y_preds.append(t2n(torch.sigmoid(rx_logits))[ # :, tokenizer.rx_singe2multi]) # rx_y_trues.append( # t2n(rx_labels)[:, tokenizer.rx_singe2multi]) print('') # dx_acc_container = metric_report(np.concatenate(dx_y_preds, axis=0), np.concatenate(dx_y_trues, axis=0), # args.therhold) rx_acc_container = metric_report( np.concatenate(rx_y_preds, axis=0), np.concatenate(rx_y_trues, axis=0), args.therhold) for k, v in rx_acc_container.items(): writer.add_scalar('eval/{}'.format(k), v, global_step) if rx_acc_container[acc_name] > rx_acc_best: rx_acc_best = rx_acc_container[acc_name] # save model torch.save(model_to_save.state_dict(), rx_output_model_file) with open(os.path.join(args.output_dir, 'metrics_log.txt'), 'a') as f: f.write( "epoch{}, jaccard:{}, f1:{}, prauc:{}, auc:{}\n".format( epoch, rx_acc_container['jaccard'], rx_acc_container['f1'], rx_acc_container['prauc'], rx_acc_container['auc'])) with open(os.path.join(args.output_dir, 'bert_config.json'), 'w', encoding='utf-8') as fout: fout.write(model.config.to_json_string())
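# The GBERT main() above relies on two small utilities defined elsewhere in
# the repo, t2n() and get_n_params(). Minimal sketches of the assumed
# behavior (the originals are not shown in this file):
def t2n(x):
    """Detach a tensor and return it as a numpy array on the CPU."""
    return x.detach().cpu().numpy()

def get_n_params(model):
    """Count the parameters of a model."""
    return sum(p.numel() for p in model.parameters())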
def main(config_file='config/bert_config.json'): """Main method for training. Args: config_file: in config dir """ # 0. Load config and mkdir with open(config_file) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) get_path(os.path.join(config.model_path, config.experiment_name)) get_path(config.log_path) # if config.model_type == 'rnn': # build vocab for rnn # build_vocab(file_in=config.all_train_file_path, # file_out=os.path.join(config.model_path, 'vocab.txt')) # 1. Load data data = Data(vocab_file=os.path.join(config.model_path, 'vocab.txt'), max_seq_len=config.max_seq_len, model_type=config.model_type, config=config) datasets = data.load_train_and_valid_files( train_file=config.train_file_path, valid_file=config.valid_file_path) train_set, valid_set_train, valid_set_valid, train_label, valid_label = datasets if torch.cuda.is_available(): device = torch.device('cuda') # device = torch.device('cpu') # torch.distributed.init_process_group(backend="nccl") # sampler_train = DistributedSampler(train_set) sampler_train = RandomSampler(train_set) else: device = torch.device('cpu') sampler_train = RandomSampler(train_set) data_loader = { 'train': DataLoader(train_set, sampler=sampler_train, batch_size=config.batch_size), 'valid_train': DataLoader(valid_set_train, batch_size=config.batch_size, shuffle=False), 'valid_valid': DataLoader(valid_set_valid, batch_size=config.batch_size, shuffle=False), "train_label": train_label, "valid_label": valid_label } # 2. Build model model = MODEL_MAP[config.model_type](config) #load model states. if config.trained_weight: model.load_state_dict(torch.load(config.trained_weight)) model.to(device) if torch.cuda.is_available(): model = model # model = torch.nn.parallel.DistributedDataParallel( # model, find_unused_parameters=True) # 3. Train trainer = Trainer(model=model, data_loader=data_loader, device=device, config=config) best_model_state_dict = trainer.train() # 4. Save model torch.save(best_model_state_dict, os.path.join(config.model_path, 'model.bin'))
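# The object_hook used in the main() above turns every JSON object into a
# SimpleNamespace, so the config supports attribute access (config.batch_size)
# instead of dict indexing. A self-contained illustration (the payload below
# is made up):
import json
from types import SimpleNamespace

_example = '{"model_type": "bert", "batch_size": 16, "max_seq_len": 512}'
_config = json.loads(_example, object_hook=lambda d: SimpleNamespace(**d))
assert _config.model_type == "bert" and _config.batch_size == 16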
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--dev_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--dev_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--dev_check_file", default=None, type=str, help="The output style response/know data file name.") parser.add_argument("--dev_style_file", default=None, type=str, help="The output style response/know data file name.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") parser.add_argument("--predict_bleu", default=0.5, type=float, help="The Predicted Bleu for KS Predict ") parser.add_argument("--train_vae", action='store_true', help="Whether to train vae.") # Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run ks predict.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion_step", default=300, type=int, help= "Proportion of training to perform linear learning rate warmup for. 
") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 16-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument( "--mask_prob_eos", default=0, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument( '--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument( '--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument( '--s2s_share_segment', action='store_true', help= "Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment)." ) parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path( args.model_recover_path).exists(), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace('[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) handler = logging.FileHandler(os.path.join(args.log_dir, "train.log"), encoding='UTF-8') handler.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) console = logging.StreamHandler() console.setLevel(logging.DEBUG) logger.addHandler(handler) logger.addHandler(console) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() C_bi_uni_pipeline = [ seq2seq_loader.C_Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift) ] logger.info("Loading Dataset from {}".format(args.data_dir)) fn_src = os.path.join(args.data_dir, args.dev_src_file) fn_tgt = os.path.join(args.data_dir, args.dev_tgt_file) dev_reddit_dataset = seq2seq_loader.C_Seq2SeqDataset( fn_src, fn_tgt, args.eval_batch_size, data_tokenizer, args.max_seq_length, file_oracle=None, bi_uni_pipeline=C_bi_uni_pipeline) if args.local_rank == -1: dev_reddit_sampler = RandomSampler(dev_reddit_dataset, replacement=False) _batch_size = args.eval_batch_size else: dev_reddit_sampler = DistributedSampler(dev_reddit_dataset) _batch_size = args.eval_batch_size // dist.get_world_size() dev_reddit_dataloader = torch.utils.data.DataLoader( dev_reddit_dataset, batch_size=_batch_size, sampler=dev_reddit_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path, map_location='cpu') model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, 
max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: t_total = int(len(dev_reddit_dataloader) * args.num_train_epochs) # total steps for the BertAdam schedule optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion_step / max(1, t_total), t_total=t_total) if args.optim_recover_path is not None: logger.info("***** Recover optimizer from : {} *****".format( args.optim_recover_path)) optim_recover = torch.load(args.optim_recover_path, map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: pretrain_step = -1 logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs) + 1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: dev_reddit_sampler.set_epoch(i_epoch) logger.info("***** Running QKR evaluation *****") logger.info(" Batch size = %d", args.eval_batch_size) dev_iter_bar = tqdm(dev_reddit_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) total_lm_loss = 0 for qkr_dev_step, batch in enumerate(dev_iter_bar): batch = [ t.to(device) if t is not None else None for t in batch ] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, tgt_pos, labels, ks_labels, style_ids, style_labels, check_ids = batch oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, tgt_pos=tgt_pos, labels=labels, ks_labels=ks_labels, train_vae=args.train_vae, style_ids=style_ids, style_labels=style_labels, check_ids=check_ids, pretrain=None) masked_lm_loss, next_sentence_loss, KL_loss, Mutual_loss, Golden_loss, cosine_similarity_loss, predict_kl_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. masked_lm_loss = masked_lm_loss.mean() # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) total_lm_loss += masked_lm_loss.item() # average the accumulated LM loss over the batches evaluated so far total_mean_lm_loss = total_lm_loss / (qkr_dev_step + 1) logger.info("** ** * Evaluating mean loss ** ** * ") logger.info("In epoch {}, dev_lm_loss: {}".format( i_epoch, total_mean_lm_loss)) logger.info("ppl: {}".format(np.exp(total_mean_lm_loss))) logger.info("******************************************* ") break
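# The epoch summary above reports perplexity as exp(mean LM loss), i.e. the
# exponential of the average cross-entropy (in nats per token) over the
# evaluated batches. The relationship in isolation (the loss value below is
# hypothetical):
import numpy as np

mean_lm_loss = 2.0                        # hypothetical mean cross-entropy
perplexity = float(np.exp(mean_lm_loss))  # exp(2.0) ~ 7.39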
def train(args, model, tokenizer): """ Fine-tune the pretrained model on the corpus. """ set_seed(args) # Load the data args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_dataset = load_and_cache_examples(args, tokenizer) train_sampler = RandomSampler(train_dataset) model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=model_collate_fn, ) # Training schedule if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = t_total // ( len(train_dataloader) // args.gradient_accumulation_steps ) + 1 else: t_total = ( len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs ) # Prepare the optimizer lr = {"encoder": 0.002, "decoder": 0.2} warmup_steps = {"encoder": 20000, "decoder": 10000} optimizer = BertSumOptimizer(model, lr, warmup_steps) # Train logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info( " Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size ) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps # * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) model.zero_grad() train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True) global_step = 0 tr_loss = 0.0 for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True) for step, batch in enumerate(epoch_iterator): source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch source = source.to(args.device) target = target.to(args.device) encoder_token_type_ids = encoder_token_type_ids.to(args.device) encoder_mask = encoder_mask.to(args.device) decoder_mask = decoder_mask.to(args.device) lm_labels = lm_labels.to(args.device) model.train() outputs = model( source, target, encoder_token_type_ids=encoder_token_type_ids, encoder_attention_mask=encoder_mask, decoder_attention_mask=decoder_mask, decoder_lm_labels=lm_labels, ) loss = outputs[0] if args.gradient_accumulation_steps > 1: loss /= args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() model.zero_grad() global_step += 1 if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break return global_step, tr_loss / global_step
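# train() above binds the tokenizer and block size into its collate function
# with functools.partial so DataLoader can call it with a batch alone. The
# same pattern in isolation (collate_stub is a made-up stand-in for the real
# collate):
import functools

def collate_stub(examples, tokenizer, block_size):
    # A real collate would tokenize, truncate, and pad `examples` here.
    return examples[:block_size]

bound_collate = functools.partial(collate_stub, tokenizer=None, block_size=512)
batch = bound_collate([[1, 2], [3, 4]])  # only the examples are passed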
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args["tensorboard_dir"]) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"]) if args["max_steps"] > 0: t_total = args["max_steps"] args["num_train_epochs"] = ( args["max_steps"] // (len(train_dataloader) // args["gradient_accumulation_steps"]) + 1) else: t_total = len(train_dataloader) // args[ "gradient_accumulation_steps"] * args["num_train_epochs"] no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args["weight_decay"], }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ] }, ] warmup_steps = math.ceil(t_total * args["warmup_ratio"]) args["warmup_steps"] = warmup_steps if args[ "warmup_steps"] == 0 else args["warmup_steps"] optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"]) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total) if (args["model_name"] and os.path.isfile( os.path.join(args["model_name"], "optimizer.pt")) and os.path.isfile( os.path.join(args["model_name"], "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args["model_name"], "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args["model_name"], "scheduler.pt"))) if args["fp16"]: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"]) if args["n_gpu"] > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args["model_name"] and os.path.exists(args["model_name"]): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args["model_name"].split("/")[-1].split( "-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args["gradient_accumulation_steps"]) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args["gradient_accumulation_steps"]) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") if args["evaluate_during_training"]: training_progress_scores = self._create_training_progress_scores( **kwargs) if args["wandb_project"]: wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"]) wandb.watch(self.model) model.train() for current_epoch in train_iterator: if epochs_trained > 0: epochs_trained -= 1 continue # epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate( tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue batch = tuple(t.to(device) for t in batch) inputs = self._get_inputs_dict(batch) outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args["n_gpu"] > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % loss, end="") if args["gradient_accumulation_steps"] > 1: loss = loss / args["gradient_accumulation_steps"] if args["fp16"]: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() # torch.nn.utils.clip_grad_norm_( # amp.master_params(optimizer), args["max_grad_norm"] # ) else: loss.backward() # torch.nn.utils.clip_grad_norm_( # model.parameters(), args["max_grad_norm"] # ) tr_loss += loss.item() if (step + 1) % args["gradient_accumulation_steps"] == 0: if args["fp16"]: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args["max_grad_norm"]) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"]) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args["logging_steps"] > 0 and global_step % args[ "logging_steps"] == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step) logging_loss = tr_loss if args["wandb_project"]: wandb.log({ "Training loss": current_loss, "lr": scheduler.get_lr()[0], "global_step": global_step, }) if 
args["save_steps"] > 0 and global_step % args[ "save_steps"] == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) self._save_model(output_dir_current, optimizer, scheduler, model=model) if args["evaluate_during_training"] and ( args["evaluate_during_training_steps"] > 0 and global_step % args["evaluate_during_training_steps"] == 0): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step)) if args["save_eval_checkpoints"]: self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results) training_progress_scores["global_step"].append( global_step) training_progress_scores["train_loss"].append( current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False, ) if args["wandb_project"]: wandb.log( self._get_last_metrics( training_progress_scores)) if not best_eval_metric: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) if best_eval_metric and args[ "early_stopping_metric_minimize"]: if (results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if (results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]): best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info( " Training terminated.") train_iterator.close() return global_step, tr_loss / global_step epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args["save_model_every_epoch"] or args[ "evaluate_during_training"]: os.makedirs(output_dir_current, exist_ok=True) if args["save_model_every_epoch"]: self._save_model(output_dir_current, optimizer, scheduler, 
model=model) if args["evaluate_during_training"]: results = self.eval_model( eval_data, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs) self._save_model(output_dir_current, optimizer, scheduler, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False) if args["wandb_project"]: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) if best_eval_metric and args["early_stopping_metric_minimize"]: if results[args[ "early_stopping_metric"]] - best_eval_metric < args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"] and args[ "early_stopping_consider_epochs"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step else: if results[args[ "early_stopping_metric"]] - best_eval_metric > args[ "early_stopping_delta"]: best_eval_metric = results[ args["early_stopping_metric"]] self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results) early_stopping_counter = 0 else: if args["use_early_stopping"] and args[ "early_stopping_consider_epochs"]: if early_stopping_counter < args[ "early_stopping_patience"]: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args['early_stopping_metric']}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args['early_stopping_patience']}" ) else: if verbose: logger.info( f" Patience of {args['early_stopping_patience']} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return global_step, tr_loss / global_step return global_step, tr_loss / global_step
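# The early-stopping branches in the train() above all apply one rule: track
# the best validation metric, reset a patience counter whenever the
# delta-adjusted comparison against the running best passes, and stop once
# the counter reaches early_stopping_patience. The rule distilled, mirroring
# the comparisons above (should_stop is illustrative, not part of the
# original class):
def should_stop(metric, best, counter, patience, delta, minimize=True):
    improved = (metric - best < delta) if minimize else (metric - best > delta)
    if improved:
        return False, metric, 0  # keep training; new best; reset patience
    counter += 1
    return counter >= patience, best, counter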
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 
'distributed_{}'.format(args.local_rank)) model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) ### RANDOM INITIALIZATION #### # config = BertConfig.from_dict({ # "attention_probs_dropout_prob": 0.1, # "hidden_act": "gelu", # "hidden_dropout_prob": 0.1, # "hidden_size": 768, # "initializer_range": 0.02, # "intermediate_size": 3072, # "max_position_embeddings": 512, # "num_attention_heads": 12, # "num_hidden_layers": 12, # "type_vocab_size": 2, # "vocab_size": 30522 # }) # model = BertForSequenceClassification(config=config, num_labels=num_labels) ############################### if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function 
to compute loss values for both output_modes logits, _ = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() print(loss.item()) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): 
logits, attns = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metrics required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss/nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir + '-MM')) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits, _ = model(input_ids, segment_ids, input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss/nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step 
result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
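# NOTE: warmup_linear() is called in the fp16 branch of the training loop above but is
# never defined in this snippet. A minimal sketch of the schedule the old
# pytorch_pretrained_bert examples assumed (linear warmup to the peak LR, then linear
# decay to zero); treat this as an illustration, not the exact upstream implementation:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed, i.e. global_step / num_train_optimization_steps
    if x < warmup:
        return x / warmup
    return max((x - 1.0) / (warmup - 1.0), 0.0)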
def __init__(self, data_dir, label_dir, tokenizer, train_file='train.csv', val_file='val.csv', test_data=None, label_file='labels.csv', text_col='text', label_col='label', batch_size_per_gpu=16, max_seq_length=512, multi_gpu=True, multi_label=False, backend="nccl", model_type='bert', logger=None, clear_cache=False, no_cache=False): if isinstance(tokenizer, str): _,_,tokenizer_class = MODEL_CLASSES[model_type] # instantiate the new tokeniser object using the tokeniser name tokenizer = tokenizer_class.from_pretrained(tokenizer, do_lower_case=('uncased' in tokenizer)) self.tokenizer = tokenizer self.data_dir = data_dir self.cache_dir = data_dir/'cache' self.max_seq_length = max_seq_length self.batch_size_per_gpu = batch_size_per_gpu self.train_dl = None self.val_dl = None self.test_dl = None self.multi_label = multi_label self.n_gpu = 0 self.no_cache = no_cache self.model_type = model_type self.output_mode = 'classification' if logger is None: logger = logging.getLogger() self.logger = logger if multi_gpu: self.n_gpu = torch.cuda.device_count() if clear_cache: shutil.rmtree(self.cache_dir, ignore_errors=True) if multi_label: processor = MultiLabelTextProcessor(data_dir, label_dir) else: processor = TextProcessor(data_dir, label_dir) self.labels = processor.get_labels(label_file) if train_file: # Train DataLoader train_examples = None cached_features_file = os.path.join(self.cache_dir, 'cached_{}_{}_{}_{}'.format( self.model_type, 'train', 'multi_label' if self.multi_label else 'multi_class', str(self.max_seq_length))) if not os.path.exists(cached_features_file): train_examples = processor.get_train_examples( train_file, text_col=text_col, label_col=label_col) train_dataset = self.get_dataset_from_examples(train_examples, 'train') self.train_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu) train_sampler = RandomSampler(train_dataset) self.train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=self.train_batch_size) if val_file: # Validation DataLoader val_examples = None cached_features_file = os.path.join(self.cache_dir, 'cached_{}_{}_{}_{}'.format( self.model_type, 'dev', 'multi_label' if self.multi_label else 'multi_class', str(self.max_seq_length))) if not os.path.exists(cached_features_file): val_examples = processor.get_dev_examples( val_file, text_col=text_col, label_col=label_col) val_dataset = self.get_dataset_from_examples(val_examples, 'dev') self.val_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu) val_sampler = SequentialSampler(val_dataset) self.val_dl = DataLoader(val_dataset, sampler=val_sampler, batch_size=self.val_batch_size) if test_data: # Test set loader for predictions test_examples = [] input_data = [] for index, text in enumerate(test_data): test_examples.append(InputExample(index, text)) input_data.append({ 'id': index, 'text': text }) test_dataset = self.get_dataset_from_examples(test_examples, 'test', is_test=True) self.test_batch_size = self.batch_size_per_gpu * max(1, self.n_gpu) test_sampler = SequentialSampler(test_dataset) self.test_dl = DataLoader(test_dataset, sampler=test_sampler, batch_size=self.test_batch_size)
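# A hedged usage sketch for the constructor above. The class name `BertDataBunch` is an
# assumption (the class name is not visible in this snippet), as is the on-disk layout;
# data_dir must support the `/` operator, so a pathlib.Path is used:
from pathlib import Path
databunch = BertDataBunch(Path('./data'), Path('./data'),
                          tokenizer='bert-base-uncased',
                          train_file='train.csv', val_file='val.csv',
                          label_file='labels.csv', text_col='text', label_col='label',
                          batch_size_per_gpu=16, max_seq_length=512,
                          multi_gpu=True, multi_label=False, model_type='bert')
# databunch.train_dl, databunch.val_dl and databunch.labels would then feed a learner.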
def train(args, train_dataset, model, criterion, tokenizer): tb_writer = SummaryWriter(args.output_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // len(train_dataloader) + 1 else: t_total = int(len(train_dataloader) * args.num_train_epochs) args.num_train_epochs = int(np.ceil(args.num_train_epochs)) optimizer, scheduler = get_adamw(model, t_total, args.warmup_steps, args.learning_rate, weight_decay=args.weight_decay) # if os.path.isfile(os.path.join(args.model_name_or_path, 'optimizer.pt')) and \ # os.path.isfile(os.path.join(args.model_name_or_path, 'scheduler.pt')): # optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt'))) # scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt'))) train_desc = args.task_name if args.src_genre is not None and args.src_genre != '': train_desc += '-' + args.src_genre print(f'***** Fine-tuning {args.model_name_or_path} {train_desc} *****') print(f' Num examples = {len(train_dataset)}') print(f' Num Epochs = {args.num_train_epochs}') print(f' Train batch size = {args.train_batch_size}') print(f' Total optimization steps = {t_total}') ckpt_steps = set([int(x) for x in np.linspace(0, t_total, args.num_ckpts + 1)[1:]]) model.train() model.zero_grad() global_step = 0 step_loss = [] eval_results = [] pbar = tqdm(total=t_total, desc='train') set_seed(args) for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.to(args.device) for t in batch) loss = get_loss(args.model_type, model, criterion, batch) loss.backward() if args.max_grad_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() model.zero_grad() step_loss.append(loss.item()) global_step += 1 pbar.update(1) pbar.set_description_str(f'train: {train_desc} (loss = {step_loss[-1]:.2f}, lr = {scheduler.get_lr()[0]:.2g})') if global_step in ckpt_steps: ckpt_path = os.path.join(args.output_dir, f'step_{global_step}.bin') torch.save(model, ckpt_path) if args.do_eval: step_eval_results = evaluate(args, model, criterion, tokenizer) tr_loss = np.mean(step_loss) if len(step_eval_results) == 2: eval_loss = step_eval_results['mnli'] eval_loss_mm = step_eval_results['mnli-mm'] eval_results.append([global_step, tr_loss, eval_loss, eval_loss_mm]) print(f'\nSaving model checkpoint to {ckpt_path}, avg_loss = {tr_loss:.2f}, eval_loss = {eval_loss:.2f}, ' f'eval_loss_mm = {eval_loss_mm:.2f}\n') else: eval_loss = step_eval_results['mnli'] eval_results.append([global_step, tr_loss, eval_loss]) print(f'\nSaving model checkpoint to {ckpt_path}, avg_loss = {tr_loss:.2f}, eval_loss = {eval_loss:.2f}\n') else: print(f'\nSaving model checkpoint to {ckpt_path}\n') if global_step % args.logging_steps == 0: tb_writer.add_scalar('learning_rate', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', np.mean(step_loss), global_step) step_loss = [] if global_step == args.max_steps: pbar.close() break if args.do_eval: if len(eval_results[0]) == 4: header = ['step', 'avg_loss', 'eval_loss', 'mm_loss'] else: header = ['step', 'avg_loss', 'eval_loss'] best_results = report_results(header, eval_results, 2) best_step = best_results[0] print(f'best_ckpt = {os.path.join(args.output_dir, f"step_{best_step}.bin")}\n')
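# The checkpoint schedule above spreads args.num_ckpts saves evenly across training via
# np.linspace; a quick worked example with assumed values t_total = 1000 and num_ckpts = 4:
import numpy as np
ckpt_steps = set(int(x) for x in np.linspace(0, 1000, 4 + 1)[1:])
print(sorted(ckpt_steps))  # [250, 500, 750, 1000] -- one save every quarter of training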
def train(model, tokenizer, train_dataset, eval_dataset, batch_size, lr, adam_epsilon, epochs, output_dir): """ :param model: Bert Model to train :param tokenizer: Bert Tokenizer to train :param train_dataset: :param batch_size: Stick to 1 if not using a high-end GPU :param lr: Suggested learning rate from the paper is 5e-5 :param adam_epsilon: Epsilon for AdamW (the weight-decay fix); the suggested value is 1e-8 :param epochs: Usually a single pass through the entire dataset is satisfactory :return: The fine-tuned model and tokenizer """ train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=batch_size) train_positions_to_mask = train_dataset.positions_to_mask t_total = len(train_dataloader) * int(epochs) # Total steps; len(train_dataloader) is already the number of batches per epoch # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in model.named_parameters() if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, 0, t_total) # TODO: handle the fp16 case # Start of training loop logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", batch_size) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.resize_token_embeddings(len(tokenizer)) train_iterator = trange(int(epochs), desc="Epoch") epoch_info = [] proceed = False tmp_global_step = 0 for _ in train_iterator: epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration") model.train() with torch.set_grad_enabled(True): for i, batch in enumerate(epoch_iterator): if tmp_global_step >= global_step or proceed: proceed = True else: tmp_global_step += 1 if proceed: inputs, labels = custom_mask_tokens(batch, tokenizer, train_positions_to_mask[i]) inputs = inputs.to('cuda') # Don't bother if you don't have a gpu labels = labels.to('cuda') outputs = model(inputs, masked_lm_labels=labels) # model outputs are always tuple in transformers (see doc) loss = outputs[0] loss.backward() tr_loss += loss.item() # if (step + 1) % 1 == 0: # 1 here is a placeholder for gradient accumulation steps torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) optimizer.step() scheduler.step() model.zero_grad() global_step += 1 if proceed: with torch.set_grad_enabled(False): epoch_info = eval_and_save_model(output_dir, eval_dataset, global_step, epoch_info, model, optimizer, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) tr_loss = 0 return model, tokenizer
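# custom_mask_tokens() is not defined in this snippet. A minimal sketch under the
# assumption that it masks a pre-chosen set of token positions and marks every other
# position as ignored for the masked-LM loss (ignore index -100 in transformers; older
# pytorch-transformers releases used -1):
import torch
def custom_mask_tokens(batch, tokenizer, positions_to_mask):
    inputs = batch.clone()
    labels = batch.clone()
    chosen = torch.zeros_like(inputs, dtype=torch.bool)
    chosen[:, positions_to_mask] = True  # mask the same positions across the batch
    labels[~chosen] = -100  # only masked positions contribute to the loss
    inputs[chosen] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    return inputs, labels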
optimizer = optim.Adadelta(model.params_requires_grad(), weight_decay=config.weight_decay, lr=config.learning_rate, eps=config.adam_eps) print(model) global_steps = 0 f1_best = 0. f1_test = 0. logging_loss, tr_loss = 0., 0. epoch_improve = 0 restart_used = 0 model_name = 'model_gcn_2018.ckpt' log_name = 'log_gcn2018.txt' tensorboard_name = 'model_1.ckpt' train_sampler = RandomSampler(train_dataset) train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.batch_size) total_steps = len(train_loader) * config.num_epoch scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=config.warmup_steps, num_training_steps=total_steps) tb_writer = SummaryWriter(os.path.join(config.output_dir, tensorboard_name)) identity_matrix = torch.eye(config.max_sent).unsqueeze(0) print('-> Start training process') print('nepoch: ', config.num_epoch) print('total steps: ', total_steps)
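# For reference, get_linear_schedule_with_warmup() ramps the learning rate linearly from 0
# over num_warmup_steps and then decays it linearly to 0 at num_training_steps; the
# per-step multiplier it applies to the base LR is equivalent to:
def linear_warmup_multiplier(step, num_warmup_steps, num_training_steps):
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))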
def run_aug(args, save_every_epoch=False): processors = { # you can add your processor here "TREC": AugProcessor, "stsa.fine": AugProcessor, "stsa.binary": AugProcessor, "mpqa": AugProcessor, "rt-polarity": AugProcessor, "subj": AugProcessor, "toxic": AugProcessor } task_name = args.task_name if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) args.data_dir = os.path.join(args.data_dir, task_name) args.output_dir = os.path.join(args.output_dir, task_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) processor = processors[task_name]() label_list = processor.get_labels(task_name) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) #dev_examples = processor.get_dev_examples(args.data_dir) #train_examples.extend(dev_examples) num_train_steps = int(len(train_examples) / args.train_batch_size * args.num_train_epochs) # Prepare model def load_model(model_name): weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name) model = torch.load(weights_path) return model cbert_name = "{}/BertForMaskedLM_{}_epoch_10".format(task_name.lower(), task_name.lower()) model = load_model(cbert_name) model.cuda() # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_masked_lm_labels = torch.tensor([f.masked_lm_labels for f in train_features], dtype=torch.long) train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] origin_train_path = os.path.join(args.output_dir, "train_origin.tsv") save_train_path = os.path.join(args.output_dir, "train.tsv") shutil.copy(origin_train_path, save_train_path) best_test_acc = train_text_classifier.train("aug_data") print("before augment best acc:{}".format(best_test_acc)) for e in trange(int(args.num_train_epochs), desc="Epoch"): avg_loss = 0. 
for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.cuda() for t in batch) _, input_ids, input_mask, segment_ids, masked_ids = batch loss = model(input_ids, segment_ids, input_mask, masked_ids) loss.backward() avg_loss += loss.item() optimizer.step() model.zero_grad() if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0 torch.cuda.empty_cache() shutil.copy(origin_train_path, save_train_path) save_train_file = open(save_train_path, 'a') tsv_writer = csv.writer(save_train_file, delimiter='\t') #tsv_writer.writerow(['sentence', 'label']) for step, batch in enumerate(train_dataloader): model.eval() batch = tuple(t.cuda() for t in batch) init_ids, _, input_mask, segment_ids, _ = batch input_lens = [sum(mask).item() for mask in input_mask] #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens]) masked_idx = np.squeeze([np.random.randint(0, l, max(l//7,2)) for l in input_lens]) for ids, idx in zip(init_ids,masked_idx): ids[idx] = MASK_id predictions = model(init_ids, segment_ids, input_mask) for ids, idx, preds, seg in zip(init_ids, masked_idx, predictions, segment_ids): #pred = torch.argsort(pred)[:,-e-1][idx] ''' pred = torch.argsort(preds)[:,-1][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item()]) ''' pred = torch.argsort(preds)[:, -2][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item()]) torch.cuda.empty_cache() predictions = predictions.detach().cpu() torch.cuda.empty_cache() bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e)) shutil.copy(save_train_path, bak_train_path) best_test_acc = train_text_classifier.train("aug_data") print("epoch {} augment best acc:{}".format(e, best_test_acc)) if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str(e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
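# rev_wordpiece() is used above to turn predicted token ids back into a plain sentence
# but is not defined here; a plausible minimal version that undoes WordPiece
# segmentation and drops BERT's special tokens (an assumption about its behaviour,
# not the original code):
def rev_wordpiece(tokens):
    words = []
    for tok in tokens:
        if tok in ('[CLS]', '[SEP]', '[PAD]'):
            continue
        if tok.startswith('##') and words:
            words[-1] += tok[2:]  # glue the subword onto the previous word
        else:
            words.append(tok)
    return ' '.join(words)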
def train(model, optimizer, scheduler, dataset_train, dataset_valid, dataset_test, config, evaluator): # valid_feed can be None patience = 10 # wait for at least 10 epoch before stop valid_loss_threshold = np.inf best_valid_loss = np.inf best_eval_valid = 0.0 final_eval_best = 0.0 sampler_train = RandomSampler(dataset_train) if config.local_rank == -1 else DistributedSampler(dataset_train) # sampler_train = SequentialSampler(dataset_train) if config.local_rank == -1 else DistributedSampler(dataset_train) dataloader_train = DataLoader(dataset_train, sampler=sampler_train, batch_size=config.batch_size) batch_cnt = 0 ckpt_step = len(dataloader_train.dataset) // dataloader_train.batch_size logger.info("**** Training Begins ****") logger.info("**** Epoch 0/{} ****".format(config.max_epoch)) loss_func = None # if config.use_parallel and not config.use_apex: # if config.n_gpu > 1 and not config.use_apex: if config.n_gpu > 1 and config.local_rank == -1: loss_func = get_loss_func(config=config, pad_id=model.module.pad_id) else: loss_func = get_loss_func(config=config, pad_id=model.pad_id) if config.use_gpu: loss_func.cuda() # epoch loop model.train() for cur_epoch in range(config.max_epoch): # loop until traverse all batches train_loss = [] for text_inputs, label_y, *remains in dataloader_train: mask_input = remains[0] len_seq = remains[1] len_sents = remains[2] tid = remains[3] len_para = remains[4] text_inputs = utils.cast_type(text_inputs, LONG, config.use_gpu) mask_input = utils.cast_type(mask_input, FLOAT, config.use_gpu) len_seq = utils.cast_type(len_seq, FLOAT, config.use_gpu) # training for this batch optimizer.zero_grad() coh_score = model(text_inputs=text_inputs, mask_input=mask_input, len_seq=len_seq, len_sents=len_sents, tid=tid, len_para=len_para, mode="") # model.forward; now it returns the loss if config.output_size == 1: coh_score = coh_score.view(text_inputs.shape[0]) else: coh_score = coh_score.view(text_inputs.shape[0], -1) # # get loss if config.output_size == 1: label_y = utils.cast_type(label_y, FLOAT, config.use_gpu) else: label_y = utils.cast_type(label_y, LONG, config.use_gpu) label_y = label_y.view(text_inputs.shape[0]) # print(coh_score) # print(label_y) loss = loss_func(coh_score, label_y) if config.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training loss.backward() # with amp_handle.scale_loss(loss, optimizer) as scaled_loss: # scaled_loss.backward() # print(coh_score) # print(label_y) # print() torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm) #for p in model.parameters(): # gradient control manually # if p.grad is not None: # p.data.add_(-config.init_lr, p.grad.data) # clip_grad_norm_(amp.master_params(optimizer), config.clip_n2) # update optimizer and scheduler optimizer.step() if scheduler is not None: # scheduler.step() scheduler.step(loss) # for param_group in optimizer.param_groups: # print(param_group['lr']) train_loss.append(loss.item()) # temporal averaging for encoders (proposed in ICLR18) if config.encoder_type == "reg_lstm" and config.beta_ema > 0: if config.n_gpu > 1: model.module.encoder_coh.update_ema() else: model.encoder_coh.update_ema() batch_cnt = batch_cnt + 1 # print train process if batch_cnt % config.print_step == 0: logger.info("{}/{}-({:.3f})".format(batch_cnt % config.ckpt_step, config.ckpt_step, loss)) ## validation if batch_cnt % ckpt_step == 0: # manual epoch printing # if i == batch_num-1: # every epoch logger.info("\n=== Evaluating Model ===") # validation 
eval_cur_valid = -1 if dataset_valid is not None: loss_valid, eval_cur_valid, _ = validate(model, evaluator, dataset_valid, config, loss_func) logger.info("") if eval_cur_valid >= best_eval_valid or dataset_valid is None: # if dataset_valid is not None: logger.info("Best {} on Valid {}".format(evaluator.eval_type, eval_cur_valid)) best_eval_valid = eval_cur_valid valid_loss, eval_last, eval_best = validate(model, evaluator, dataset_test, config, loss_func, is_test=True) if eval_best > final_eval_best: final_eval_best = eval_best # save model if config.save_model: logger.info("Model Saved.") torch.save(model.state_dict(), os.path.join(config.session_dir, "model")) # save prediction log for error analysis if config.gen_logs: pred_log_name = "log_pred_" + str(config.essay_prompt_id_train) + "_" + str(config.essay_prompt_id_test) + "_" + str(config.cur_fold) + ".log" if config.eval_type.lower() == "qwk": pred_out = np.stack((evaluator.rescaled_pred, evaluator.origin_label_np, evaluator.tid_np)) np.savetxt(os.path.join(config.session_dir, pred_log_name), pred_out, fmt='%.0f') elif config.eval_type.lower() == "accuracy": pred_out = np.stack((evaluator.pred_list_np, evaluator.origin_label_np, evaluator.tid_np)) pred_out = pred_out.T np.savetxt(os.path.join(config.session_dir, pred_log_name), pred_out, fmt='%.0f') # # error analysis: std data for lexical cohesion # if config.gen_logs and config.target_model == "ilcr_scd": # std_log_name = "log_std_" + str(config.essay_prompt_id_train) + "_" + str(config.essay_prompt_id_test) + "_" + str(config.cur_fold) + ".log" # # file read # std_data = evaluator.map_suppl["std"] # with open(os.path.join(config.session_dir, std_log_name), "w") as f: # f.write(repr(std_data)) evaluator.map_suppl = {} # reset # early stopping parts (disabled) # if valid_loss < best_valid_loss: # if valid_loss <= valid_loss_threshold * config.improve_threshold: # patience = max(patience, cur_epoch * config.patient_increase) # valid_loss_threshold = valid_loss # logger.info("Update patience to {}".format(patience)) # best_valid_loss = valid_loss # if cur_epoch >= config.max_epoch or (config.early_stop and patience <= cur_epoch): # if cur_epoch < config.max_epoch: # logger.info("!!Early stop due to run out of patience!!") # logger.info("Best validation loss %f" % best_valid_loss) # return # exit eval mode model.train() train_loss = [] logger.info("\n**** Epoch {}/{} ****".format(cur_epoch, config.max_epoch)) # end validation if config.use_gpu and config.empty_cache: torch.cuda.empty_cache() # due to memory shortage # end batch loop # end epoch loop logger.info("Best {} on Test {}".format(evaluator.eval_type, final_eval_best)) logger.info("") return final_eval_best
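# The update_ema() call above applies temporal (exponential moving) averaging to the
# encoder weights; the actual method is not shown, but a typical update with decay
# factor beta_ema looks like this sketch:
def update_ema(current_params, ema_params, beta_ema):
    for p, ema in zip(current_params, ema_params):
        # ema <- beta_ema * ema + (1 - beta_ema) * p
        ema.mul_(beta_ema).add_(p.data, alpha=1.0 - beta_ema)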
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info(" Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproducibility (even between python 2 and 3) for _ in train_iterator: loss_this_epoch = 0 epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): inputs, labels, attention_mask = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch, torch.ones_like(batch)) # the non-mlm branch must also unpack to three values inputs = inputs.to(args.device) labels = labels.to(args.device) attention_mask = attention_mask.to(args.device) model.train() outputs = model(inputs, attention_mask=attention_mask, masked_lm_labels=labels) # if args.mlm else model(inputs, labels=labels) loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) loss_this_epoch = loss.detach().mean().item() + loss_this_epoch if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar('eval_{}'.format(key), value, global_step) tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, 'training_args.bin')) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break print('\ntrain loss epoch ... {} (not exact loss)'.format(loss_this_epoch / (step + 1))) if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
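# mask_tokens() follows the standard BERT masking recipe (choose ~15% of tokens; of
# those, 80% become [MASK], 10% a random token, 10% stay as-is). A condensed sketch
# with the attention-mask return value this script expects added on; args.mlm_probability
# and the -100 ignore index are assumptions (older pytorch-transformers code used -1):
import torch
def mask_tokens(inputs, tokenizer, args):
    labels = inputs.clone()
    masked = torch.bernoulli(torch.full(labels.shape, args.mlm_probability)).bool()
    labels[~masked] = -100  # only masked positions contribute to the loss
    replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    inputs[replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    randomized = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[randomized] = random_words[randomized]
    attention_mask = (inputs != tokenizer.pad_token_id).long()
    return inputs, labels, attention_mask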
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. " "This specifies the model architecture.") parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") ## Required parameters parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--train_ans_file", default=None, type=str, help="SQuAD answer for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=32, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Do not use CUDA even when it is available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 is set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--restore', default=False, action='store_true') args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # torch.backends.cudnn.benchmark = True n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if not os.path.exists(args.output_dir): # raise ValueError("Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) import pickle as cPickle train_examples = None num_train_steps = None if args.do_train: raw_test_data = open(args.predict_file, mode='r') raw_train_data = open(args.train_file, mode='r') if os.path.exists("train_file_baseline.pkl") and False: train_examples = cPickle.load( open("train_file_baseline.pkl", mode='rb')) else: ans_dict = {} with open(args.train_ans_file) as f: for line in f: line = line.split(',') ans_dict[line[0]] = int(line[1]) train_examples = read_chid_examples(raw_train_data, is_training=True, ans_dict=ans_dict) cPickle.dump(train_examples, open("newtrain_file_baseline.pkl", mode='wb')) #tt = len(train_examples) // 2 #train_examples = train_examples[:tt] logger.info("train examples {}".format(len(train_examples))) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model bert_config = BertConfig.from_json_file(args.bert_config_file) tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) model = BertForCloze(bert_config, num_choices=10) if args.init_checkpoint is not None: logger.info('load bert weight') state_dict = torch.load(args.init_checkpoint, map_location='cpu') missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, '_metadata', None) state_dict = state_dict.copy() # new_state_dict=state_dict.copy() # for key, value in state_dict.items(): # new_state_dict[key.replace("bert","c_bert")]=value # state_dict=new_state_dict if metadata is not None: state_dict._metadata = metadata def load(module, prefix=''): local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, 
unexpected_keys, error_msgs) for name, child in module._modules.items(): # logger.info("name {} child {}".format(name, child)) if child is not None: load(child, prefix + name + '.') load(model, prefix='' if hasattr(model, 'bert') else 'bert.') logger.info("missing keys:{}".format(missing_keys)) logger.info('unexpected keys:{}'.format(unexpected_keys)) logger.info('error msgs:{}'.format(error_msgs)) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex import amp from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate) # optimizer = BertAdam(optimizer_grouped_parameters, # lr=args.learning_rate, # warmup=args.warmup_proportion, # t_total=t_total) # optimizer = RAdam(optimizer_grouped_parameters, # lr=args.learning_rate) model, optimizer = amp.initialize(model, optimizer, opt_level="O1") else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.restore: checkpoint = torch.load('amp_checkpoint.pt') model, optimizer = amp.initialize(model, optimizer, opt_level='O1') model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) amp.load_state_dict(checkpoint['amp']) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_v{1}'.format( str(args.max_seq_length), str(4)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except Exception: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_labels = torch.tensor([f.label for f in train_features], dtype=torch.long) all_option_ids = torch.tensor([f.option_ids for f in train_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels, all_option_ids, all_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True, pin_memory=True) loss_ini = 50 for _ in trange(int(args.num_train_epochs), desc="Epoch"): vizname = 'epoch' + str(_) viz = Visdom(env=str(vizname)) vis = Visdom(env='loss') via = Visdom(env='ac') model.train() model.zero_grad() epoch_iterator = tqdm(train_dataloader, disable=None) for step, batch in enumerate(epoch_iterator): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering itself input_ids, input_mask, segment_ids, labels, option_ids, positions = batch loss = model(input_ids, option_ids, segment_ids, input_mask, positions, labels) # print('att', loss.size()) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # model, optimizer = amp.initialize(model, optimizer, opt_level= "O1") with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step # if args.fp16: # torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.) 
optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % 1000 == 0: logger.info("loss@{}:{}".format(step, loss.cpu().item())) steptotal = step + _ * int( len(train_examples) / args.train_batch_size) if (steptotal + 1) % 50 == 0: vis.line([loss.cpu().item()], [steptotal], win='train_loss', update='append') if (step + 1) % 50 == 0: viz.line([loss.cpu().item()], [step], win='train_loss', update='append') loss_total = str(loss.cpu().item()) print(loss_total) loss_ini = loss_total logger.info("loss:%f", loss.cpu().item()) logger.info("loss+:{}".format(loss.cpu().item())) raw_test_data_pre = open(args.predict_file, mode='r') eval_examples = read_chid_examples(raw_test_data_pre, is_training=False) # eval_examples=eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_option_ids = torch.tensor( [f.option_ids for f in eval_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in eval_features], dtype=torch.long) all_tags = torch.tensor([f.tag for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_option_ids, all_positions, all_tags) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None) total_dev_loss = 0 all_results = {} logger.info("Start evaluating") for input_ids, input_mask, segment_ids, option_ids, positions, tags in \ tqdm(eval_dataloader, desc="Evaluating",disable=None): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) option_ids = option_ids.to(device) positions = positions.to(device) with torch.no_grad(): batch_logits, align = model(input_ids, option_ids, segment_ids, input_mask, positions) for i, tag in enumerate(tags): logits = batch_logits[i].detach().cpu().numpy() logit = [logits] logit = torch.tensor(logit) inum = int(tag) - 577157 dlabel = [reader1[1][inum]] dlabel = torch.tensor(dlabel) # loss_dev =FocalLoss(gamma=0.25) loss_dev = CrossEntropyLoss() dev_loss = loss_dev(logit, dlabel) total_dev_loss += dev_loss # for index1, dlabel in zip(reader1[0], reader1[1]): # if index1[6:11] == str(tag): # loss_dev =CrossEntropyLoss() # dev_loss = loss_dev(logits, dlabel) # total_dev_loss += dev_loss # continue ans = np.argmax(logits) all_results["#idiom%06d#" % tag] = ans predict_name = "ln11saprediction" + str(_) + ".csv" output_prediction_file = os.path.join(args.output_dir, predict_name) with open(output_prediction_file, "w") as f: for each in all_results: f.write(each + ',' + str(all_results[each]) + "\n") raw_test_data.close() pre_ac = 0 outputpre = 'output_model/' + predict_name reader2 = pd.read_csv(outputpre, usecols=[0, 1], header=None) for index2, ans2 in zip(reader2[0], 
reader2[1]): num = index2[6:12] num = int(num) - 577157 ans1 = reader1[1][num] if ans1 == ans2: pre_ac += 1 print(pre_ac) per = (pre_ac) / 23011 pernum = per * 100 logger.info("accuracy:%f", pernum) devlossmean = total_dev_loss / (23011 / 128) logger.info("devloss:%f", devlossmean) via.line([pernum], [_], win='accuracy', update='append') via.line([devlossmean], [_], win='loss', update='append') checkpoint = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), 'amp': amp.state_dict() } torch.save(checkpoint, 'checkpoint/amp_checkpoint.pt') outmodel = 'ln11samodel' + str(pernum) + '.bin' output_model_file = os.path.join(args.output_dir, outmodel) if args.do_train: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) raw_test_data.close() raw_train_data.close() # Save a trained model # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): list1 = os.listdir('./output_model/') list1 = sorted( list1, key=lambda x: os.path.getmtime(os.path.join('./output_model/', x))) output_model_file = os.path.join(args.output_dir, list1[-1]) # output_model_file = os.path.join(args.output_dir, 'n11samodel77.33258007040111.bin') model_state_dict = torch.load(output_model_file) model = BertForCloze(bert_config, num_choices=10) model.load_state_dict(model_state_dict) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # raw_test_data_pre = open('./data/dev.txt', mode='r') raw_test_data_pre = open('./data/out.txt', mode='r') # raw_test_data_pre = open('new_test_data.txt', mode='r') eval_examples = read_chid_examples(raw_test_data_pre, is_training=False) # eval_examples=eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_option_ids = torch.tensor([f.option_ids for f in eval_features], dtype=torch.long) all_positions = torch.tensor([f.position for f in eval_features], dtype=torch.long) all_tags = torch.tensor([f.tag for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_option_ids, all_positions, all_tags) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = {} all_results1 = {} all_results2 = {} # reader1 = pd.read_csv('test_ans.csv', usecols=[1], header=None) reader1 = pd.read_csv('./data/out_answer.csv', usecols=[1], header=None) #dev_answer1.csv # reader1 = pd.read_csv('dev_answer1.csv', usecols=[1], header=None) total_dev_loss = 0 logger.info("Start evaluating") for 
input_ids, input_mask, segment_ids, option_ids, positions, tags in \ tqdm(eval_dataloader, desc="Evaluating",disable=None): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) option_ids = option_ids.to(device) positions = positions.to(device) with torch.no_grad(): batch_logits, align = model(input_ids, option_ids, segment_ids, input_mask, positions) for i, tag in enumerate(tags): logits = batch_logits[i].detach().cpu().numpy() ans = np.argmax(logits) all_results["#idiom%06d#" % tag] = ans # matric = align[i].detach().cpu().numpy() # all_results1["#idiom%06d#" % tag] = matric[ans] # gr_logic = logits[:] # gr_logic = sorted(gr_logic, reverse=True) # all_results2["#idiom%06d#" % tag] = gr_logic output_prediction_file = os.path.join(args.output_dir, "testprediction.csv") # output_m_file = os.path.join(args.output_dir, "ealign.csv") # output_ma_file = os.path.join(args.output_dir, "sdmv.csv") with open(output_prediction_file, "w") as f: for each in all_results: f.write(each + ',' + str(all_results[each]) + "\n") # with open(output_m_file, "w") as f: # for each in all_results1: # f.write(each + ',' + str(all_results1[each]) + "\n") # with open(output_ma_file, "w") as f: # for each in all_results1: # f.write(each + ',' + str(all_results2[each]) + "\n") raw_test_data_pre.close() reader2 = pd.read_csv(output_prediction_file, usecols=[0, 1], header=None) pre_ac = 0 for index2, ans2 in zip(reader2[0], reader2[1]): num = index2[6:-1] # num = int(num)-1 # num = re.findall(r"\d+\.?\d*",index2) num = int(num) - 623377 # num = int(num) - 577157 ans1 = reader1[1][num] if ans1 == ans2: pre_ac += 1 print(pre_ac) # per = (pre_ac)/23011 # per = (pre_ac)/24948 per = (pre_ac) / 27704 pernum = per * 100 logger.info("accuracy:%f", pernum)
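# NOTE: a minimal, self-contained sketch of the scoring step performed above.
# Predictions are keyed by "#idiom%06d#" tags and the answer CSV is indexed by
# (tag - offset), where the offset is the tag of the file's first example
# (577157 for the dev file, 623377 for the test file above). The function and
# argument names here are illustrative, not taken from this script.
import pandas as pd

def score_idiom_predictions(pred_csv, answer_csv, tag_offset, num_examples):
    preds = pd.read_csv(pred_csv, usecols=[0, 1], header=None)
    answers = pd.read_csv(answer_csv, usecols=[1], header=None)
    correct = 0
    for tag_str, pred in zip(preds[0], preds[1]):
        row = int(tag_str[6:-1]) - tag_offset  # "#idiom623377#" -> 623377
        if answers[1][row] == pred:
            correct += 1
    return 100.0 * correct / num_examples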
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]: """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter(comment=args.summary_comment) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) if args.wiki_dataset: collate_fn = functools.partial(collate_wiki, tokenizer) else: collate_fn = functools.partial(collate, tokenizer) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate_fn, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // ( len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if (args.model_name_or_path and os.path.isfile( os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt"))): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict( torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True, ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if args.model_name_or_path and os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split( "/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch, ) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model_to_resize = model.module if hasattr( model, "module") else model # Take care of distributed/parallel training model_to_resize.resize_token_embeddings(len(tokenizer)) model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0], ) set_seed(args) # Added here for reproducibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue if args.wiki_dataset: if args.mlm: raise RuntimeError( "Can't do mlm for wiki / dictionary dataset") tokens, loss_mask = batch inputs, labels = (tokens, tokens) loss_mask = loss_mask.to(args.device) loss_weights = (~loss_mask) + loss_mask * args.title_scale inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model(inputs, labels=labels, loss_weights=loss_weights) else: inputs, labels = mask_tokens( batch, tokenizer, args) if args.mlm else (batch, batch) inputs = inputs.to(args.device) labels = labels.to(args.device) model.train() outputs = model( inputs, masked_lm_labels=labels) if args.mlm else model( inputs, labels=labels) loss = outputs[ 0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.local_rank in [ -1, 0 ] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if ( args.local_rank == -1 and args.evaluate_during_training ): # Only 
evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args.logging_steps, global_step, ) logging_loss = tr_loss if args.local_rank in [ -1, 0 ] and args.save_steps > 0 and global_step % args.save_steps == 0: checkpoint_prefix = "checkpoint" # Save model checkpoint output_dir = os.path.join( args.output_dir, "{}-{}".format(checkpoint_prefix, global_step)) os.makedirs(output_dir, exist_ok=True) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) _rotate_checkpoints(args, checkpoint_prefix) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step
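# For clarity, the checkpoint-resume arithmetic used by train() above, factored
# into a tiny helper with illustrative names (a sketch, not part of the script):
def resume_position(model_name_or_path, num_batches, gradient_accumulation_steps):
    # "output/checkpoint-500" -> global_step 500
    global_step = int(model_name_or_path.split("-")[-1].split("/")[0])
    updates_per_epoch = num_batches // gradient_accumulation_steps
    epochs_trained = global_step // updates_per_epoch
    steps_to_skip_in_current_epoch = global_step % updates_per_epoch
    return global_step, epochs_trained, steps_to_skip_in_current_epoch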
def main(args): def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) # logger logger_file_name = args.save_dir.split('/')[1] fileHandler = logging.FileHandler( os.path.join(args.save_dir, "%s.txt" % (logger_file_name))) logger.addHandler(fileHandler) logger.info(args) # cuda setup device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') logger.info("device: {}".format(device)) # set random seed np.random.seed(args.random_seed) random.seed(args.random_seed) torch.manual_seed(args.random_seed) if device == "cuda": torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True #****************************************************** # load data #****************************************************** processor = Processor(args) slot_meta = processor.slot_meta label_list = processor.label_list num_labels = [len(labels) for labels in label_list] logger.info(slot_meta) tokenizer = BertTokenizer.from_pretrained(args.pretrained_model) train_data_raw = processor.get_train_instances(args.data_dir, tokenizer) print("# train examples %d" % len(train_data_raw)) dev_data_raw = processor.get_dev_instances(args.data_dir, tokenizer) print("# dev examples %d" % len(dev_data_raw)) test_data_raw = processor.get_test_instances(args.data_dir, tokenizer) print("# test examples %d" % len(test_data_raw)) logger.info("Data loaded!") train_data = MultiWozDataset(train_data_raw, tokenizer, word_dropout=args.word_dropout) num_train_steps = int( len(train_data_raw) / args.train_batch_size * args.n_epochs) logger.info("***** Run training *****") logger.info(" Num examples = %d", len(train_data_raw)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=train_data.collate_fn, num_workers=args.num_workers, worker_init_fn=worker_init_fn) #****************************************************** # build model #****************************************************** ## Initialize slot and value embeddings sv_encoder = UtteranceEncoding.from_pretrained(args.pretrained_model) for p in sv_encoder.bert.parameters(): p.requires_grad = False new_label_list, slot_value_pos = combine_slot_values( slot_meta, label_list) # without slot head logger.info(slot_value_pos) slot_lookup = get_label_lookup_from_first_token(slot_meta, tokenizer, sv_encoder, device) value_lookup = get_label_lookup_from_first_token(new_label_list, tokenizer, sv_encoder, device) model = BeliefTracker(args, slot_lookup, value_lookup, num_labels, slot_value_pos, device) model.to(device) ## prepare optimizer no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] enc_param_optimizer = list(model.encoder.named_parameters()) enc_optimizer_grouped_parameters = [{ 'params': [ p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) enc_scheduler = get_linear_schedule_with_warmup( enc_optimizer, int(num_train_steps * args.enc_warmup), num_train_steps) dec_param_optimizer = list(model.decoder.parameters()) dec_optimizer = AdamW(dec_param_optimizer, 
lr=args.dec_lr) dec_scheduler = get_linear_schedule_with_warmup( dec_optimizer, int(num_train_steps * args.dec_warmup), num_train_steps) logger.info(enc_optimizer) logger.info(dec_optimizer) #****************************************************** # training #****************************************************** logger.info("Training...") best_loss = None best_acc = None last_update = None for epoch in trange(int(args.n_epochs), desc="Epoch"): batch_loss = [] batch_acc = [] for step, batch in enumerate(tqdm(train_dataloader)): model.train() batch = [b.to(device) if b is not None else b for b in batch] input_ids, segment_ids, input_mask, label_ids = batch # forward loss, _, acc, _, _ = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids) loss.backward() enc_optimizer.step() enc_scheduler.step() dec_optimizer.step() dec_scheduler.step() model.zero_grad() batch_loss.append(loss.item()) batch_acc.append(acc) if step % 300 == 0: print("[%d/%d] [%d/%d] mean_loss: %.6f, mean_joint_acc: %.6f" % \ (epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), np.mean(batch_acc))) batch_loss = [] batch_acc = [] if epoch > args.n_epochs / 2 and step > 0 and step % args.eval_step == 0: eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, label_list, epoch * 10 + step / args.eval_step) if last_update is None or best_loss > eval_res['loss']: best_loss = eval_res['loss'] save_path = os.path.join(args.save_dir, 'model_best_loss.bin') torch.save(model.state_dict(), save_path) print("Best Loss : ", best_loss) print("\n") if last_update is None or best_acc < eval_res['joint_acc']: best_acc = eval_res['joint_acc'] save_path = os.path.join(args.save_dir, 'model_best_acc.bin') torch.save(model.state_dict(), save_path) print("Best Acc : ", best_acc) print("\n") logger.info("*** Step=%d, Dev Loss=%.6f, Dev Acc=%.6f, Dev Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***" % \ (step, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc)) if epoch > args.n_epochs / 2 and step > 0 and step % args.eval_step == 0: eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, \ label_list, epoch*10+step/args.eval_step) logger.info("*** Step=%d, Tes Loss=%.6f, Tes Acc=%.6f, Tes Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***" % \ (step, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc)) if (epoch + 1) % args.eval_epoch == 0: eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, label_list, epoch + 1) if last_update is None or best_loss > eval_res['loss']: best_loss = eval_res['loss'] save_path = os.path.join(args.save_dir, 'model_best_loss.bin') torch.save(model.state_dict(), save_path) print("Best Loss : ", best_loss) print("\n") if last_update is None or best_acc < eval_res['joint_acc']: best_acc = eval_res['joint_acc'] save_path = os.path.join(args.save_dir, 'model_best_acc.bin') torch.save(model.state_dict(), save_path) last_update = epoch print("Best Acc : ", best_acc) print("\n") logger.info( "*** Epoch=%d, Last Update=%d, Dev Loss=%.6f, Dev Acc=%.6f, Dev Turn Acc=%.6f, Best Loss=%.6f, Best Acc=%.6f ***" % (epoch, last_update, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc)) if (epoch + 1) % args.eval_epoch == 0: eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, label_list, epoch + 1) logger.info( "*** Epoch=%d, Last Update=%d, Tes Loss=%.6f, Tes Acc=%.6f, Tes Turn Acc=%.6f, 
Best Loss=%.6f, Best Acc=%.6f ***" % (epoch, last_update, eval_res['loss'], eval_res['joint_acc'], eval_res['joint_turn_acc'], best_loss, best_acc)) if last_update + args.patience <= epoch: break print("Test using best loss model...") best_epoch = 0 ckpt_path = os.path.join(args.save_dir, 'model_best_loss.bin') model = BeliefTracker(args, slot_lookup, value_lookup, num_labels, slot_value_pos, device) ckpt = torch.load(ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.to(device) test_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, label_list, best_epoch, is_gt_p_state=False) logger.info("Results based on best loss: ") logger.info(test_res) #---------------------------------------------------------------------- print("Test using best acc model...") ckpt_path = os.path.join(args.save_dir, 'model_best_acc.bin') model = BeliefTracker(args, slot_lookup, value_lookup, num_labels, slot_value_pos, device) ckpt = torch.load(ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.to(device) test_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, label_list, best_epoch + 1, is_gt_p_state=False) logger.info("Results based on best acc: ") logger.info(test_res)
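# The loop above stops early via `last_update + args.patience <= epoch`; written
# out as a sketch, the rule is simply "no dev improvement for `patience` epochs":
def should_stop(epoch, last_update, patience):
    # last_update is the last epoch at which dev joint accuracy improved
    return last_update is not None and last_update + patience <= epoch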
def train(args, train_dataset, model, tokenizer, ori_dict): record_result = [] """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Check if saved optimizer or scheduler states exist if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile( os.path.join(args.model_name_or_path, "scheduler.pt") ): # Load in optimizer and scheduler states optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"))) scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"))) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) pruning_step = 0 global_step = 1 epochs_trained = 0 steps_trained_in_current_epoch = 0 # Check if continuing training from a checkpoint if os.path.exists(args.model_name_or_path): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] global_step = int(checkpoint_suffix) epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps) steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps) logger.info(" Continuing training from checkpoint, will skip to saved global_step") logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch) except ValueError: logger.info(" Starting fine-tuning.") tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] ) # Added here for reproductibility set_seed(args) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], "start_positions": batch[3], "end_positions": batch[4], } if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]: del inputs["token_type_ids"] if args.model_type in ["xlnet", "xlm"]: inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) if args.version_2_with_negative: inputs.update({"is_impossible": batch[7]}) if hasattr(model, "config") and hasattr(model.config, "lang2id"): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Only evaluate when single GPU otherwise metrics may not average well if args.local_rank == -1 and args.evaluate_during_training: rate_weight_equal_zero = see_weight_rate(model) print('zero_rate = ', rate_weight_equal_zero) results = 
evaluate(args, model, tokenizer) print(results) record_result.append(results) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss pruning_model(model, 1/(10-pruning_step)) rate_weight_equal_zero = see_weight_rate(model) pruning_step += 1 print('zero_rate = ', rate_weight_equal_zero) print('starting rewinding') model_dict = model.state_dict() model_dict.update(ori_dict) model.load_state_dict(model_dict) print('optimizer rewinding') no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) torch.save(model, os.path.join(output_dir, "model.pt")) torch.save(args, os.path.join(output_dir, "training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) if pruning_step == 10: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if pruning_step == 10: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() torch.save(record_result, os.path.join(args.output_dir, "result.pt")) return global_step, tr_loss / global_step
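# pruning_model() and see_weight_rate() above are repo-specific helpers. As a
# rough sketch of the same prune-then-rewind idea using torch.nn.utils.prune
# (an assumption about what those helpers do, not their actual code): prune a
# fraction of the smallest-magnitude Linear weights globally, then restore the
# saved original weights so only the pruning masks persist.
import torch
import torch.nn.utils.prune as prune

def prune_and_rewind(model, ori_dict, amount):
    params = [(m, "weight") for m in model.modules()
              if isinstance(m, torch.nn.Linear)]
    prune.global_unstructured(params, pruning_method=prune.L1Unstructured,
                              amount=amount)
    # rewind: reload the saved weights; torch's prune renames weight -> weight_orig
    state = model.state_dict()
    for k, v in ori_dict.items():
        if k in state:
            state[k] = v
        elif k + "_orig" in state:
            state[k + "_orig"] = v
    model.load_state_dict(state)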
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task.", ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints will be written.", ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), "distributed_{}".format(args.local_rank)), num_choices=4, ) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: # Prepare data loader train_examples = read_swag_examples(os.path.join( args.data_dir, "train.csv"), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, "input_ids"), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, "input_mask"), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, "segment_ids"), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = (len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs) if args.local_rank != -1: num_train_optimization_steps = (num_train_optimization_steps // torch.distributed.get_world_size()) # Prepare optimizer param_optimizer = list(model.named_parameters()) # remove the pooler, which is not used and otherwise produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0, ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = (model.module if hasattr(model, "module") else model ) # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=4) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_swag_examples(os.path.join( args.data_dir, "val.csv"), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, "input_ids"), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, "input_mask"), dtype=torch.long) all_segment_ids = 
torch.tensor(select_field(eval_features, "segment_ids"), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to("cpu").numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { "eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "global_step": global_step, "loss": tr_loss / global_step, } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
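# accuracy() is called in the eval loop above but not defined in this excerpt;
# in the original SWAG example it is plain argmax-vs-label counting, roughly:
import numpy as np

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)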
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if 
args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler 
= RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, 
example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
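# warmup_linear() used in the fp16 branch above is, in pytorch-pretrained-bert,
# a plain function of training progress x in [0, 1] (reproduced here for
# reference):
def warmup_linear(x, warmup=0.002):
    # linear warmup to the peak learning rate, then linear decay toward zero
    if x < warmup:
        return x / warmup
    return 1.0 - x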
def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True ) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 tr_loss, logging_loss = 0.0, 0.0 best_dev_acc = 0.0 best_steps = 0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None, # XLM don't use segment_ids "labels": batch[3], } outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics if (args.evaluate_during_training): results = evaluate(args, model, tokenizer) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) if results["eval_acc"] > best_dev_acc: best_dev_acc = results["eval_acc"] best_steps = global_step if args.do_test: results_test = evaluate(args, model, tokenizer, test=True) for key, value in results_test.items(): tb_writer.add_scalar("test_{}".format(key), value, global_step) logger.info( "test acc: %s, loss: %s, global steps: %s", str(results_test["eval_acc"]), str(results_test["eval_loss"]), str(global_step), ) file0 = open("train_eval_logs_sqa_FIXED_180_tl.txt", "a") file0.write(str(results["eval_acc"]) + ','\ + str(results["eval_loss"]) + "," + \ str(global_step) + "\n") file0.close() tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logger.info( "Average loss: %s at global step: %s", str((tr_loss - logging_loss) / args.logging_steps), str(global_step), ) file1 = open("train_loss_logs_sqa_FIXED_180_tl.txt", "a") # append mode file1.write(str((tr_loss - logging_loss) / args.logging_steps) + "," + \ str(global_step) + "\n") file1.close() logging_loss = tr_loss if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) tokenizer.save_vocabulary(output_dir) torch.save(args, os.path.join(output_dir, 
"training_args.bin")) logger.info("Saving model checkpoint to %s", output_dir) if args.max_steps > 0 and global_step > args.max_steps: epoch_iterator.close() break if args.max_steps > 0 and global_step > args.max_steps: train_iterator.close() break if args.local_rank in [-1, 0]: tb_writer.close() return global_step, tr_loss / global_step, best_steps
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, random_state=20, test_size=0.1) Mask_train, Mask_valid, _, _ = train_test_split(attention_masks, X, random_state=20, test_size=0.1) X_train = torch.tensor(X_train).to(device).long() X_valid = torch.tensor(X_valid).to(device).long() Y_train = torch.tensor(Y_train).to(device).long() Y_valid = torch.tensor(Y_valid).to(device).long() Mask_train = torch.tensor(Mask_train).to(device).long() Mask_valid = torch.tensor(Mask_valid).to(device).long() data_train = TensorDataset(X_train, Mask_train, Y_train) data_train_sampler = RandomSampler(data_train) DL_train = DataLoader(data_train, sampler=data_train_sampler, batch_size=batch_s) data_valid = TensorDataset(X_valid, Mask_valid, Y_valid) data_valid_sampler = SequentialSampler(data_valid) DL_valid = DataLoader(data_valid, sampler=data_valid_sampler, batch_size=batch_s) model = BertForTokenClassification.from_pretrained("hfl/chinese-bert-wwm", num_labels=len(tag2idx)) model.to(device) FULL_FINETUNING = False
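# The snippet ends at FULL_FINETUNING = False; in scripts of this style the flag
# selects which parameters the optimizer updates. A sketch of the usual pattern
# (assumed, not taken from this file):
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0},
    ]
else:
    # only fine-tune the token-classification head
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]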