def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = None if args.do_predict else load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = None if args.do_predict else load_and_cache_examples(args, tokenizer, mode="test")

    if args.do_train:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.train()

    if args.do_eval:
        trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
        trainer.load_model()
        trainer.evaluate("test")

    if args.do_predict:
        predict = Predict(args, tokenizer)
        predict.load_model()
        sentences = [args.sentence]
        result_json = dict()
        result_json['result'] = int(predict.predict(sentences))
        print(json.dumps(result_json, ensure_ascii=False))
def main(args):
    init_logger()
    set_seed(args)

    if args.logger:
        neptune.init("wjdghks950/NumericHGN")
        neptune.create_experiment(name="({}) NumHGN_{}_{}_{}".format(
            args.task, args.train_batch_size, args.max_seq_len, args.train_file))
        neptune.append_tag("BertForSequenceClassification", "finetuning", "num_augmented_HGN")

    tokenizer = load_tokenizer(args)
    train_dataset = dev_dataset = test_dataset = None
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
        dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
        # test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = ParaSelectorTrainer(args, train_dataset, dev_dataset)

    if args.do_train:
        trainer.train()
        trainer.save_model()

    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("dev")

    if args.logger:
        neptune.stop()
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)

    train_dataset = None
    dev_dataset = None
    test_dataset = None
    if args.do_train or args.do_eval:
        test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)
    predictor = Predict(args)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test", "eval")
    if args.do_predict:
        predictor.predict()
def main(args):
    init_logger()  # set up logging output
    # With a fixed random seed, training results stay reproducible.
    set_seed(args)
    tokenizer = load_tokenizer(args)  # load the tokenizer of the pretrained model
    device_ = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Build features from the examples for each split.
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    # snips_tensors is a 2-D matrix holding the [CLS] output of every sentence in the SNIPS dataset.
    # Data format:
    #   TensorDataset(all_corpus, all_input_ids, all_attention_mask,
    #                 all_token_type_ids, all_intent_label_ids, all_slot_labels_ids)
    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()
        trainer.evaluate("test")
def main(CFG, args):
    # Fix all random seeds for reproducibility.
    random.seed(CFG.seed)
    np.random.seed(CFG.seed)
    torch.manual_seed(CFG.seed)
    torch.cuda.manual_seed_all(CFG.seed)
    os.environ["PYTHONHASHSEED"] = str(CFG.seed)
    torch.backends.cudnn.deterministic = True
    # NOTE: benchmark=True lets cuDNN pick the fastest (possibly nondeterministic)
    # kernels, which can undercut the deterministic flag above.
    torch.backends.cudnn.benchmark = True

    # Set tokenizer and model
    tokenizer = transformers.AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
    tokenizer.add_tokens(['<e1>', '</e1>', '<e2>', '</e2>'], special_tokens=True)
    # model.resize_token_embeddings(tokenizer.vocab_size + 4)
    print(tokenizer)

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")
    print(train_dataset)
    print(test_dataset)

    trainer = Trainer(CFG, args, train_dataset=train_dataset, test_dataset=test_dataset)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
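# --- Hedged sketch (added; not part of the original snippet) ---
# The commented-out resize above uses tokenizer.vocab_size + 4, which is fragile
# because vocab_size does not count added tokens. A minimal sketch of the usual
# Hugging Face pattern; the checkpoint name here is an illustrative assumption.
import transformers

tok = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")
mdl = transformers.AutoModel.from_pretrained("bert-base-uncased")
tok.add_tokens(['<e1>', '</e1>', '<e2>', '</e2>'], special_tokens=True)
# len(tok) counts the base vocabulary plus added tokens, so it stays correct
# even if some of the new tokens already existed in the vocabulary.
mdl.resize_token_embeddings(len(tok))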
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)

    train_dataset = None
    dev_dataset = None
    test_dataset = None
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
        dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    if args.do_eval:
        test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
    if args.do_pred:
        trainer.load_model()
        texts = read_prediction_text(args)
        trainer.predict(texts, tokenizer)
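# --- Hedged sketch (added; not part of the original snippet) ---
# A minimal argparse entry point that could drive a main(args) like the one
# above. The flag names are assumptions inferred from the attributes these
# snippets read (do_train, do_eval, do_pred); real repos define many more.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--do_train", action="store_true", help="Run training.")
    parser.add_argument("--do_eval", action="store_true", help="Evaluate on the test set.")
    parser.add_argument("--do_pred", action="store_true", help="Predict on raw input text.")
    main(parser.parse_args())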
def main():
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # ---------------- Select CUDA mode ----------------
    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = 1

    # Produce data
    train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu)
    eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu)
    train_iter = load_and_cache_examples(mode='train', train_batch_size=train_batch_size, eval_batch_size=eval_batch_size)
    eval_iter = load_and_cache_examples(mode='dev', train_batch_size=train_batch_size, eval_batch_size=eval_batch_size)

    # epoch_size = num_train_steps * train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
    # pbar = ProgressBar(epoch_size=epoch_size, batch_size=train_batch_size)

    if args.model_type == 'bert':
        model = Bert_SenAnalysis.from_pretrained(args.bert_model, num_tag=len(args.labels))
    elif args.model_type == 'xlnet':
        config = XLNetConfig.from_pretrained(args.xlnet_model, num_labels=len(args.labels))
        model = XLNet_SenAnalysis.from_pretrained(args.xlnet_model, config=config)

    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)

    train_iter = cycle(train_iter)
    fit(model=model,
        training_iter=train_iter,
        eval_iter=eval_iter,
        # train_steps=args.train_steps,
        # pbar=pbar,
        num_train_steps=args.train_steps,
        device=device,
        n_gpu=n_gpu,
        verbose=1)
def main(args):
    if os.path.exists(args.model_dir) and len(os.listdir(args.model_dir)) > 0:
        print("The model output path '%s' already exists and is not empty." % args.model_dir)
        return

    init_logger(args)
    set_seed(args)
    tokenizer = load_tokenizer(args.model_name_or_path)

    logger.info("******* Running with the following arguments *********")
    for a in vars(args):
        logger.info(a + " = " + str(getattr(args, a)))
    logger.info("***********************************************")

    train_dataset, train_examples = load_and_cache_examples(args, tokenizer, mode="train")
    train_examples = {example.guid: example for example in train_examples}
    dev_dataset, dev_examples = load_and_cache_examples(args, tokenizer, mode="dev")
    dev_examples = {example.guid: example for example in dev_examples}
    test_dataset, test_examples = load_and_cache_examples(args, tokenizer, mode="test")
    test_examples = {example.guid: example for example in test_examples}

    if args.align_languages:
        alignment_dataset = generate_alignment_pairs(args=args)
    else:
        alignment_dataset = None

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset,
                      train_examples, dev_examples, test_examples, tokenizer, alignment_dataset)

    if args.do_train:
        trainer.load_model(final_eval=False)
        logger.info(trainer.model)
        trainer.train()
        if args.task == Tasks.PAWS_X.value:
            trainer.evaluate_pair("dev", exp_name=args.model_dir)
        else:
            trainer.evaluate_xnlu("dev", exp_name=args.model_dir)
        if args.save_model:
            trainer.save_model()

    if args.do_eval:
        if not args.do_train:
            trainer.load_model(final_eval=True)
        if args.task == Tasks.PAWS_X.value:
            trainer.evaluate_pair("dev", exp_name=args.model_dir)
            trainer.evaluate_pair("test", exp_name=args.model_dir)
        else:
            trainer.evaluate_xnlu("dev", exp_name=args.model_dir)
            trainer.evaluate_xnlu("test", exp_name=args.model_dir)
def main(args):
    init_logger()  # set up logging output
    tokenizer = load_tokenizer(args)  # load the pretrained tokenizer
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    # if args.do_train:
    #     trainer.train()
    # if args.do_eval:
    #     trainer.load_model()
    #     trainer.evaluate("test")

    @app.route('/pred_term', methods=['GET', 'POST'])
    def get_data():
        if request.method == 'POST':
            argsJson = request.data.decode('utf-8')
            argsJson = json.loads(argsJson)
            (title, texts), = argsJson.items()

            # Tokenize with jieba and count word frequencies.
            jieba_text = " ".join(jieba.cut(texts, cut_all=False))
            jieba_text = jieba_text.split()
            jieba_word_dict = {}
            for i in jieba_text:
                if i not in jieba_word_dict:
                    jieba_word_dict[i] = 1
                else:
                    jieba_word_dict[i] += 1

            # Term recognition: insert spaces between characters (character-level
            # tokens), split the text into sentences, and predict slot labels.
            texts = " ".join(texts)
            texts = texts.split('。')
            if len(texts[-1]) == 0:
                texts = texts[:-1]
            slot_preds_list = trainer.predict(texts, tokenizer)

            new_texts = []
            for t in texts:
                new_texts.append(t.strip().split())
            term_weight = get_tf_idf(new_texts, slot_preds_list, jieba_word_dict)
            term_weight = json.dumps(term_weight, ensure_ascii=False)
            return term_weight
        else:
            return "it's not a POST operation!"

    if args.do_pred:
        trainer.load_model()
        # texts = read_prediction_text(args)
        app.run(host='0.0.0.0', port=5001)
def main():
    config = Config('config.ini')
    init_logger()
    tokenizer = load_tokenizer(config)
    train_dataset = load_and_cache_examples(config, tokenizer, evaluate=False)
    test_dataset = load_and_cache_examples(config, tokenizer, evaluate=True)

    trainer = Trainer(config, train_dataset, test_dataset)

    if config.do_train:
        trainer.train()
    if config.do_eval:
        trainer.evaluate()
def main(args):
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = None
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
def predict(self):
    if self.args.local_rank in [-1, 0]:
        model, tokenizer = self._prepare_model(args=self.args, labels=self.labels, num_labels=self.num_labels,
                                               mode='predict', model_dir=self.args.output_dir)
        test_dataset = load_and_cache_examples(args=self.args, tokenizer=tokenizer, labels=self.labels,
                                               pad_token_label_id=self.pad_token_label_id, mode='test')
        result, predictions = self._evaluate(self.args, model, test_dataset, self.labels,
                                             self.pad_token_label_id, mode='test', prefix="")

        output_test_results_file = os.path.join(self.args.output_dir, "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))

        # Save predictions
        output_test_predictions_file = os.path.join(self.args.output_dir, "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(self.args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
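# --- Hedged illustration (added; not part of the original snippet) ---
# The prediction-writing loop above assumes a CoNLL-style test.txt: one token
# per line (the token is the first whitespace-separated field), blank lines
# between sentences, and optional "-DOCSTART-" document markers, e.g.:
#
#   -DOCSTART- O
#
#   SOCCER O
#   JAPAN B-LOC
#
# Blank/marker lines are copied through unchanged; every other line gets the
# next predicted label for the current example appended after its token.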
def main(args):
    """The main process of SRGLHRE."""
    init_logger()
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset=train_dataset, test_dataset=test_dataset)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate('test')
def main(args):
    init_logger(args)
    set_seed(args)
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer)
    trainer = Trainer(args, train_dataset)

    if args.do_train:
        trainer.train()
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)

    print("PREPROCESSING TRAIN DATA")
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    print("PREPROCESSING DEV DATA")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    # test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    print("LOAD TRAINER")
    trainer = Trainer(args, train_dataset, dev_dataset)

    print("================TRAIN==============")
    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("dev")
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)
    wandb.init(project='R-BERT', name='R-BERT w/ One-Hot')

    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset=train_dataset, test_dataset=test_dataset)

    if args.do_train:
        trainer.train()
    if args.do_eval:
        trainer.load_model()
        trainer.evaluate("test")
def train(self):
    model, tokenizer = self._prepare_model(args=self.args, labels=self.labels, num_labels=self.num_labels,
                                           mode='train', model_dir=self.args.model_name_or_path)
    train_dataset = load_and_cache_examples(args=self.args, tokenizer=tokenizer, labels=self.labels,
                                            pad_token_label_id=self.pad_token_label_id, mode="train")
    global_step, tr_loss = self._train(self.args, train_dataset, model, tokenizer,
                                       self.labels, self.pad_token_label_id)
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
def main(args):
    init_logger()
    set_seed(args)
    tokenizer = load_tokenizer(args)
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train")
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev")
    test_dataset, dataset_id = load_and_cache_examples(args, tokenizer, mode="test")

    trainer = Trainer(args, train_dataset, dev_dataset, test_dataset)

    if args.do_train:
        trainer.train(mode="train")
    if args.do_dev:
        trainer.train(mode="dev")
    if args.do_eval:
        trainer.load_model()
        results = trainer.evaluate("test")
        print("dataset_id : ", dataset_id)
        print("results : ", results)
        results = [[data_id, result] for (data_id, result) in zip(dataset_id, results)]
        print(results)
        write_csvFile(os.path.join(args.data_dir, "result.csv"), results)
def evaluate(self):
    # TODO: implement evaluating all checkpoints later.
    if self.args.local_rank in [-1, 0]:
        model, tokenizer = self._prepare_model(args=self.args, labels=self.labels, num_labels=self.num_labels,
                                               mode='eval', model_dir=self.args.output_dir)
        eval_dataset = load_and_cache_examples(args=self.args, tokenizer=tokenizer, labels=self.labels,
                                               pad_token_label_id=self.pad_token_label_id, mode='dev')
        result, _ = self._evaluate(self.args, model, eval_dataset, self.labels,
                                   self.pad_token_label_id, mode='dev', prefix="")
        self.results.update(result)

        output_eval_file = os.path.join(self.args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(self.results.keys()):
                writer.write("{} = {}\n".format(key, str(self.results[key])))
def evaluate(self):
    args = self.args
    eval_dataset, examples, features = load_and_cache_examples(args, self.tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)

    # Eval!
    if not os.path.exists(args.model_dir):
        raise Exception("Model doesn't exist! Train first!")
    model_name = args.model_dir.split("/")[-1]
    print("model name: {}".format(model_name))
    logger.info("***** Running evaluation {} *****".format(model_name))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        self.model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = self.model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            output = [output[i].detach().cpu().tolist() for output in outputs]
            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(model_name))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(model_name))

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        self.tokenizer,
        output_prediction_file,
        output_nbest_file,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    logger.info(results)
    return results
def main(cli_args):
    # Read from config file and make args
    config_filename = "{}.json".format(cli_args.taxonomy)
    with open(os.path.join("config", config_filename)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    init_logger()
    set_seed(args)

    processor = GoEmotionsProcessor(args)
    label_list = processor.get_labels()

    config = BertConfig.from_pretrained(
        args.model_name_or_path,
        num_labels=len(label_list),
        finetuning_task=args.task,
        id2label={str(i): label for i, label in enumerate(label_list)},
        label2id={label: i for i, label in enumerate(label_list)},
    )
    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name_or_path)
    model = BertForMultiLabelClassification.from_pretrained(args.model_name_or_path, config=config)

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    train_dataset = load_and_cache_examples(args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(args, tokenizer, mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # If there is no dev dataset, only use the test dataset

    if args.do_train:
        global_step, tr_loss = train(args, model, tokenizer, train_dataset, dev_dataset, test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(global_step, tr_loss))

    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c)
            for c in sorted(glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True))
        )
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            model = BertForMultiLabelClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, test_dataset, mode="test", global_step=global_step)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
def _train(self, args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """Train the model."""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to the global_step of the last saved checkpoint from the model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don't use segment_ids

            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        # TODO: decide whether it is better to cache this dataset or reload it every time.
                        eval_dataset = load_and_cache_examples(args=self.args, tokenizer=tokenizer, labels=self.labels,
                                                               pad_token_label_id=self.pad_token_label_id, mode='dev')
                        results, _ = self._evaluate(args, model, eval_dataset, labels, pad_token_label_id,
                                                    mode="dev", prefix="")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # Save the last model when training ends.
    self._save_model(args, model, tokenizer)

    return global_step, tr_loss / global_step
def train(self):
    args = self.args
    train_dataset = load_and_cache_examples(args, self.tokenizer, evaluate=False, output_examples=False)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    num_warmup_steps = t_total * args.warmup_proportion
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # multi-gpu training
    if args.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    self.model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            self.model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            outputs = self.model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                self.model.zero_grad()
                global_step += 1

                if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    self.save_model(global_step)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    self.args.model_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
    self.save_model(global_step)

    return global_step, tr_loss / global_step