def __init__(self, args):
    super().__init__()
    embedding_dim = 768
    self.roberta = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', return_dict=True, output_hidden_states=True)
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = nn.LayerNorm(768)
    self.init_weights(self.dense)
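# The constructor above calls self.init_weights(self.dense), which is not defined in this
# snippet. A minimal sketch of what such a helper could look like, assuming the usual
# BERT/RoBERTa-style initialization (normal weights, zero bias); the real project may differ.
def init_weights(self, module):
    """Hypothetical helper: initialize a linear layer the way RoBERTa initializes its own."""
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=0.02)  # 0.02 matches the RoBERTa config default
        if module.bias is not None:
            module.bias.data.zero_()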
def __init__(self, args: argparse.Namespace):
    super().__init__()
    self.args = args
    self.tokenizer = RobertaTokenizer.from_pretrained(self.args.roberta_path)
    self.model = RobertaForSequenceClassification.from_pretrained(self.args.roberta_path)
    self.loss_fn = CrossEntropyLoss()
    self.metric = Accuracy(num_classes=2)
    self.num_gpus = len(str(self.args.gpus).split(","))
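# A minimal sketch of a training_step that would fit the fields initialized above
# (self.model, self.loss_fn, self.metric). The batch layout (input_ids, attention_mask,
# labels) is an assumption; the actual DataLoader in this project may pack batches differently.
def training_step(self, batch, batch_idx):
    input_ids, attention_mask, labels = batch
    logits = self.model(input_ids, attention_mask=attention_mask)[0]
    loss = self.loss_fn(logits.view(-1, 2), labels.view(-1))
    self.metric(logits.argmax(dim=-1), labels)  # accumulate binary accuracy
    return loss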
def __init__(self, args: argparse.Namespace):
    """Initialize a model, tokenizer and config."""
    super().__init__()
    self.args = args
    if isinstance(args, argparse.Namespace):
        self.save_hyperparameters(args)
    self.bert_dir = args.bert_path
    self.model = RobertaForSequenceClassification.from_pretrained(self.bert_dir)
    self.tokenizer = RobertaTokenizer.from_pretrained(self.bert_dir)
    self.loss_fn = CrossEntropyLoss()
    self.train_acc = pl.metrics.Accuracy()
    self.valid_acc = pl.metrics.Accuracy()
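# Usage sketch for the constructor above. "RobertaClassifier" is a placeholder class name and
# 'roberta-base' a placeholder for args.bert_path; the isinstance() guard means hyperparameters
# are only auto-logged when a real argparse.Namespace is passed in.
args = argparse.Namespace(bert_path='roberta-base')
model = RobertaClassifier(args)  # save_hyperparameters(args) records bert_path in the checkpoint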
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(["entailment", "neutral", "contradiction"]) # pretrain_model_dir = 'roberta-large' #'roberta-large' , 'roberta-large-mnli' pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_epoch_2_acc_4.156359461121103' #'roberta-large' , 'roberta-large-mnli' model = RobertaForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = RobertaTokenizer.from_pretrained( pretrain_model_dir, do_lower_case=args.do_lower_case) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) #MNLI-SNLI-SciTail-RTE-SICK train_examples_MNLI, dev_examples_MNLI = processor.get_MNLI_train_and_dev( '/export/home/Dataset/glue_data/MNLI/train.tsv', '/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv' ) #train_pu_half_v1.txt train_examples_SNLI, dev_examples_SNLI = processor.get_SNLI_train_and_dev( '/export/home/Dataset/glue_data/SNLI/train.tsv', '/export/home/Dataset/glue_data/SNLI/dev.tsv') train_examples_SciTail, dev_examples_SciTail = processor.get_SciTail_train_and_dev( '/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_train.tsv', '/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_dev.tsv') train_examples_RTE, dev_examples_RTE = processor.get_RTE_train_and_dev( '/export/home/Dataset/glue_data/RTE/train.tsv', '/export/home/Dataset/glue_data/RTE/dev.tsv') train_examples_ANLI, dev_examples_ANLI = processor.get_ANLI_train_and_dev( 'train', 'dev', '/export/home/Dataset/para_entail_datasets/ANLI/anli_v0.1/') train_examples = train_examples_MNLI + train_examples_SNLI + train_examples_SciTail + train_examples_RTE + train_examples_ANLI dev_examples_list = [ dev_examples_MNLI, dev_examples_SNLI, dev_examples_SciTail, dev_examples_RTE, dev_examples_ANLI ] dev_task_label = [0, 0, 1, 1, 0] task_names = ['MNLI', 'SNLI', 'SciTail', 'RTE', 'ANLI'] '''filter challenging neighbors''' neighbor_id_list = [] readfile = codecs.open('neighbors_indices_before_dropout_eud.v3.txt', 'r', 'utf-8') for line in readfile: neighbor_id_list.append(int(line.strip())) readfile.close() print('neighbor_id_list size:', len(neighbor_id_list)) truncated_train_examples = [train_examples[i] for i in neighbor_id_list] train_examples = truncated_train_examples num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 max_dev_acc = 0.0 train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=0) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_task_label_ids = torch.tensor([f.task_label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_task_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True) '''dev data to features''' valid_dataloader_list = [] for valid_examples_i in dev_examples_list: valid_features = convert_examples_to_features( valid_examples_i, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** valid_examples *****") logger.info(" Num examples = %d", len(valid_examples_i)) valid_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long) valid_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long) valid_segment_ids = torch.tensor( [f.segment_ids for f in valid_features], dtype=torch.long) valid_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long) valid_task_label_ids = torch.tensor( [f.task_label for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_input_ids, valid_input_mask, valid_segment_ids, valid_label_ids, valid_task_label_ids) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.eval_batch_size) valid_dataloader_list.append(valid_dataloader) iter_co = 0 for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, task_label_ids = batch logits = model(input_ids, input_mask, None, labels=None) prob_matrix = F.log_softmax(logits[0].view(-1, num_labels), dim=1) '''this step *1.0 is very important, otherwise bug''' new_prob_matrix = prob_matrix * 1.0 '''change the entail prob to p or 1-p''' changed_places = torch.nonzero(task_label_ids, as_tuple=False) new_prob_matrix[changed_places, 0] = 1.0 - prob_matrix[changed_places, 0] loss = F.nll_loss(new_prob_matrix, label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 # if iter_co % len(train_dataloader) ==0: if iter_co % (len(train_dataloader) // 5) == 0: ''' start evaluate on dev set after this epoch ''' # if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): # model = torch.nn.DataParallel(model) model.eval() for m in model.modules(): if isinstance(m, torch.nn.BatchNorm2d): m.track_running_stats = False # logger.info("***** Running evaluation *****") # logger.info(" Num examples = %d", len(valid_examples_MNLI)) # logger.info(" Batch size = %d", args.eval_batch_size) dev_acc_sum = 0.0 for idd, valid_dataloader in enumerate(valid_dataloader_list): task_label = dev_task_label[idd] eval_loss = 0 nb_eval_steps = 0 preds = [] gold_label_ids = [] # print('Evaluating...', task_label) # for _, batch in enumerate(tqdm(valid_dataloader, desc=task_names[idd])): for _, batch in enumerate(valid_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, task_label_ids = batch if task_label == 0: gold_label_ids += list( label_ids.detach().cpu().numpy()) else: '''SciTail, RTE''' task_label_ids_list = list( task_label_ids.detach().cpu().numpy()) gold_label_batch_fake = list( label_ids.detach().cpu().numpy()) for ex_id, label_id in enumerate( gold_label_batch_fake): if task_label_ids_list[ex_id] == 0: gold_label_ids.append(label_id) #0 else: gold_label_ids.append(1) #1 with torch.no_grad(): logits = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=None, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1) pred_label_ids_3way = np.argmax(pred_probs, axis=1) if task_label == 0: '''3-way tasks MNLI, SNLI, ANLI''' pred_label_ids = pred_label_ids_3way else: '''SciTail, RTE''' pred_label_ids = [] for pred_label_i in pred_label_ids_3way: if pred_label_i == 0: pred_label_ids.append(0) else: pred_label_ids.append(1) assert len(pred_label_ids) == len(gold_label_ids) hit_co = 0 for k in range(len(pred_label_ids)): if pred_label_ids[k] == gold_label_ids[k]: hit_co += 1 test_acc = hit_co / len(gold_label_ids) dev_acc_sum += test_acc print(task_names[idd], ' dev acc:', test_acc) '''store the model, because we can test after a max_dev acc reached''' model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training store_transformers_models( model_to_save, tokenizer, '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/', 'RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_' + str(epoch_i) + '_acc_' + str(dev_acc_sum))
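# The evaluation loop above folds the 3-way NLI predictions (entailment / neutral /
# contradiction) into binary labels for the 2-way tasks (SciTail, RTE): class 0 stays
# "entailment", classes 1 and 2 both become "not entailment". A compact, vectorized sketch
# of that mapping (numpy assumed imported as np, as in the script):
def collapse_three_way_to_binary(pred_label_ids_3way):
    """Map {0: entail, 1: neutral, 2: contradiction} onto {0: entail, 1: not-entail}."""
    pred = np.asarray(pred_label_ids_3way)
    return np.where(pred == 0, 0, 1)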
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,  # 2 classes
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    '''update the roberta parameters by my 3-way model'''
    model_roberta = RobertaForSequenceClassification.from_pretrained(
        '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_51_acc_4.199802825942953',
        num_labels=3)
    # model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', num_labels=3)
    model.roberta.load_state_dict(model_roberta.roberta.state_dict())

    # Get datasets
    train_dataset = (GlueDataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)
    test_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="test",
                                cache_dir=model_args.cache_dir)
                    if training_args.do_predict else None)

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            eval_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="dev",
                            cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info(" %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm")
            test_datasets.append(
                GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, mode="test",
                            cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
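# Note on the weight transfer above: only the shared encoder (model.roberta) is copied from the
# 3-way entailment checkpoint; the classification heads are left untouched because their output
# sizes differ (3 labels vs. num_labels for the GLUE task). A minimal sketch of the same pattern
# with public checkpoints (the model names here are illustrative, not the ones used above):
donor = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', num_labels=3)
target = AutoModelForSequenceClassification.from_pretrained('roberta-large', num_labels=2)
target.roberta.load_state_dict(donor.roberta.state_dict())  # encoder only; the new head stays randomly initialized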
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] num_labels = len(["entailment", "neutral", "contradiction"]) pretrain_model_dir = 'roberta-large-mnli' #'roberta-large' , 'roberta-large-mnli' model = RobertaForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = RobertaTokenizer.from_pretrained( pretrain_model_dir, do_lower_case=args.do_lower_case) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) valid_examples_MNLI, label_list_MNLI = processor.get_MNLI_as_train( '/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv') valid_features = convert_examples_to_features( valid_examples_MNLI, label_list_MNLI, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=0) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** valid_examples *****") logger.info(" Num examples = %d", len(valid_examples_MNLI)) valid_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long) valid_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long) valid_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long) valid_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_input_ids, valid_input_mask, valid_segment_ids, valid_label_ids) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.eval_batch_size) #MNLI-SNLI-SciTail-RTE-SICK train_examples_MNLI, label_list_MNLI = processor.get_MNLI_as_train( '/export/home/Dataset/glue_data/MNLI/train.tsv') #train_pu_half_v1.txt train_examples_SNLI, label_list_SNLI = processor.get_SNLI_as_train( '/export/home/Dataset/glue_data/SNLI/train.tsv') # train_examples_SciTail, label_list_SciTail = processor.get_SciTail_as_train('/export/home/Dataset/SciTailV1/tsv_format/scitail_1.0_train.tsv') # train_examples_RTE, label_list_RTE = processor.get_RTE_as_train('/export/home/Dataset/glue_data/RTE/train.tsv') # train_examples_SICK = processor.get_SICK_as_train('/export/home/Dataset/glue_data/RTE/train.tsv') '''iter over each dataset''' dataset_name_list = ['MNLI', 'SNLI'] dataset_list = [train_examples_MNLI, train_examples_SNLI] dataset_label_list = [label_list_MNLI, label_list_SNLI] for dataset_id, train_examples in enumerate(dataset_list): label_list = dataset_label_list[dataset_id] num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 max_dev_acc = 0.0 train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, input_mask, None, labels=None) loss_fct = CrossEntropyLoss() loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 if iter_co % 500: print('loss........:', loss) if iter_co % len(train_dataloader) == 0: ''' start evaluate on MNLI dev set after this epoch ''' model.eval() logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(valid_examples_MNLI)) logger.info(" Batch size = %d", args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] gold_label_ids = [] print('Evaluating...') for input_ids, input_mask, segment_ids, label_ids in valid_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) gold_label_ids += list( label_ids.detach().cpu().numpy()) with torch.no_grad(): logits = model(input_ids, input_mask, None, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1) pred_label_ids = np.argmax(pred_probs, axis=1) assert len(pred_label_ids) == len(gold_label_ids) hit_co = 0 for k in range(len(pred_label_ids)): if pred_label_ids[k] == gold_label_ids[k]: hit_co += 1 test_acc = hit_co / len(gold_label_ids) print('valid acc:', test_acc) '''store the model, because we can test after a max_dev acc reached''' store_transformers_models( model, tokenizer, '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/', '->'.join(dataset_name_list[:dataset_id + 1]))
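# store_transformers_models() is called in the training loops above but not defined in these
# snippets. A plausible minimal implementation, assuming it simply writes the model and tokenizer
# into a named subdirectory with the standard save_pretrained() API (the real helper may also log
# metrics or handle remote storage):
import os

def store_transformers_models(model, tokenizer, output_dir, flag_str):
    """Hypothetical helper: save model + tokenizer under output_dir/flag_str."""
    save_dir = os.path.join(output_dir, flag_str)
    os.makedirs(save_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model  # unwrap DataParallel
    model_to_save.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)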
def __init__(self, args):
    super().__init__()
    embedding_dim = 768
    self.roberta = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', return_dict=True, output_hidden_states=True)
    # self.bert.config = sm.config
    # self.roberta.resize_token_embeddings(119567)
    # self.tie_weights()
    self.dense = nn.Linear(embedding_dim, embedding_dim)
    self.layer_norm = nn.LayerNorm(768)
    self.init_weights(self.dense)

    field = args.field
    if field == 'sparse_16_title':
        self.his_len = 16
        self.set_len = 32
    elif field == 'sparse_60_title':
        self.his_len = 60
        self.set_len = 32
    elif field == 'sparse_60_cat':
        self.his_len = 60
        self.set_len = 32
    elif field == 'sparse_20_cat_abs':
        self.his_len = 20
        self.set_len = 96
    elif field == 'sparse_120_title':
        self.his_len = 120
        self.set_len = 32
    elif field == 'sparse_120_cat':
        self.his_len = 120
        self.set_len = 32
    elif field == 'sparse_40_cat_abs':
        self.his_len = 40
        self.set_len = 96
    elif field == 'sparse_60_cat_abs':
        self.his_len = 60
        self.set_len = 96
    elif field == 'sparse_60_title_last':
        self.his_len = 60
        self.set_len = 32
    elif field == 'sparse_60_cat_last':
        self.his_len = 60
        self.set_len = 32
    elif field == 'sparse_80_title_reverse':
        self.his_len = 80
        self.set_len = 32
    elif field == 'sparse_80_title_non_reverse':
        self.his_len = 80
        self.set_len = 32
    elif field == 'sparse_16_title_reverse':
        self.his_len = 16
        self.set_len = 32
    elif field == 'sparse_16_title_non_reverse':
        self.his_len = 16
        self.set_len = 32

    sm = SparseRobertaForSequenceClassification.from_pretrained(
        'roberta-base', return_dict=True, output_hidden_states=True)
    if 'reverse' in field:
        sm.make_long_and_sparse(
            self.his_len * self.set_len, "variable", 16, False,
            [32] * int(self.set_len * self.his_len / 512), [0])
        self.atten_mask = torch.zeros(
            (self.his_len * self.set_len, self.his_len * self.set_len))
    elif 'last' in field:
        sm.make_long_and_sparse(
            self.his_len * self.set_len + 10 * 64, "longformer", 16, True,
            self.set_len,
            list(range(0, int(self.set_len / 16) * self.his_len, int(self.set_len / 16))))
        self.atten_mask = torch.zeros(
            (self.his_len * self.set_len + 10 * 64,
             self.his_len * self.set_len + 10 * 64))
    else:
        sm.make_long_and_sparse(
            self.his_len * self.set_len, "longformer", 16, True,
            self.set_len,
            list(range(0, int(self.set_len / 16) * self.his_len, int(self.set_len / 16))))
        self.atten_mask = torch.zeros(
            (self.his_len * self.set_len, self.his_len * self.set_len))
    self.sparse_roberta = sm.roberta

    self.atten_mask[0, :] = 1
    if 'non_reverse' in field:
        self.atten_mask[:, 0] = 1
        # self.atten_mask[0:512,0:512]=1
        # self.atten_mask[512:1024,512:1024]=1
        # self.atten_mask[1024:1536,1024:1536]=1
        # self.atten_mask[1536:2048,1536:2048]=1
        # self.atten_mask[2048:2560,2048:2560]=1
        for item in range(0, self.set_len * self.his_len, 512):
            self.atten_mask[item:item + 512, item:item + 512] = 1
    elif 'reverse' in field:
        self.atten_mask = None
        # if 'reverse' in field:
        #     self.atten_mask[:,0]=1
        #     for item in range(0,self.set_len*self.his_len,512):
        #         self.atten_mask[item:item+512,item:item+512]=1
    elif 'last' not in field:
        self.atten_mask[:, 1] = 1
        for item in range(0, int(self.set_len / 16) * self.his_len, int(self.set_len / 16)):
            start = item * 16
            # self.atten_mask[start,:]=1  # global
            self.atten_mask[:, start] = 1
        for item in range(self.his_len):
            self.atten_mask[item * self.set_len:(item + 1) * self.set_len,
                            item * self.set_len:(item + 1) * self.set_len] = 1
    else:
        self.atten_mask[:, 1] = 1
        for item in range(0, int(self.set_len / 16) * (self.his_len - 10), int(self.set_len / 16)):
            start = item * 16
            self.atten_mask[:, start] = 1
        for item in range(
                int(self.set_len / 16) * (self.his_len - 10),
                int(self.set_len / 16) * (self.his_len - 10) + int((self.set_len + 64) / 16) * 10,
                int((self.set_len + 64) / 16)):
            start = item * 16
            self.atten_mask[:, start] = 1
        for item in range(self.his_len - 10):
            self.atten_mask[item * self.set_len:(item + 1) * self.set_len,
                            item * self.set_len:(item + 1) * self.set_len] = 1
        for item in range(self.his_len - 10, self.his_len):
            start = 50 * self.set_len + (item - 50) * (self.set_len + 64)
            end = start + (self.set_len + 64)
            self.atten_mask[start:end, start:end] = 1
            # print('????', start, end, self.atten_mask[start:end, start:end])

    self.field = field
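# The mask construction above mixes several cases; the core pattern is a block-diagonal local
# window (one block per history item) plus a handful of globally visible token columns. A small
# self-contained sketch of just that pattern, with illustrative sizes (not the real his_len /
# set_len values used above):
import torch

def block_diag_with_global(his_len=4, set_len=8, global_cols=(0,)):
    """Hypothetical illustration: 1 = may attend, 0 = masked."""
    size = his_len * set_len
    mask = torch.zeros(size, size)
    for i in range(his_len):  # local block per history item
        mask[i * set_len:(i + 1) * set_len, i * set_len:(i + 1) * set_len] = 1
    for col in global_cols:   # globally attended columns
        mask[:, col] = 1
    return mask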
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.per_gpu_train_batch_size * max(1, n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, n_gpu) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = [0, 1] num_labels = len(label_list) pretrain_model_dir = 'roberta-large' #'roberta-large' , 'roberta-large-mnli' # pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_epoch_2_acc_4.156359461121103' #'roberta-large' , 'roberta-large-mnli' model = RobertaForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = RobertaTokenizer.from_pretrained( pretrain_model_dir, do_lower_case=args.do_lower_case) '''update the roberta parameters by my 3-way model''' # model_roberta = RobertaForSequenceClassification.from_pretrained('/export/home/Dataset/BERT_pretrained_mine/TrainedModelReminder/RoBERTa_on_MNLI_SNLI_SciTail_RTE_ANLI_SpecialToken_Filter_1_epoch_51_acc_4.199802825942953', num_labels=3) # model_roberta = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli', num_labels=3) # model.roberta.load_state_dict(model_roberta.roberta.state_dict()) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # processor.prepare_MRPC_labeled_set() # exit(0) train_examples, dev_examples, test_examples = processor.get_MRPC( '/export/home/Dataset/glue_data/MRPC/') # train_examples = train_examples_MNLI+train_examples_SNLI+train_examples_SciTail+train_examples_RTE+train_examples_ANLI dev_examples_list = [dev_examples, test_examples] num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 max_dev_acc = 0.0 train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], pad_token_segment_id=0) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # all_task_label_ids = torch.tensor([f.task_label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, drop_last=True) '''dev data to features''' valid_dataloader_list = [] for valid_examples_i in dev_examples_list: valid_features = convert_examples_to_features( valid_examples_i, label_list, args.max_seq_length, tokenizer, output_mode, cls_token_at_end= False, #bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, cls_token_segment_id=0, #2 if args.model_type in ['xlnet'] else 0, sep_token=tokenizer.sep_token, sep_token_extra= True, #bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left= False, #bool(args.model_type in ['xlnet']), # pad on the left for xlnet pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token ])[0], pad_token_segment_id=0 ) #4 if args.model_type in ['xlnet'] else 0,) logger.info("***** valid_examples *****") logger.info(" Num examples = %d", len(valid_examples_i)) valid_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long) valid_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long) valid_segment_ids = torch.tensor( [f.segment_ids for f in valid_features], dtype=torch.long) valid_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long) # valid_task_label_ids = torch.tensor([f.task_label for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_input_ids, valid_input_mask, valid_segment_ids, valid_label_ids) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.eval_batch_size) valid_dataloader_list.append(valid_dataloader) iter_co = 0 max_dev_acc = 0.0 max_dev_f1 = 0.0 max_test_acc = 0.0 max_test_f1 = 0.0 for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch logits = model(input_ids, input_mask, None, labels=None) prob_matrix = F.log_softmax(logits[0].view(-1, num_labels), dim=1) loss = F.nll_loss(prob_matrix, label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 if iter_co % len(train_dataloader) == 0: # if iter_co % (len(train_dataloader)//5) ==0: ''' start evaluate on dev set after this epoch ''' # if n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): # model = torch.nn.DataParallel(model) model.eval() dev_acc_sum = 0.0 for idd, valid_dataloader in enumerate(valid_dataloader_list): preds = [] gold_label_ids = [] for _, batch in enumerate(valid_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch gold_label_ids += list( label_ids.detach().cpu().numpy()) with torch.no_grad(): logits = model(input_ids=input_ids, attention_mask=input_mask, token_type_ids=None, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1) pred_label_ids = np.argmax(pred_probs, axis=1) assert len(pred_label_ids) == len(gold_label_ids) hit_co = 0 overlap = 0 for k in range(len(pred_label_ids)): if pred_label_ids[k] == gold_label_ids[k]: hit_co += 1 if gold_label_ids[k] == 1: overlap += 1 test_acc = hit_co / len(gold_label_ids) precision = overlap / (1e-6 + sum(pred_label_ids)) recall = overlap / (1e-6 + sum(gold_label_ids)) f1 = 2 * precision * recall / (precision + recall + 1e-6) if idd == 0: # is dev if f1 > max_dev_f1: max_dev_f1 = f1 if test_acc > max_dev_acc: max_dev_acc = test_acc print('\ncurrent dev f1:', f1, ' acc:', test_acc, ' max dev f1:', max_dev_f1, 'max_dev_acc:', max_dev_acc) else: print('\ncurrent dev f1:', f1, ' acc:', test_acc, ' max dev f1:', max_dev_f1, 'max_dev_acc:', 
max_dev_acc) break else: # test if f1 > max_test_f1: max_test_f1 = f1 if test_acc > max_test_acc: max_test_acc = test_acc print('\ncurrent test f1:', f1, ' acc:', test_acc, ' max test f1:', max_test_f1, 'max_test_acc:', max_test_acc) else: print('\ncurrent test f1:', f1, ' acc:', test_acc, ' max test f1:', max_test_f1, 'max_test_acc:', max_test_acc)
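# The dev/test scoring above computes accuracy and F1 for the positive class (label 1) by hand.
# An equivalent vectorized sketch with numpy (same quantities, same epsilon smoothing):
def binary_prf(pred_label_ids, gold_label_ids, eps=1e-6):
    pred = np.asarray(pred_label_ids)
    gold = np.asarray(gold_label_ids)
    acc = float((pred == gold).mean())
    tp = int(((pred == 1) & (gold == 1)).sum())  # "overlap" in the loop above
    precision = tp / (eps + pred.sum())
    recall = tp / (eps + gold.sum())
    f1 = 2 * precision * recall / (precision + recall + eps)
    return acc, precision, recall, f1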