def __init__(self, args):
    torch.manual_seed(args.seed)
    self.args = args

    # Tokenizer, Generator, Discriminator
    if args.load_epoch > -1:  # NOTE: 0-indexed. Load from trained
        gen_path, dis_path = get_gan_path(self.args.model_out, self.args.load_epoch)
    else:
        gen_path, dis_path = args.bert_model, args.bert_model
    self.tokenizer = BertTokenizer.from_pretrained(gen_path)
    # TODO requires_grad = False?
    self.generator = BertForMaskedLM.from_pretrained(gen_path)
    self.discriminator = BertForSequenceClassification.from_pretrained(
        dis_path, num_labels=self.args.num_labels)

    # Optimizer
    self.optimizerG = self._get_optimizer_(self.generator)
    self.optimizerD = self._get_optimizer_(self.discriminator)

    # DataLoader
    self.msk_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'masked')
    self.org_data = load_data(args.data_in, args.maxlen, args.batch_size,
                              self.tokenizer, args.seed, 'original')

    self.mask_id = self.tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
    self.device = torch.device("cuda:0" if args.cuda else "cpu")
    self.generator.to(self.device)
    self.discriminator.to(self.device)
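The constructor above calls a _get_optimizer_ helper that is not shown in this snippet. Below is a minimal sketch of one plausible implementation, assuming plain AdamW with the usual no-weight-decay parameter groups; the lr attribute name and the 0.01 decay value are assumptions, not taken from the original code.

# Hypothetical sketch of the trainer's _get_optimizer_ method (the real one is
# not shown above); AdamW, the args.lr attribute and the 0.01 decay are assumptions.
from torch.optim import AdamW

def _get_optimizer_(self, model):
    # Common BERT practice: no weight decay on biases and LayerNorm parameters.
    no_decay = ('bias', 'LayerNorm.weight')
    grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    return AdamW(grouped_parameters, lr=getattr(self.args, 'lr', 2e-5))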
def main():
    batch_size = 16
    max_seq_len = 128
    model_dir = 'fine_tuned--bert-base-uncased--SEQ_LEN=128--BATCH_SIZE=32--HEAD=1'
    output_filename = os.path.join(
        model_dir, "fine-tuned-sent-classifer-test-results.csv")
    test_sets_dir = os.path.join("dataset", "custom_test_set")
    test_files = os.listdir(test_sets_dir)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    model = BertForSequenceClassification.from_pretrained(model_dir)
    model.to(device)
    criterion = Softmax(dim=-1)

    accuracies = {}
    for filename in test_files:
        print("Testing on dataset: {}".format(filename))
        file_path = os.path.join(test_sets_dir, filename)
        test_dataset = Dataset(file_path, tokenizer, max_seq_len)
        test_dataloader = data.DataLoader(test_dataset, batch_size=batch_size)

        accuracy = 0
        for batch in test_dataloader:
            with torch.no_grad():
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, labels = batch
                outputs = model(input_ids, input_mask, segment_ids)
                logits = outputs[0]
                _, predictions = criterion(logits).max(-1)
                results = predictions == labels
                accuracy += results.sum().item()

        accuracy = accuracy / len(test_dataset)
        print("Model achieved {:.2%} accuracy".format(accuracy))
        dataset_name = filename.split('.')[0]
        accuracies[dataset_name] = accuracy

    with open(output_filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=accuracies.keys())
        writer.writeheader()
        writer.writerow(accuracies)
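The test script above depends on a Dataset class that is not included in this excerpt. The sketch below shows one plausible implementation, assuming each input file is tab-separated "text<TAB>integer_label"; the file format and field order are guesses, not taken from the original project.

# Minimal sketch of the Dataset class assumed above (the real one is not shown).
# Assumes each line of the input file is "text<TAB>integer_label".
import torch
from torch.utils import data

class Dataset(data.Dataset):
    def __init__(self, file_path, tokenizer, max_seq_len):
        self.examples = []
        with open(file_path, encoding='utf-8') as f:
            for line in f:
                text, label = line.rstrip('\n').split('\t')
                tokens = ['[CLS]'] + tokenizer.tokenize(text)[:max_seq_len - 2] + ['[SEP]']
                input_ids = tokenizer.convert_tokens_to_ids(tokens)
                input_mask = [1] * len(input_ids)
                padding = [0] * (max_seq_len - len(input_ids))
                input_ids += padding
                input_mask += padding
                segment_ids = [0] * max_seq_len
                self.examples.append((input_ids, input_mask, segment_ids, int(label)))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        input_ids, input_mask, segment_ids, label = self.examples[idx]
        return (torch.tensor(input_ids), torch.tensor(input_mask),
                torch.tensor(segment_ids), torch.tensor(label))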
def bertForSequenceClassification(*args, **kwargs):
    """
    BertForSequenceClassification is a fine-tuning model that includes
    BertModel and a sequence-level (sequence or pair of sequences) classifier
    on top of the BertModel. Note that the classification head is only
    initialized and has to be trained.

    The sequence-level classifier is a linear layer that takes as input the
    last hidden state of the first token ([CLS]) in the input sequence
    (see Figures 3a and 3b in the BERT paper).

    Args:
        num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)

        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])

        # Load bertForSequenceClassification
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()

        # Predict the sequence classification logits
        >>> with torch.no_grad():
        ...     seq_classif_logits = model(tokens_tensor, segments_tensors)

        # Or get the sequence classification loss (set model.train() first if training on this loss)
        >>> labels = torch.tensor([1])
        >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels)
    """
    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
    return model
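Because the classification head loaded by this entry point is randomly initialized, the logits in the docstring example are only meaningful after fine-tuning. The snippet below continues that example with a single illustrative training step; the optimizer choice and learning rate are assumptions, and the tuple check covers both return conventions of the hub model.

# Illustrative single fine-tuning step, continuing the docstring example above.
# AdamW and lr=2e-5 are assumptions; depending on the library version the model
# returns either the loss directly or a (loss, logits) tuple when labels are given.
import torch

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

labels = torch.tensor([1])
outputs = model(tokens_tensor, segments_tensors, labels=labels)
loss = outputs[0] if isinstance(outputs, tuple) else outputs
loss.backward()
optimizer.step()
optimizer.zero_grad()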
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=64, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=256, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    processors = {
        # "cola": ColaProcessor,
        # "mnli": MnliProcessor,
        # "mnli-mm": MnliMismatchedProcessor,
        # "mrpc": MrpcProcessor,
        # "sst-2": Sst2Processor,
        # "sts-b": StsbProcessor,
        # "qqp": QqpProcessor,
        # "qnli": QnliProcessor,
        "rte": RteProcessor
        # "wnli": WnliProcessor,
    }

    output_modes = {
        # "cola": "classification",
        # "mnli": "classification",
        # "mrpc": "classification",
        # "sst-2": "classification",
        # "sts-b": "regression",
        # "qqp": "classification",
        # "qnli": "classification",
        "rte": "classification"
        # "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()  # [0, 1]
    num_labels = len(label_list)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples, seen_types = processor.get_examples_Wikipedia_train(
            '/export/home/Dataset/wikipedia/parsed_output/tokenized_wiki/tokenized_wiki2categories.txt',
            100000)  # train_pu_half_v1.txt
        # seen_classes=[0,2,4,6,8]
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(args.local_rank))
    # model = BertForSequenceClassification.from_pretrained(args.bert_model,
    #                                                       cache_dir=cache_dir,
    #                                                       num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    pretrain_model_dir = 'bert-base-uncased'  # FineTuneOnCombined'# FineTuneOnMNLI
    model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)

    if args.fp16:
        model.half()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_unseen_acc = 0.0
    max_dev_unseen_acc = 0.0
    max_dev_seen_acc = 0.0
    max_overall_acc = 0.0

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        '''load dev set'''
        eval_examples, eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index = processor.get_examples_situation_test(
            '/export/home/Dataset/LORELEI/zero-shot-split/dev.txt', seen_types)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        eval_all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask, eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        '''load test set'''
        test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_situation_test(
            '/export/home/Dataset/LORELEI/zero-shot-split/test.txt', seen_types)
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode)

        test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        # print('train all_label_ids:', all_label_ids)
        # exit(0)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits[0].view(-1, num_labels), label_ids.view(-1))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1

                if iter_co % 200 == 0:
                    '''
                    start evaluate on dev set after this epoch
                    '''
                    model.eval()
                    logger.info("***** Running evaluation *****")
                    logger.info("  Num examples = %d", len(eval_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    eval_loss = 0
                    nb_eval_steps = 0
                    preds = []
                    print('Evaluating...')
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            logits = model(input_ids, segment_ids, input_mask, labels=None)
                        logits = logits[0]

                        loss_fct = CrossEntropyLoss()
                        tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_steps += 1
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                    eval_loss = eval_loss / nb_eval_steps
                    preds = preds[0]
                    '''
                    preds: size*2 (entail, not_entail)
                    wenpeng added a softmax so that each row is a prob vec
                    '''
                    pred_probs = softmax(preds, axis=1)[:, 0]
                    pred_binary_labels_harsh = []
                    pred_binary_labels_loose = []
                    for i in range(preds.shape[0]):
                        if preds[i][0] > preds[i][1] + 0.1:
                            pred_binary_labels_harsh.append(0)
                        else:
                            pred_binary_labels_harsh.append(1)
                        if preds[i][0] > preds[i][1]:
                            pred_binary_labels_loose.append(0)
                        else:
                            pred_binary_labels_loose.append(1)

                    seen_acc, unseen_acc = evaluate_situation_zeroshot_TwpPhasePred(
                        pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose,
                        eval_label_list, eval_hypo_seen_str_indicator, eval_hypo_2_type_index, seen_types)

                    # result = compute_metrics('F1', preds, all_label_ids.numpy())
                    loss = tr_loss / nb_tr_steps if args.do_train else None
                    # test_acc = mean_f1  # result.get("f1")
                    if unseen_acc > max_dev_unseen_acc:
                        max_dev_unseen_acc = unseen_acc
                    print('\ndev seen_f1 & unseen_f1:', seen_acc, unseen_acc,
                          ' max_dev_unseen_f1:', max_dev_unseen_acc, '\n')
                    # if seen_acc + unseen_acc > max_overall_acc:
                    #     max_overall_acc = seen_acc + unseen_acc
                    # if seen_acc > max_dev_seen_acc:
                    #     max_dev_seen_acc = seen_acc

                    '''
                    start evaluate on test set after this epoch
                    '''
                    model.eval()
                    logger.info("***** Running testing *****")
                    logger.info("  Num examples = %d", len(test_examples))
                    logger.info("  Batch size = %d", args.eval_batch_size)

                    test_loss = 0
                    nb_test_steps = 0
                    preds = []
                    print('Testing...')
                    for input_ids, input_mask, segment_ids, label_ids in test_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            logits = model(input_ids, segment_ids, input_mask, labels=None)
                        logits = logits[0]
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

                    # eval_loss = eval_loss / nb_eval_steps
                    preds = preds[0]
                    pred_probs = softmax(preds, axis=1)[:, 0]
                    pred_binary_labels_harsh = []
                    pred_binary_labels_loose = []
                    for i in range(preds.shape[0]):
                        if preds[i][0] > preds[i][1] + 0.1:
                            pred_binary_labels_harsh.append(0)
                        else:
                            pred_binary_labels_harsh.append(1)
                        if preds[i][0] > preds[i][1]:
                            pred_binary_labels_loose.append(0)
                        else:
                            pred_binary_labels_loose.append(1)

                    seen_acc, unseen_acc = evaluate_situation_zeroshot_TwpPhasePred(
                        pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose,
                        test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types)

                    # result = compute_metrics('F1', preds, all_label_ids.numpy())
                    # loss = tr_loss / nb_tr_steps if args.do_train else None
                    # test_acc = mean_f1  # result.get("f1")
                    if unseen_acc > max_test_unseen_acc:
                        max_test_unseen_acc = unseen_acc
                    print('\n\n\t test seen_f1 & unseen_f1:', seen_acc, unseen_acc,
                          ' max_test_unseen_f1:', max_test_unseen_acc, '\n')
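The dev and test blocks above apply the same harsh/loose binarization to the (entail, not_entail) logits. The helper below is a small refactoring sketch of that logic, assuming the softmax used above is scipy.special.softmax; the 0.1 margin is the value hard-coded in the script.

# Refactoring sketch (not part of the original script): the harsh/loose
# binarization applied to entailment logits in both evaluation blocks above.
import numpy as np
from scipy.special import softmax

def binarize_entailment(preds, margin=0.1):
    """preds: array of shape (n, 2) holding (entail, not_entail) logits."""
    pred_probs = softmax(preds, axis=1)[:, 0]          # entailment probability per row
    harsh = [0 if p[0] > p[1] + margin else 1 for p in preds]  # entail only if clearly ahead
    loose = [0 if p[0] > p[1] else 1 for p in preds]           # entail on any positive margin
    return pred_probs, harsh, loose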
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_test", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." 
) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) # Prepare model model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 
training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if args.do_eval and (step + 1) % ( args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) if args.do_test: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev'), ('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False)
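This script (and the ones below) call an accuracy(logits, labels) helper whose definition is not part of this excerpt. The sketch below shows the simplest reading of it, argmax over the logits compared against the gold labels; note the result is logged under eval_F1, so the real helper may compute an F1 score instead.

# Assumed behaviour of the accuracy() helper used above (its definition is not
# shown in this excerpt): fraction of rows whose argmax matches the gold label.
import numpy as np

def accuracy(out, labels):
    predictions = np.argmax(out, axis=1)
    return np.sum(predictions == labels) / float(labels.size)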
def train(self):
    model = BertForSequenceClassification.from_pretrained(
        self.args.model_name_or_path, self.args, config=self.config)
    model.to(self.device)

    logger.info('Preparing data')
    data = DATABDCI(
        debug=False,
        data_dir='/home/lsy2018/文本匹配/DATA/DATA_BDCI/',
        data_process_output='/home/lsy2018/文本匹配/DATA/DATA_BDCI/data_1014/')

    train_examples = data.read_examples(
        os.path.join(self.data_process_output, 'train.csv'))
    train_features = data.convert_examples_to_features(
        train_examples, self.tokenizer, self.max_seq_length)
    all_input_ids = torch.tensor(data.select_field(train_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(data.select_field(train_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(data.select_field(train_features, 'segment_ids'), dtype=torch.long)
    all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

    # Shuffle the training examples with a random sampler.
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=self.batch_size // self.gradient_accumulation_steps)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': self.weight_decay
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.learning_rate,
                      eps=self.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=self.warmup_steps,
                                     t_total=self.train_steps)

    best_acc = 0
    global_step = 0

    model.train()
    train_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    bar = tqdm(range(self.train_steps), total=self.train_steps)
    train_dataloader = cycle(train_dataloader)

    for step in bar:
        batch = next(train_dataloader)
        batch = tuple(t.to(self.device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss = model(input_ids=input_ids,
                     token_type_ids=segment_ids,
                     attention_mask=input_mask,
                     labels=label_ids)
        train_loss += loss.item()
        train_loss = round(
            train_loss * self.gradient_accumulation_steps / (nb_tr_steps + 1), 4)
        bar.set_description("loss {}".format(train_loss))
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1

        loss.backward()
        if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0:
            scheduler.step()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0:
            inference_labels = []
            scores = []
            gold_labels = []
            inference_logits = []
            eval_examples = data.read_examples(
                os.path.join(self.data_process_output, 'dev.csv'))
            eval_features = data.convert_examples_to_features(
                eval_examples, self.tokenizer, self.max_seq_length)
            ID1 = [x.sentence_ID1 for x in eval_examples]
            ID2 = [x.sentence_ID2 for x in eval_examples]
            all_input_ids = torch.tensor(data.select_field(eval_features, 'input_ids'), dtype=torch.long)
            all_input_mask = torch.tensor(data.select_field(eval_features, 'input_mask'), dtype=torch.long)
            all_segment_ids = torch.tensor(data.select_field(eval_features, 'segment_ids'), dtype=torch.long)
            all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", self.batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=self.batch_size)

            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            count = 0
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                # ID1_list_eachbatch = ID1[count*args.eval_batch_size:(count+1)*args.eval_batch_size]
                # ID2_list_eachbatch = ID2[count * args.eval_batch_size:(count + 1) * args.eval_batch_size]
                input_ids = input_ids.to(self.device)
                input_mask = input_mask.to(self.device)
                segment_ids = segment_ids.to(self.device)
                label_ids = label_ids.to(self.device)

                with torch.no_grad():
                    tmp_eval_loss = model(input_ids=input_ids,
                                          token_type_ids=segment_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)
                    logits = model(input_ids=input_ids,
                                   token_type_ids=segment_ids,
                                   attention_mask=input_mask)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                inference_labels.append(np.argmax(logits, axis=1))
                # scores.append(logits)
                gold_labels.append(label_ids)
                inference_logits.append(logits)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            gold_labels = np.concatenate(gold_labels, 0)
            inference_logits = np.concatenate(inference_logits, 0)
            # scores = np.concatenate(scores, 0)
            model.train()
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = accuracy(inference_logits, gold_labels)
            # eval_mrr = compute_MRR(scores, gold_labels, ID1, ID2)

            result = {
                'eval_loss': eval_loss,
                'eval_F1': eval_accuracy,
                'global_step': global_step,
                # 'mrr': eval_mrr,
                'loss': train_loss
            }

            output_eval_file = os.path.join(self.output_dir, "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write('*' * 80)
                writer.write('\n')

            if eval_accuracy > best_acc:
                print("=" * 80)
                print("Best F1", eval_accuracy)
                print("Saving Model......")
                best_acc = eval_accuracy
                # Save a trained model
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(self.output_dir, "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
                print("=" * 80)
            else:
                print("=" * 80)
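WarmupLinearSchedule used above comes from the older pytorch_transformers package. If this trainer were ported to the current transformers library, the equivalent would be get_linear_schedule_with_warmup; the wrapper below is a porting sketch, not part of the original code.

# Porting sketch: transformers' replacement for pytorch_transformers'
# WarmupLinearSchedule(optimizer, warmup_steps=..., t_total=...).
from transformers import get_linear_schedule_with_warmup

def make_linear_warmup_scheduler(optimizer, warmup_steps, t_total):
    return get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=t_total)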
def main(): parser = argparse.ArgumentParser() ## Required parameters(即required=True的参数必须在命令上出现) parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "数据集路径. The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="模型类型(这里为bert). Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "下载好的预训练模型. Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "模型预测和断点文件的存放路径. The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--config_name", default="", type=str, help= "预训练的配置名字或路径. Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help= "预训练分词器名字或路径. Pretrained tokenizer name or path if not the same as model_name" ) parser.add_argument( "--cache_dir", default="", type=str, help= "从亚马逊s3下载的预训练模型存放路径. Where do you want to store the pre-trained models downloaded from s3" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "最长序列长度. The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="是否训练. Whether to run training.") parser.add_argument("--do_test", action='store_true', help="是否测试. Whether to run testing.") parser.add_argument("--predict_eval", action='store_true', help="是否预测验证集. Whether to predict eval set.") parser.add_argument("--do_eval", action='store_true', help="是否验证. Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action='store_true', help="是否训练中跑验证. Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="是否用小写模型. Set this flag if you are using an uncased model.") parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="训练时每个GPU/CPU上的batch size. Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="验证时每个GPU/CPU上的batch size. Batch size per GPU/CPU for evaluation." ) parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "反向传播前梯度累计的次数. Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="Adam的初始学习率. The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="权重衰减系数. Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Adam的Epsilon系数. Epsilon for Adam optimizer.") parser.add_argument( "--max_grad_norm", default=1.0, type=float, help= " 如果所有参数的gradient组成的向量的L2 norm大于max norm,那么需要根据L2 norm/max_norm进行缩放。从而使得L2 norm小于预设的clip_norm. Max gradient norm." ) parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="训练epoch数. 
Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=-1, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--lstm_dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument( "--warmup_steps", default=0, type=int, help="线性warmup的steps. Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="测试集划分. text split") parser.add_argument('--logging_steps', type=int, default=50, help="日志更新steps. Log every X updates steps.") parser.add_argument( '--save_steps', type=int, default=50, help="断点文件保存steps. Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "评估所有的断点. Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="不用cuda. Avoid using CUDA when available") parser.add_argument( '--overwrite_output_dir', action='store_true', help="重写输出路径. Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="重写训练和评估的缓存. Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="初始化用的随机种子. random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "是否用16位混合精度. Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "fp16的优化level. For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="为了分布式训练. For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="远程debug用的ip. For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="远程debug用的端口. For distant debugging.") parser.add_argument("--freeze", default=0, type=int, required=False, help="冻结BERT. 
freeze bert.") parser.add_argument("--not_do_eval_steps", default=0.35, type=float, help="not_do_eval_steps.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: # 如果无指定GPU或允许使用CUDA,就使用当前所有GPU device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # 指定使用哪个GPU(local_rank代表当前程序进程使用的GPU标号) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging 初始化日志 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed 设置种子数 set_seed(args) # 创建存放路径 try: os.makedirs(args.output_dir) except: pass # 载入预训练好的BERT分词器 tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # 载入预设好的BERT配置文件 config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) # Prepare model 载入并配置好基于BERT的序列分类模型 model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) # 开启FP16 if args.fp16: model.half() model.to(device) # 如果是指定了单个GPU,用DistributedDataParallel进行GPU训练 if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) # 如果有多个GPU,就直接用torch.nn.DataParallel,会自动调用当前可用的多个GPU elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # 总batch size = GPU数量 * 每个GPU上的mbatch size args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train: # Prepare data loader 导入数据并准备符合格式的输入 train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # 如果无指定GPU就随机采样,如果指定了GPU就分布式采样 if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) # 准备dataloader train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) # 训练steps num_train_optimization_steps = args.train_steps # Prepare optimizer 准备优化器 param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] # no_dacay内的参数不参与权重衰减 # BN是固定C,[B,H,W]进行归一化处理(处理为均值0,方差1的正太分布上),适用于CNN # LN是固定N,[C,H,W]进行归一化处理,适用于RNN(BN适用于固定深度的前向神经网络,而RNN因输入序列长度不一致而深度不固定,因此BN不合适,而LN不依赖于batch的大小和输入sequence的深度,因此可以用于batchsize为1和RNN中对边长的输入sequence的normalize操作) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # 配置优化器和warmup机制 optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps // args.gradient_accumulation_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # 循环遍历 # 先做一个eval for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 
logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data 准备验证集的dataloader eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # 开启预测模式(不用dropout和BN) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: # 将数据放在GPU上 input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) # 禁止进行梯度更新 with torch.no_grad(): tmp_eval_loss, logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps # 计算验证集的预测损失 eval_accuracy = accuracy(inference_logits, gold_labels) # 计算验证集的预测准确性 result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step } # 将验证集的预测评价写入到evel_results.txt中 output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') # 如果当前训练的模型表现最佳,则保存该模型 if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) model.train() # 分batch循环迭代训练模型 for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) nb_tr_examples += input_ids.size(0) del input_ids, input_mask, segment_ids, label_ids if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_steps += 1 # 用FP16去做反向传播 if args.fp16: optimizer.backward(loss) else: loss.backward() # 梯度累计后进行更新 if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() # 梯度更新 scheduler.step() # 梯度更新 optimizer.zero_grad() # 清空现有梯度,避免累计 global_step += 1 # 每隔args.eval_steps*args.gradient_accumulation_steps,打印训练过程中的结果 if (step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # 每隔args.eval_steps*args.gradient_accumulation_steps,预测验证集并评估结果 if args.do_eval and step > num_train_optimization_steps * args.not_do_eval_steps and ( step + 1) % (args.eval_steps * args.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) result = { 
'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80) # predict on the test sets if args.do_test: del model gc.collect() # free memory args.do_train = False # stop training # load the best trained model checkpoint model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: # nn.Module.half() casts the model's float32 weights to float16 model.half() model.to(device) # move the model to the GPU # set up distributed / multi-GPU mode if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) # predict on the dev and test sets for file, flag in [('dev.csv', 'dev'), ('CSC_test.csv', 'CSC_test'), ('NS_test.csv', 'NS_test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) # save the prediction result files if flag == 'CSC_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_CSC.csv"), index=False) if flag == 'NS_test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['qid', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_NS.csv"), 
index=False) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False) # predict only on the dev set if args.predict_eval: del model gc.collect() args.do_train = False model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('dev.csv', 'dev')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) print(flag, accuracy(logits, gold_labels)) if flag == 'dev': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df[['label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub_dev.csv"), index=False)
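# A minimal, illustrative sketch of the evaluation pattern repeated in the
# script above: switch to eval mode, run batches under torch.no_grad(),
# collect logits and gold labels on the CPU, concatenate, and score. The
# (loss, logits) return convention and the metric call mirror the code above;
# the helper name `evaluate` and `metric_fn` are assumptions, not part of the
# original project.
import numpy as np
import torch


def evaluate(model, dataloader, device, metric_fn):
    """Run one pass over `dataloader` and return (mean_loss, metric)."""
    model.eval()
    all_logits, all_labels, total_loss, steps = [], [], 0.0, 0
    for input_ids, input_mask, segment_ids, label_ids in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)
        with torch.no_grad():
            # assumes the model returns (loss, logits) when labels are given
            loss, logits = model(input_ids=input_ids,
                                 token_type_ids=segment_ids,
                                 attention_mask=input_mask,
                                 labels=label_ids)
        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(label_ids.cpu().numpy())
        total_loss += loss.mean().item()
        steps += 1
    model.train()
    logits = np.concatenate(all_logits, 0)
    labels = np.concatenate(all_labels, 0)
    return total_loss / max(steps, 1), metric_fn(logits, labels)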
def train(**kwargs): # kwargs.update({'model': 'CNN'}) opt.parse(kwargs) if (opt.use_gpu): torch.cuda.set_device(opt.gpu_id) if opt.encoder == 'BERT': encoder_model = BertForSequenceClassification.from_pretrained( "./downloaded_weights/downloaded_bert_base_uncased", num_labels=opt.rel_num) # print(encoder_model) opt.encoder_out_dimension = opt.rel_num else: encoder_model = getattr(encoder_models, opt.encoder)(opt) opt.encoder_out_dimension = encoder_model.out_dimension selector_model = getattr(selector_models, opt.selector)(opt) # encoder_model = torch.nn.DataParallel(encoder_model, device_ids=[3,6]) if (opt.use_gpu): encoder_model = encoder_model.cuda() selector_model = selector_model.cuda() # Loading data DataModel = getattr(dataset, opt.data + 'Data') train_data = DataModel(opt.data_root, train=True, use_bert=opt.use_bert_tokenizer) train_data_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers, collate_fn=collate_fn) print('train data: {}'.format(len(train_data))) test_data = DataModel(opt.data_root, train=False, use_bert=opt.use_bert_tokenizer) test_data_loader = DataLoader(test_data, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, collate_fn=collate_fn) print('test data: {}'.format(len(test_data))) criterion = nn.CrossEntropyLoss() if opt.encoder == 'BERT': optimizer = AdamW( [{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr, correct_bias=True ) # To reproduce BertAdam specific behavior set correct_bias=False else: optimizer = optim.Adadelta([{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr, rho=1.0, eps=1e-6, weight_decay=opt.weight_decay) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=2, t_total=3) # PyTorch scheduler ### and used like this: # for batch in train_data: # loss = model(batch) # loss.backward() # torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue) # optimizer.zero_grad() # if opt.encoder == "BERT" and False: # optimizer = optim.SGD([ # {'params': selector_model.parameters()} # ], lr=opt.lr) # else: optimizer = optim.SGD([{ 'params': encoder_model.parameters() }, { 'params': selector_model.parameters() }], lr=opt.lr) max_pre = 0.0 max_rec = 0.0 for epoch in range(opt.num_epochs): # if opt.encoder == "BERT": encoder_model.train() selector_model.train() print("*" * 50) print("Epoch {}".format(epoch)) total_loss = 0 max_insNum = 0 for batch_num, (data, label_set) in enumerate(train_data_loader): # if (batch_num>2000): # break # label_set is the label of each bag (there may be no more than 4 labels, but we only wants the first) labels = [] outs = torch.empty([0, 53]) empty = True # if all labels of bags in one batch are zeros, then it's empty, continue to avoid error for l in label_set: if (l[0] != 0): labels.append(l[0]) empty = False if empty: continue # labels = [l[0] for l in label_set] # Each time enters {batch_size} bags # Each time I want one bag!! # The model need to give me a representation of an instance!!! 
if opt.use_gpu: labels = torch.LongTensor(labels).cuda() outs = outs.cuda() else: labels = torch.LongTensor(labels) optimizer.zero_grad() train_cor = 0 for idx, bag in enumerate(data): insNum = bag[1] # if insNum > max_insNum: # max_insNum = insNum # print(max_insNum) label = label_set[idx][0] # Label of the current bag if (label_set[idx][0] == 0): continue ins_outs = torch.empty(0) instances = bag[2] pf_list = [] mask_list = [] if opt.encoder != 'BERT': pf_list = bag[3] mask_list = bag[5] # pf_list = bag[3] ins_out = torch.empty(0) encoder_model.batch_size = insNum if opt.use_gpu: instances = torch.LongTensor(instances).cuda() if opt.encoder == 'BERT': # with torch.no_grad(): # print(instances.size(0)) if insNum > opt.max_sentence_in_bag: ins_outs = encoder_model( instances[:opt.max_sentence_in_bag])[0] else: ins_outs = encoder_model(instances)[0] # ins_outs = ins_outs[0] # print(ins_outs[0].size()) else: for idx, instance in enumerate(instances): if opt.use_gpu: pfs = torch.LongTensor(pf_list[idx]).cuda() masks = torch.LongTensor(mask_list[idx]).cuda() else: pfs = torch.LongTensor(pf_list[idx]) masks = torch.LongTensor(mask_list[idx]) if opt.encoder == 'PCNN': ins_out = encoder_model(instance, pfs, masks) else: ins_out = encoder_model(instance, pfs) if (opt.use_gpu): ins_out = ins_out.cuda() ins_outs = ins_outs.cuda() ins_outs = torch.cat((ins_outs, ins_out), 0) del instance, ins_out if idx >= opt.max_sentence_in_bag: break bag_feature = selector_model(ins_outs) if opt.use_gpu: bag_feature = bag_feature.cuda() if (torch.max(bag_feature.squeeze(), 0)[1] == label): train_cor += 1 outs = torch.cat((outs, bag_feature), 0) del ins_outs, bag_feature # outs = outs.squeeze() # print("outs.size(): ", outs.size(), '\n', "labels.size(): ", labels.size()) # print(outs,labels) loss = criterion(outs, labels) total_loss += loss.item() avg_loss = total_loss / (batch_num + 1) sys.stdout.write( "\rbatch number: {:6d}\tloss: {:7.4f}\ttrain_acc: {:7.2f}\t". format(batch_num, avg_loss, train_cor / len(labels))) sys.stdout.flush() # sys.stdout.write('\033') loss.backward() if opt.encoder == 'BERT': scheduler.step() optimizer.step() del outs, labels if (opt.skip_predict != True): with torch.no_grad(): predict(encoder_model, selector_model, test_data_loader) t = time.strftime('%m_%d_%H_%M.pth') torch.save(encoder_model.state_dict(), 'checkpoints/{}_{}'.format(opt.encoder, t)) torch.save(selector_model.state_dict(), 'checkpoints/{}_{}'.format(opt.selector, t))
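# A hedged sketch of one way to wire up AdamW with WarmupLinearSchedule
# (pytorch-transformers 1.x), as hinted at by the commented usage notes in the
# training function above. The model, dataloader, and hyperparameter values
# here are placeholders; the point is only the per-step order:
# backward -> clip -> optimizer.step() -> scheduler.step() -> zero_grad().
import torch
from pytorch_transformers import AdamW, WarmupLinearSchedule


def run_warmup_training(model, train_dataloader, num_steps, lr=2e-5,
                        warmup_steps=100, max_grad_norm=1.0):
    optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
                                     t_total=num_steps)
    model.train()
    for step, batch in enumerate(train_dataloader):
        loss = model(*batch)  # assumes the model returns a scalar loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        if step + 1 >= num_steps:
            break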
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=256, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, # "mnli-mm": MnliMismatchedProcessor, # "mrpc": MrpcProcessor, # "sst-2": Sst2Processor, # "sts-b": StsbProcessor, # "qqp": QqpProcessor, # "qnli": QnliProcessor, "rte": RteProcessor # "wnli": WnliProcessor, } output_modes = { # "cola": "classification", # "mnli": "classification", # "mrpc": "classification", # "sst-2": "classification", # "sts-b": "regression", # "qqp": "classification", # "qnli": "classification", "rte": "classification" # "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format( args.local_rank)) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_unseen_acc = 0.0 max_dev_unseen_acc = 0.0 max_dev_seen_acc = 0.0 max_overall_acc = 0.0 '''load test set''' seen_types = set() test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_emotion_test( '/export/home/Dataset/Stuttgart_Emotion/unify-emotion-datasets-master/zero-shot-split/test.txt', seen_types) test_features = convert_examples_to_features( test_examples, label_list, args.max_seq_length, BertTokenizer.from_pretrained( '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE', do_lower_case=args.do_lower_case), output_mode) test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) 
test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) ''' start evaluate on test set after this epoch ''' modelpaths = [ '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE', '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnMNLI', '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnFEVER' ] pred_probs_ensemble = 0.0 for i, modelpath in enumerate(modelpaths): # pretrain_model_dir = '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE' #FineTuneOnCombined'# FineTuneOnMNLI model = BertForSequenceClassification.from_pretrained( modelpath, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( modelpath, do_lower_case=args.do_lower_case) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) model.eval() logger.info("***** Running testing *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) test_loss = 0 nb_test_steps = 0 preds = [] print('Testing...') for input_ids, input_mask, segment_ids, label_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) # print('preds:', preds) preds = preds[0] pred_probs_i = softmax(preds, axis=1)[:, 0] pred_binary_labels_harsh = [] pred_binary_labels_loose = [] for i in range(preds.shape[0]): if preds[i][0] > preds[i][1] + 0.1: pred_binary_labels_harsh.append(0) else: pred_binary_labels_harsh.append(1) if preds[i][0] > preds[i][1]: pred_binary_labels_loose.append(0) else: pred_binary_labels_loose.append(1) seen_acc, unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred( pred_probs_i, pred_binary_labels_harsh, pred_binary_labels_loose, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types) print('seen:', seen_acc, 'unseen:', unseen_acc) print('\n\n this model preds over\n\n\n') if i == 0: pred_probs_ensemble = softmax(preds, axis=1) else: pred_probs_ensemble += softmax(preds, axis=1) pred_probs_ensemble = softmax(pred_probs_ensemble, axis=1) pred_probs = pred_probs_ensemble[:, 0] pred_binary_labels_harsh = [] pred_binary_labels_loose = [] for i in range(preds.shape[0]): if pred_probs_ensemble[i][0] > pred_probs_ensemble[i][1] + 0.1: pred_binary_labels_harsh.append(0) else: pred_binary_labels_harsh.append(1) if pred_probs_ensemble[i][0] > pred_probs_ensemble[i][1]: pred_binary_labels_loose.append(0) else: pred_binary_labels_loose.append(1) seen_acc, 
unseen_acc = evaluate_emotion_zeroshot_TwpPhasePred( pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types) if unseen_acc > max_test_unseen_acc: max_test_unseen_acc = unseen_acc print('\n\n\t test seen_f1 & unseen_f1:', seen_acc, unseen_acc, ' max_test_unseen_f1:', max_test_unseen_acc, '\n')
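# Sketch of the ensembling step used above: each fine-tuned entailment model
# produces a (num_examples, 2) logits array, the per-model softmax
# probabilities are summed, and the sum is renormalised before thresholding.
# `logits_per_model` is a hypothetical list standing in for the model loop
# above; column 0 is treated as the positive (entailment) class, as in that
# code.
import numpy as np
from scipy.special import softmax


def ensemble_entail_probs(logits_per_model):
    """Average softmax probabilities over models; return P(class 0)."""
    summed = sum(softmax(logits, axis=1) for logits in logits_per_model)
    ensembled = softmax(summed, axis=1)  # renormalise, as in the loop above
    return ensembled[:, 0]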
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--meta_path", default=None, type=str, required=False, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--classifier', default='guoday', type=str, required=True, help='classifier type, guoday or MLP or GRU_MLP or ...') parser.add_argument('--optimizer', default='RAdam', type=str, required=True, help='optimizer we use, RAdam or ...') parser.add_argument("--do_label_smoothing", default='yes', type=str, required=True, help="Whether to do label smoothing. yes or no.") parser.add_argument('--draw_loss_steps', default=1, type=int, required=True, help='training steps to draw loss') parser.add_argument('--label_name', default='label', type=str, required=True, help='label name in original train set index') ## Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", default='yes', type=str, required=True, help="Whether to run training. yes or no.") parser.add_argument("--do_test", default='yes', type=str, required=True, help="Whether to run testing. yes or no.") parser.add_argument("--do_eval", default='yes', type=str, required=True, help="Whether to run eval on the dev set. yes or no.") parser.add_argument( "--evaluate_during_training", action='store_true', help="Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of update steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs." ) parser.add_argument("--eval_steps", default=200, type=int, help="") parser.add_argument("--lstm_hidden_size", default=300, type=int, help="") parser.add_argument("--lstm_layers", default=2, type=int, help="") parser.add_argument("--dropout", default=0.5, type=float, help="") parser.add_argument("--train_steps", default=-1, type=int, help="") parser.add_argument("--report_steps", default=-1, type=int, help="") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--split_num", default=3, type=int, help="text split") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action='store_true', help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number" ) parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument( '--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
"See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") args = parser.parse_args() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) try: os.makedirs(args.output_dir) except: pass tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) # tensorboard_log_dir = args.output_dir # loss_now = tf.placeholder(dtype=tf.float32, name='loss_now') # loss_mean = tf.placeholder(dtype=tf.float32, name='loss_mean') # loss_now_variable = loss_now # loss_mean_variable = loss_mean # train_loss = tf.summary.scalar('train_loss', loss_now_variable) # dev_loss_mean = tf.summary.scalar('dev_loss_mean', loss_mean_variable) # merged = tf.summary.merge([train_loss, dev_loss_mean]) config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=3) config.hidden_dropout_prob = args.dropout # Prepare model if args.do_train == 'yes': model = BertForSequenceClassification.from_pretrained( args.model_name_or_path, args, config=config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) if args.do_train == 'yes': print( '________________________now training______________________________' ) # Prepare data loader train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True, label_name=args.label_name) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, args.split_num, True) # print('train_feature_size=', train_features.__sizeof__()) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # print('train_data=',train_data[0]) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size // args.gradient_accumulation_steps) num_train_optimization_steps = args.train_steps # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.optimizer == 'RAdam': optimizer = RAdam(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 model.train() tr_loss = 0 loss_batch = 0 nb_tr_examples, nb_tr_steps = 0, 0 bar = tqdm(range(num_train_optimization_steps), total=num_train_optimization_steps) train_dataloader = cycle(train_dataloader) # with tf.Session() as sess: # summary_writer = tf.summary.FileWriter(tensorboard_log_dir, sess.graph) # sess.run(tf.global_variables_initializer()) list_loss_mean = [] bx = [] eval_F1 = [] ax = [] for step in bar: batch = next(train_dataloader) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() loss_batch += loss.item() train_loss = round( tr_loss * args.gradient_accumulation_steps / (nb_tr_steps + 1), 4) bar.set_description("loss {}".format(train_loss)) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: # optimizer.backward(loss) loss.backward() else: loss.backward() # draw loss every n docs if (step + 1) % int(args.draw_loss_steps / (args.train_batch_size / args.gradient_accumulation_steps)) == 0: list_loss_mean.append(round(loss_batch, 4)) bx.append(step + 1) plt.plot(bx, list_loss_mean, label='loss_mean', linewidth=1, color='b', marker='o', markerfacecolor='green', markersize=2) plt.savefig(args.output_dir + '/labeled.jpg') loss_batch = 0 # paras update every batch data. if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 # report results every 200 real batch. if step % (args.eval_steps * args.gradient_accumulation_steps) == 0 and step > 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) # do evaluation totally 10 times during training stage. if args.do_eval == 'yes' and (step + 1) % int( num_train_optimization_steps / 10) == 0 and step > 450: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] eval_examples = read_examples(os.path.join( args.data_dir, file), is_training=True, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field( eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field( eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += 
tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_labels = np.concatenate(inference_labels, 0) inference_logits = np.concatenate(inference_logits, 0) model.train() ############################################### num_gold_0 = np.sum(gold_labels == 0) num_gold_1 = np.sum(gold_labels == 1) num_gold_2 = np.sum(gold_labels == 2) right_0 = 0 right_1 = 0 right_2 = 0 error_0 = 0 error_1 = 0 error_2 = 0 for gold_label, inference_label in zip( gold_labels, inference_labels): if gold_label == inference_label: if gold_label == 0: right_0 += 1 elif gold_label == 1: right_1 += 1 else: right_2 += 1 elif inference_label == 0: error_0 += 1 elif inference_label == 1: error_1 += 1 else: error_2 += 1 recall_0 = right_0 / (num_gold_0 + 1e-5) recall_1 = right_1 / (num_gold_1 + 1e-5) recall_2 = right_2 / (num_gold_2 + 1e-5) precision_0 = right_0 / (error_0 + right_0 + 1e-5) precision_1 = right_1 / (error_1 + right_1 + 1e-5) precision_2 = right_2 / (error_2 + right_2 + 1e-5) f10 = 2 * precision_0 * recall_0 / (precision_0 + recall_0 + 1e-5) f11 = 2 * precision_1 * recall_1 / (precision_1 + recall_1 + 1e-5) f12 = 2 * precision_2 * recall_2 / (precision_2 + recall_2 + 1e-5) output_dev_result_file = os.path.join( args.output_dir, "dev_results.txt") with open(output_dev_result_file, 'a', encoding='utf-8') as f: f.write('precision:' + str(precision_0) + ' ' + str(precision_1) + ' ' + str(precision_2) + '\n') f.write('recall:' + str(recall_0) + ' ' + str(recall_1) + ' ' + str(recall_2) + '\n') f.write('f1:' + str(f10) + ' ' + str(f11) + ' ' + str(f12) + '\n' + '\n') eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracy(inference_logits, gold_labels) # draw loss. eval_F1.append(round(eval_accuracy, 4)) ax.append(step) plt.plot(ax, eval_F1, label='eval_F1', linewidth=1, color='r', marker='o', markerfacecolor='blue', markersize=2) for a, b in zip(ax, eval_F1): plt.text(a, b, b, ha='center', va='bottom', fontsize=8) plt.savefig(args.output_dir + '/labeled.jpg') result = { 'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc and 'dev' in file: print("=" * 80) print("more accurate model arises, now best F1 = ", eval_accuracy) print("Saving Model......") best_acc = eval_accuracy # Save a trained model, only save the model it-self model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if (step+1) / int(num_train_optimization_steps/10) > 9.5: print("=" * 80) print("End of training. 
Saving Model......") # Save a trained model, only save the model it-self model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "pytorch_model_final_step.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) ''' if args.do_test == 'yes': start_time = time.time() print( '___________________now testing for best eval f1 model_________________________' ) try: del model except: pass gc.collect() args.do_train = 'no' model = BertForSequenceClassification.from_pretrained(os.path.join( args.output_dir, "pytorch_model.bin"), args, config=config) model.half() for layer in model.modules(): if isinstance(layer, torch.nn.modules.batchnorm._BatchNorm): layer.float() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from " "https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) for file, flag in [('test.csv', 'test')]: inference_labels = [] gold_labels = [] eval_examples = read_examples(os.path.join(args.data_dir, file), is_training=False, label_name=args.label_name) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, args.split_num, False) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() # print('test_logits=', logits) label_ids = label_ids.to('cpu').numpy() inference_labels.append(logits) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) logits = np.concatenate(inference_labels, 0) if flag == 'dev': print(flag, accuracy(logits, gold_labels)) elif flag == 'test': df = pd.read_csv(os.path.join(args.data_dir, file)) df['label_0'] = logits[:, 0] df['label_1'] = logits[:, 1] df['label_2'] = logits[:, 2] df[['id', 'label_0', 'label_1', 'label_2']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) # df[['id', 'label_0', 'label_1']].to_csv(os.path.join(args.output_dir, "sub.csv"), index=False) else: raise ValueError('flag not in [dev, test]') print('inference time usd = {}s'.format(time.time() - start_time)) '''
def train(self): if not os.path.exists(self.output_dir): os.makedirs(self.output_dir) # logger.info(f'Fold {split_index + 1}') train_dataloader, eval_dataloader, train_examples, eval_examples = self.create_dataloader() num_train_optimization_steps = self.train_steps # Prepare model config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained(self.model_name_or_path,self.args, config=config) model.to(self.device) model.train() # Prepare optimizer param_optimizer = list(model.named_parameters()) param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': self.weight_decay}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=self.learning_rate, eps=self.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=self.train_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", self.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) best_acc = 0 best_MRR = 0 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 train_dataloader = cycle(train_dataloader) for step in range(num_train_optimization_steps): batch = next(train_dataloader) batch = tuple(t.to(self.device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) tr_loss += loss.item() train_loss = round(tr_loss / (nb_tr_steps + 1), 4) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 loss.backward() if (nb_tr_steps + 1) % self.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() scheduler.step() global_step += 1 if (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("***** Report result *****") logger.info(" %s = %s", 'global_step', str(global_step)) logger.info(" %s = %s", 'train loss', str(train_loss)) if self.do_eval and (step + 1) % (self.eval_steps * self.gradient_accumulation_steps) == 0: for file in ['dev.csv']: inference_labels = [] gold_labels = [] inference_logits = [] scores = [] ID = [x.guid for x in eval_examples] logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", self.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids,label_ids in eval_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): tmp_eval_loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask ) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() inference_labels.append(np.argmax(logits, axis=1)) scores.append(logits) gold_labels.append(label_ids) inference_logits.append(logits) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) 
nb_eval_steps += 1 gold_labels = np.concatenate(gold_labels, 0) inference_logits = np.concatenate(inference_logits, 0) scores = np.concatenate(scores, 0) model.train() eval_loss = eval_loss / nb_eval_steps eval_accuracy = accuracyF1(inference_logits, gold_labels) print( 'eval_F1',eval_accuracy, 'global_step',global_step, 'loss',train_loss ) result = {'eval_loss': eval_loss, 'eval_F1': eval_accuracy, 'global_step': global_step, 'loss': train_loss} output_eval_file = os.path.join(self.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('*' * 80) writer.write('\n') if eval_accuracy > best_acc : print("=" * 80) print("Best F1", eval_accuracy) print("Saving Model......") # best_acc = eval_accuracy best_acc = eval_accuracy # Save a trained model model_to_save = model.module if hasattr(model,'module') else model output_model_file = os.path.join(self.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) print("=" * 80) else: print("=" * 80)
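# Sketch of the best-checkpoint logic used above: unwrap DataParallel/DDP
# before saving so the stored state_dict carries no "module." prefixes. The
# helper name and the metric comparison are placeholders, not project code.
import os
import torch


def maybe_save_best(model, metric, best_metric, output_dir):
    """Save the model if `metric` improves on `best_metric`; return the new best."""
    if metric <= best_metric:
        return best_metric
    model_to_save = model.module if hasattr(model, 'module') else model
    torch.save(model_to_save.state_dict(),
               os.path.join(output_dir, "pytorch_model.bin"))
    return metric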
def test_eval(self): data = DATAMultiWOZ( debug=False, data_dir=self.data_dir ) test_examples = data.read_examples(os.path.join(self.data_dir, 'test.tsv')) print('number of test examples:', len(test_examples)) ID = [x.guid for x in test_examples] test_features = data.convert_examples_to_features(test_examples, self.tokenizer, self.max_seq_length) all_input_ids = torch.tensor(data.select_field(test_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(data.select_field(test_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(data.select_field(test_features, 'segment_ids'), dtype=torch.long) all_utterance_mask = torch.tensor(data.select_field(test_features, 'utterance_mask'), dtype=torch.long) all_response_mask = torch.tensor(data.select_field(test_features, 'response_mask'), dtype=torch.long) all_history_mask = torch.tensor(data.select_field(test_features, 'history_mask'), dtype=torch.long) all_label = torch.tensor([f.label for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_utterance_mask, all_response_mask, all_history_mask, all_label) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path, num_labels=self.num_labels) model = BertForSequenceClassification.from_pretrained( os.path.join(self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() inference_labels = [] gold_labels = [] scores = [] for input_ids, input_mask, segment_ids, label_ids in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) label_ids = label_ids.to(self.device) with torch.no_grad(): logits = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, ).detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() scores.append(logits) inference_labels.append(np.argmax(logits, axis=1)) gold_labels.append(label_ids) gold_labels = np.concatenate(gold_labels, 0) scores = np.concatenate(scores, 0) logits = np.concatenate(inference_labels, 0) # compute evaluation metrics assert len(ID) == scores.shape[0] eval_accuracy = accuracyF1(logits, gold_labels) # eval_DOUBAN_MRR,eval_DOUBAN_mrr,eval_DOUBAN_MAP,eval_Precision1 = compute_DOUBAN(ID,scores,gold_labels) # print( # 'eval_MRR',eval_DOUBAN_MRR,eval_DOUBAN_mrr, # 'eval_MAP',eval_DOUBAN_MAP, # 'eval_Precision1',eval_Precision1) print('F1', eval_accuracy)
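# The metric helpers used above (accuracy / accuracyF1) are defined elsewhere
# in the project and are not shown here. A plausible stand-in, assuming either
# raw logits of shape (num_examples, num_labels) or already-argmaxed integer
# predictions plus integer gold labels, would be macro F1 via scikit-learn:
import numpy as np
from sklearn.metrics import f1_score


def macro_f1_from_logits(logits, gold_labels):
    """Macro-averaged F1 over argmax predictions (or raw predictions if 1-D)."""
    preds = np.argmax(logits, axis=1) if logits.ndim > 1 else logits
    return f1_score(gold_labels, preds, average='macro')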