def load_model(self):
    self.tokenizer = BertTokenizer.from_pretrained(
        self.args.pretrained_path, do_lower_case=self.args.do_lower_case)
    self.config = BertConfig.from_pretrained(
        self.args.pretrained_path, num_labels=self.args.num_labels)
    if self.args.resume_model:
        self.model = BertForSequenceClassification.from_pretrained(
            self.args.resume_model_path, config=self.config)
    else:
        self.model = BertForSequenceClassification.from_pretrained(
            self.args.pretrained_path, config=self.config)
    if self.args.cuda:
        self.model.cuda()
    if self.args.n_gpus > 1:
        self.model = DataParallel(self.model)
def __init__(self, n_bert_layers=N_LAYERS, n_features=N_FEATURES, extract_features=False):
    super(BERTForFeatures, self).__init__()
    self.extract_features = extract_features
    self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")
    # keep only the first n_bert_layers encoder layers
    self.bert.bert.encoder.layer = self.bert.bert.encoder.layer[:n_bert_layers]
    self.bert.classifier = nn.Linear(768, n_features)
    self.cls = nn.Linear(n_features, 1)
def load_model(model_name, data_dir):
    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    # task_name = args.task_name.lower()
    task_name = 'rte'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    output_mode = output_modes[task_name]
    label_list = processor.get_labels()  # [0, 1]
    num_labels = len(label_list)

    pretrain_model_dir = '{}/FineTuneOn{}'.format(data_dir, model_name)
    # pretrain_model_dir = 'please enter your pretrain models path here/FineTuneOn{}'.format(model_name)

    # Prepare model
    # cache_dir = os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), '{} model distributed_{}'.format(model_name, -1))
    model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir)
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
    #                                                       cache_dir=cache_dir,
    #                                                       num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    return model, tokenizer
def setup_class(self):
    self.use_gpu = torch.cuda.is_available()
    self.test_dir = Path(tempfile.mkdtemp())

    self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                        do_lower_case=True,
                                                        cache_dir=self.test_dir)
    self.rust_tokenizer = PyBertTokenizer(
        get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']))
    self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                               output_attentions=False).eval()
    if self.use_gpu:
        self.model.cuda()

    self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                          'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                          ' all the dolphins had ever done was muck about in the water having a good time. But '
                          'conversely, the dolphins had always believed that they were far more intelligent than '
                          'man—for precisely the same reasons.'] * 64

    # Pre-allocate GPU memory
    tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
    features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
    features = [self.base_tokenizer.prepare_for_model(input, None, add_special_tokens=True, max_length=128)
                for input in features]
    all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

    if self.use_gpu:
        all_input_ids = all_input_ids.cuda()

    with torch.no_grad():
        _ = self.model(all_input_ids)[0].cpu().numpy()
def __init__(self, model_path, num_labels=2):
    model_file_path = utils.download_if_needed(model_path)
    self.model = BertForSequenceClassification.from_pretrained(
        model_file_path, num_labels=num_labels)
    self.model.to(utils.device)
    self.model.eval()
    self.tokenizer = BERTTokenizer(model_file_path)
def from_pretrained(model_id_or_path: str, device: Optional[torch.device] = None):
    # First, load the fine-tuned model with the standard from_pretrained API.
    torch_model = TorchBertForSequenceClassification.from_pretrained(model_id_or_path)
    # Then build the accelerated model from the loaded torch model.
    model = BertForSequenceClassification.from_torch(torch_model, device)
    model._torch_model = torch_model  # keep a reference so the torch model is not destroyed
    return model
def __init__(self):
    # Load Google's publicly released pretrained tokenizer and model
    self.tokenizer = BertTokenizer.from_pretrained(
        "bert-base-multilingual-cased", do_lower_case=False)
    self.model = BertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased", num_labels=2)
    # Load the model fine-tuned on Google Colab
    self.model.load_state_dict(
        torch.load("bert_evaluator.bin", map_location='cpu'))
def __init__(self, model_path, num_labels=2, entailment=False):
    model_file_path = utils.download_if_needed(model_path)
    self.model = BertForSequenceClassification.from_pretrained(
        model_file_path, num_labels=num_labels)
    self.model.to(utils.get_device())
    self.model.eval()
    if entailment:
        self.tokenizer = BERTEntailmentTokenizer()
    else:
        self.tokenizer = BERTTokenizer(model_file_path)
def get_bert_classifier(model_type: str, num_labels: int, model_file: str = None,
                        device: str = "cpu") -> BertForSequenceClassification:
    """
    Load a BertForSequenceClassification model, either from a model file with a finetuned
    model, or as a simple pretrained model.

    Args:
        model_type: the type of BERT model to load, e.g. "bert-base-uncased"
        num_labels: the number of cells for the output layer of the classifier
        model_file: if we load a finetuned model, this is the file where the model is saved
        device: the device to load the model to ("cpu" or "cuda")

    Returns:
        a BertForSequenceClassification model
    """
    if model_file:
        model_state_dict = torch.load(model_file, map_location=lambda storage, loc: storage)
        if "distilbert" in model_type:
            model = DistilBertForSequenceClassification.from_pretrained(
                model_type, state_dict=model_state_dict, num_labels=num_labels)
        else:
            model = BertForSequenceClassification.from_pretrained(
                model_type, state_dict=model_state_dict, num_labels=num_labels)
    else:
        if "distilbert" in model_type:
            model = DistilBertForSequenceClassification.from_pretrained(
                model_type, num_labels=num_labels)
        else:
            model = BertForSequenceClassification.from_pretrained(
                model_type, num_labels=num_labels)

    model.to(device)
    return model
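# Hedged usage sketch for get_bert_classifier above: it loads an un-finetuned pretrained
# classifier when no model_file is given. The model name and label count below are
# illustrative assumptions, not values taken from the original code.
clf = get_bert_classifier("bert-base-uncased", num_labels=3, device="cpu")
clf.eval()  # switch to inference mode before predicting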
def __init__(self, label_list, device, cache_dir):
    self._label_list = label_list
    self._device = device
    self._tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                                    do_lower_case=True,
                                                    cache_dir=cache_dir)
    self._model = BertForSequenceClassification.from_pretrained(
        BERT_MODEL, num_labels=len(label_list), cache_dir=cache_dir)
    self._model.to(device)
    self._optimizer = None
    self._dataset = {}
    self._data_loader = {}
def bertForSequenceClassification(*args, **kwargs):
    """
    BertForSequenceClassification is a fine-tuning model that includes BertModel and a
    sequence-level (sequence or pair of sequences) classifier on top of the BertModel.
    Note that the classification head is only initialized and has to be trained.

    The sequence-level classifier is a linear layer that takes as input the last hidden
    state of the first token ([CLS]) in the input sequence (see Figures 3a and 3b in the
    BERT paper).

    Args:
        num_labels: the number (>=2) of classes for the classifier.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForSequenceClassification
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()
        # Predict the sequence classification logits
        >>> with torch.no_grad():
        ...     seq_classif_logits = model(tokens_tensor, segments_tensors)
        # Or get the sequence classification loss (call model.train() first if training on this loss)
        >>> labels = torch.tensor([1])
        >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels)
    """
    model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
    return model
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--data_dir", default='./data/input/', type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default='bert-base-chinese', type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--config_file", default='bert-base-chinese', type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default='xgfy', type=str, required=True, help="The name of the task to train.") parser.add_argument("--vacab_root", default='./data/model/', type=str, required=True, help="The directory where the vocab file is saved.") parser.add_argument("--output_dir", default='./data/output/', type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--weight_name", default='net_weight_1.bin', type=str, ) parser.add_argument("--config_name", default='config_name_1.bin', type=str, ) # Other parameters parser.add_argument("--cache_dir", default="./data/model/", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--log_frq", default=50, type=int) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=1.0, type=int, help="Total number of training epochs to perform.") parser.add_argument("--n_warmup", default=1000, type=int, help="step of training to perform linear learning rate warmup for.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--parall', action='store_true') parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() # 新冠肺炎 processors = { "xgfy": SimProcessor } num_labels_task = { "xgfy": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) # if not os.path.exists(args.output_dir): # os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name] num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.vacab_root, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{0}') # cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(str(args.local_rank))) config = BertConfig.from_pretrained(args.config_file, num_labels=num_labels) model = BertForSequenceClassification.from_pretrained(args.bert_model, config=config, cache_dir=cache_dir) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1 and args.parall: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers 
import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.n_warmup, num_training_steps=t_total ) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if n_gpu > 1 and args.parall: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if (global_step) % args.log_frq == 0: logger.info("TrLoss: {:.2f} | Loss: {:.2f} | Lr: {:.2f}".format(tr_loss, loss.item(), scheduler.get_lr()[0])) if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, args.weight_name) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, args.config_name) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config) model.load_state_dict(torch.load(output_model_file)) else: output_model_file = os.path.join(args.output_dir, args.weight_name) output_config_file = os.path.join(args.output_dir, args.config_name) config = BertConfig(output_config_file) model = BertForSequenceClassification(config) model.load_state_dict(torch.load(output_model_file)) # model = BertForSequenceClassification.from_pretrained(args.bert_model) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = 
tr_loss / nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss} logger.info(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
# use 4 threads for BERT inference
turbo_transformers.set_num_threads(4)

model_id = os.path.join(os.path.dirname(__file__), "bert_model")  # path to the huggingface model
tokenizer = BertTokenizer.from_pretrained(model_id)  # initialize the tokenizer
turbo_model = BertForSequenceClassification.from_pretrained(
    model_id, torch.device("cpu:0"))  # initialize the accelerated model

# predict after loading the model
text = "Sample input text"
inputs = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors="pt")

# turbo_result holds the logits returned by the TurboTransformers model
turbo_result = turbo_model(**inputs)

torch_model = TorchBertForSequenceClassification.from_pretrained(model_id)
# torch_result holds the logits returned by the original Transformers model
torch_result = torch_model(**inputs)[0]

print(turbo_result)
# tensor([[0.2716, 0.0318]], grad_fn=<AddmmBackward>)
print(torch_result)  # torch_result and turbo_result should hold the same logits
# tensor([[0.2716, 0.0318]], grad_fn=<AddmmBackward>)
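# Hedged sketch (not part of the original demo): one way to confirm that the accelerated
# and reference logits agree numerically, assuming both results are CPU torch tensors of
# the same shape.
assert torch.allclose(turbo_result, torch_result, atol=1e-3), "turbo and torch logits differ"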
test_papers = papers[-200:]

print("BUILDING TRAIN DATA...")
train_iterator, num_train_examples = get_dataloader(
    ARGS.people, train_papers, tokenizer, ARGS.batch_size,
    ARGS.working_dir + '/data_cache.train.pkl', test=False)
print('DONE. %d EXAMPLES' % num_train_examples)

print("BUILDING TEST DATA...")
test_iterator, num_test_examples = get_dataloader(
    ARGS.people, test_papers, tokenizer, ARGS.batch_size,
    ARGS.working_dir + '/data_cache.test.pkl', test=True)
print('DONE. %d EXAMPLES' % num_test_examples)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', cache_dir=ARGS.working_dir + '/cache')
if CUDA:
    model = model.cuda()

optimizer, scheduler = build_optimizer_scheduler(
    model, int((num_train_examples * ARGS.epochs) / ARGS.batch_size),
    ARGS.learning_rate)
loss_fn = build_loss_fn()

for epoch in range(ARGS.epochs):
    while True:
        print('TRAIN %d' % epoch)
        model.train()
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=256, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, # "mnli-mm": MnliMismatchedProcessor, # "mrpc": MrpcProcessor, # "sst-2": Sst2Processor, # "sts-b": StsbProcessor, # "qqp": QqpProcessor, # "qnli": QnliProcessor, "rte": RteProcessor # "wnli": WnliProcessor, } output_modes = { # "cola": "classification", # "mnli": "classification", # "mrpc": "classification", # "sst-2": "classification", # "sts-b": "regression", # "qqp": "classification", # "qnli": "classification", "rte": "classification" # "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None # num_train_optimization_steps = None # if args.do_train: # # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv') # train_examples, seen_types = processor.get_examples_situation_train('/export/home/Dataset/LORELEI/zero-shot-split/train_pu_half_v0.txt') #train_pu_half_v1.txt # # seen_classes=[0,2,4,6,8] # # num_train_optimization_steps = int( # len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # if args.local_rank != -1: # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format( args.local_rank)) pretrain_model_dir = '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnMNLI' #FineTuneOnCombined'# FineTuneOnMNLI, FineTuneOnFEVER, FineTuneOnRTE model = BertForSequenceClassification.from_pretrained( 
pretrain_model_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_unseen_acc = 0.0 max_dev_unseen_acc = 0.0 max_dev_seen_acc = 0.0 max_overall_acc = 0.0 '''load test set''' seen_types = set() test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_situation_test( '/export/home/Dataset/LORELEI/zero-shot-split/test.txt', seen_types) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, output_mode) test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) ''' start evaluate on test set after this epoch ''' model.eval() logger.info("***** Running testing *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) test_loss = 0 nb_test_steps = 0 preds = [] print('Testing...') for input_ids, input_mask, segment_ids, label_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, input_mask, segment_ids, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1)[:, 0] pred_binary_labels_harsh = [] pred_binary_labels_loose = [] for i in range(preds.shape[0]): if preds[i][0] > preds[i][1] + 0.1: pred_binary_labels_harsh.append(0) else: pred_binary_labels_harsh.append(1) if preds[i][0] > preds[i][1]: pred_binary_labels_loose.append(0) else: pred_binary_labels_loose.append(1) seen_acc, unseen_acc = evaluate_situation_zeroshot_TwpPhasePred( pred_probs, pred_binary_labels_harsh, pred_binary_labels_loose, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index, seen_types) if unseen_acc > max_test_unseen_acc: max_test_unseen_acc = unseen_acc print('\n\n\t test seen_f1 & unseen_f1:', seen_acc, unseen_acc, ' max_test_unseen_f1:', max_test_unseen_acc, '\n')
def predict(options):
    qid_lst, x_row, y_lst = [], [], []
    read_file = open(options.data_file, 'rb')
    read_file.readline()
    # Input format: "label|text"
    for line in read_file:
        line = line.decode('utf-8', 'ignore')
        line_split = line.strip('\n').split('|')
        x_row.append(line_split[1])
        y_lst.append(int(line_split[0]))
    # Alternative format: "qid\ttext\tlabel"
    # for line in read_file:
    #     line = line.decode('utf-8', 'ignore')
    #     line_split = line.strip('\n').strip('\r').split('\t')
    #     qid_lst.append(line_split[0])
    #     x_row.append(line_split[1])
    #     if len(line_split) >= 3:
    #         y_lst.append(int(line_split[-1]))
    read_file.close()

    x_test_text, y_test = np.array(x_row), np.array(y_lst)
    print("test text shape is ", x_test_text.shape)

    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

    MAX_LEN = 180
    test_input_ids = [
        tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN)
        for sent in x_test_text
    ]
    test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long",
                                   value=0, truncating="post", padding="post")

    test_attention_masks = []
    for sent in test_input_ids:
        att_mask = [int(token_id > 0) for token_id in sent]
        test_attention_masks.append(att_mask)

    test_inputs = torch.tensor(test_input_ids)
    test_masks = torch.tensor(test_attention_masks)

    batch_size = 2

    # Create the DataLoader for our test set.
    test_data = TensorDataset(test_inputs, test_masks)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-chinese",
        num_labels=5,
        output_attentions=False,
        output_hidden_states=False,
    )

    # Load the fine-tuned weights, then tell pytorch to run this model on the GPU.
    model.load_state_dict(torch.load(options.model_file))
    model.cuda()

    all_predictions = []
    model.eval()
    for step, batch in enumerate(test_dataloader):
        if step % 40 == 0 and not step == 0:
            print(' Batch {:>5,} of {:>5,}.'.format(step, len(test_dataloader)))
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        all_predictions.extend(pred_flat)

    if len(y_test) > 0:
        eps = 1e-3
        TP, FP, FN = 0, 0, 0
        for i in range(len(all_predictions)):
            if all_predictions[i] == y_test[i] and y_test[i] != 0:
                TP += 1
            elif all_predictions[i] == 0 and y_test[i] != 0:
                FN += 1
            elif all_predictions[i] != 0 and y_test[i] == 0:
                FP += 1
        P = 0. if abs(TP + FP) < eps else float(TP) / float(TP + FP)
        R = 0. if abs(TP + FN) < eps else float(TP) / float(TP + FN)
        if abs(P) < eps and abs(R) < eps:
            F1 = 0.
        else:
            F1 = 4 * P * R / (P + 3 * R)
        print("TP is ", TP)
        print("FP is ", FP)
        print("FN is ", FN)
        print("Total number of test examples: {}".format(len(y_test)))
        print("P is ", P)
        print("R is ", R)
        print("Score is ", F1)

    print("predict start.......")
    ###################################
    # Prediction logic and result output, ("%d\t%s\t%d", qid, content, predict_label)
    ###################################
    print("Saving evaluation to {0}".format(options.out_put_file))
    output_file = open(options.out_put_file, 'wb')
    output_file.write("qid\ttext\tlabel\n".encode("utf-8"))
    for i in range(len(all_predictions)):
        # output_file.write((qid_lst[i]+'\t'+x_row[i]+'\t'+str(int(all_predictions[i]))+'\n').encode('utf-8', 'ignore'))
        output_file.write(
            (x_row[i] + '\t' + str(int(all_predictions[i])) + '\n').encode('utf-8', 'ignore'))
    output_file.close()
    print("predict end.......")
    return None
import torch
from keras.preprocessing.sequence import pad_sequences

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

# Directory where the model is saved
output_dir = './model_save4/'

# Load a trained model and vocabulary that you have fine-tuned
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Copy the model to the GPU.
model.to(device)


def convert_input_data(sentences):
    # Split the sentences into tokens with the BERT tokenizer
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # Maximum sequence length of the input tokens
    MAX_LEN = 512

    # Convert the tokens to numeric indices
def main(): parser = argparse.ArgumentParser() ## Required parameters ''' python -u demo.py ''' parser.add_argument("--premise_str", default=None, type=str, required=True, help="text to classify") parser.add_argument("--hypo_list", default=None, type=str, required=True, help="sentences separated by |") parser.add_argument("--task_name", default='rte', type=str, help="The name of the task to train.") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") # parser.add_argument("--do_lower_case", # action='store_true', # help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=256, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) train_examples = None # Prepare model # cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(args.local_rank)) # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=cache_dir, # num_labels=num_labels) # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) pretrain_model_dir = '/export/home/Dataset/fine_tune_Bert_stored/FineTuneOnRTE' #FineTuneOnCombined'# FineTuneOnMNLI model = BertForSequenceClassification.from_pretrained( pretrain_model_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer # param_optimizer = list(model.named_parameters()) # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] # optimizer_grouped_parameters = [ # {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, # {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} # ] # optimizer = AdamW(optimizer_grouped_parameters, # lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_unseen_acc = 0.0 max_dev_unseen_acc = 0.0 max_dev_seen_acc = 0.0 max_overall_acc = 0.0 '''load test set''' seen_types = set() # test_examples, test_label_list, test_hypo_seen_str_indicator, test_hypo_2_type_index = processor.get_examples_Yahoo_test('/export/home/Dataset/YahooClassification/yahoo_answers_csv/zero-shot-split/test.txt', seen_types) # test_examples = load_demo_input(premise_str, hypo_list) # test_examples = load_demo_input('f**k why my email not come yet', ['anger', 'this text expresses anger', 'the guy is very unhappy']) test_examples = load_demo_input(args.premise_str, args.hypo_list.split(' | ')) test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer, output_mode) test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long) test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) ''' start evaluate on test set after this epoch ''' model.eval() logger.info("***** Running testing *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) test_loss = 0 nb_test_steps = 
0 preds = [] # print('Testing...') for input_ids, input_mask, segment_ids, label_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) logits = logits[0] if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) # eval_loss = eval_loss / nb_eval_steps preds = preds[0] pred_probs = softmax(preds, axis=1)[:, 0] return max(pred_probs)
def main(train_file, valid_file, test_file, target_dir, embedding_size=512, hidden_size=512, dropout=0.5, num_classes=3, epochs=64, batch_size=32, lr=0.0004, patience=5, max_grad_norm=1.0, checkpoint=None): """ Train the ESIM model on the Quora dataset. Args: train_file: A path to some preprocessed data that must be used to train the model. valid_file: A path to some preprocessed data that must be used to validate the model. embeddings_file: A path to some preprocessed word embeddings that must be used to initialise the model. target_dir: The path to a directory where the trained model must be saved. hidden_size: The size of the hidden layers in the model. Defaults to 300. dropout: The dropout rate to use in the model. Defaults to 0.5. num_classes: The number of classes in the output of the model. Defaults to 3. epochs: The maximum number of epochs for training. Defaults to 64. batch_size: The size of the batches for training. Defaults to 32. lr: The learning rate for the optimizer. Defaults to 0.0004. patience: The patience to use for early stopping. Defaults to 5. checkpoint: A checkpoint from which to continue training. If None, training starts from scratch. Defaults to None. """ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print(20 * "=", " Preparing for training ", 20 * "=") if not os.path.exists(target_dir): os.makedirs(target_dir) # -------------------- Data loading ------------------- # print("\t* Loading training data...") with open(train_file, "rb") as pkl: train_data = pickle.load(pkl) print("\t* Loading validation data...") with open(valid_file, "rb") as pkl: valid_data = pickle.load(pkl) valid_dataloader = transform_batch_data(valid_data, batch_size=batch_size, shuffle=False) print("\t* Loading test data...") with open(test_file, "rb") as pkl: test_data = pickle.load(pkl) test_dataloader = transform_batch_data(test_data, batch_size=batch_size, shuffle=False) # -------------------- Model definition ------------------- # print("\t* Building model...") pretrained_weights = 'bert-base-uncased' if checkpoint: tokenizer = BertTokenizer.from_pretrained(target_dir+'/transformer/') model = BertForSequenceClassification.from_pretrained(target_dir+'/transformer/') else: tokenizer = BertTokenizer.from_pretrained(pretrained_weights) model = BertForSequenceClassification.from_pretrained(pretrained_weights) print("\t* Building model success...") model.to(device) # -------------------- Preparation for training ------------------- # # Parameters: lr = 1e-3 max_grad_norm = 1.0 num_total_steps = 1000 num_warmup_steps = 100 warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) # PyTorch scheduler best_score = 0.0 start_epoch = 1 # Data for loss curves plot. epochs_count = [] train_losses = [] valid_losses = [] # Compute loss and accuracy before starting (or resuming) training. 
_, valid_loss, valid_accuracy = validate(model, tokenizer, valid_dataloader ) print("\t* Validation loss before training: {:.4f}, accuracy: {:.4f}%" .format(valid_loss, (valid_accuracy*100))) _, test_loss, test_accuracy = validate(model, tokenizer, test_dataloader) print("\t* test loss before training: {:.4f}, accuracy: {:.4f}%" .format(test_loss, (test_accuracy*100))) # -------------------- Training epochs ------------------- # print("\n", 20 * "=", "Training transformer model on device: {}".format(device), 20 * "=") patience_counter = 0 for epoch in range(start_epoch, epochs+1): train_dataloader = transform_batch_data(train_data, batch_size=batch_size, shuffle=True) epochs_count.append(epoch) print("* Training epoch {}:".format(epoch)) epoch_time, epoch_loss, epoch_accuracy = train(model, tokenizer, train_dataloader, optimizer, scheduler, max_grad_norm) train_losses.append(epoch_loss) print("-> Training time: {:.4f}s, loss = {:.4f}, accuracy: {:.4f}%" .format(epoch_time, epoch_loss, (epoch_accuracy*100))) print("* Validation for epoch {}:".format(epoch)) epoch_time, epoch_loss, epoch_accuracy = validate(model, tokenizer, valid_dataloader) valid_losses.append(epoch_loss) print("-> Valid. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n" .format(epoch_time, epoch_loss, (epoch_accuracy*100))) print("* Test for epoch {}:".format(epoch)) epoch_time, epoch_loss, test_accuracy = validate(model, tokenizer, test_dataloader) print("-> Test. time: {:.4f}s, loss: {:.4f}, accuracy: {:.4f}%\n" .format(epoch_time, epoch_loss, (test_accuracy*100))) sys.stdout.flush() #刷新输出 # Update the optimizer's learning rate with the scheduler. scheduler.step(epoch_accuracy) # Early stopping on validation accuracy. if epoch_accuracy < best_score: patience_counter += 1 else: best_score = epoch_accuracy patience_counter = 0 # Save the best model. The optimizer is not saved to avoid having # a checkpoint file that is too heavy to be shared. To resume # training from the best model, use the 'esim_*.pth.tar' # checkpoints instead. torch.save({"epoch": epoch, "model": model.state_dict(), "best_score": best_score, "epochs_count": epochs_count, "train_losses": train_losses, "valid_losses": valid_losses}, os.path.join(target_dir, "best.pth.tar")) # Save the model at each epoch. torch.save({"epoch": epoch, "model": model.state_dict(), "best_score": best_score, "optimizer": optimizer.state_dict(), "epochs_count": epochs_count, "train_losses": train_losses, "valid_losses": valid_losses}, os.path.join(target_dir, "esim_{}.pth.tar".format(epoch))) if patience_counter >= patience: print("-> Early stopping: patience limit reached, stopping...") break # Plotting of the loss curves for the train and validation sets. fig = plt.figure() plt.plot(epochs_count, train_losses, "-r") plt.plot(epochs_count, valid_losses, "-b") plt.xlabel("epoch") plt.ylabel("loss") plt.legend(["Training loss", "Validation loss"]) plt.title("Cross entropy loss") fig.savefig('quora_loss.png')
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--train_data_dir", default='./', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--test_data_dir", default='./', type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default='bert-base-uncased', type=str, required=False, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default='ci_evaluation', type=str, required=False, help="The name of the task to train.") parser.add_argument( "--output_dir", default="./saves/", type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--load_model", action='store_true', help="Whether to load a fine-tuned model from output directory.") parser.add_argument( "--model_name", default=None, type=str, help= "The name of the model to load, relevant only in case that load_model is positive." ) parser.add_argument("--load_model_path", default='./saves/pytorch_model.bin', type=str, help="Path to directory containing fine-tuned model.") parser.add_argument( "--save_on_epoch_end", action='store_true', help="Whether to save the weights each time an epoch ends.") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--N_train", type=int, default=-1, help="number of training examples") parser.add_argument("--N_dev", type=int, default=-1, help="number of development examples") parser.add_argument( "--save_best_weights", type=bool, default=True, help="saves model weight each time epoch accuracy is maximum") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=5, type=int, help="Total number of training epochs to perform.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--save_according_to', type=str, default='acc', help="save results according to in domain dev acc or in domain dev loss" ) parser.add_argument('--optimizer', type=str, default='adam', help="which optimizer model to use: adam or sgd") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument('--data_type', type=str, default='sports', help="Evaluate classifier on sports or wiki") parser.add_argument( "--soph_flag", action='store_true', help="Evaluate on sophisticated examples or not (unsophisticated).") parser.add_argument( "--generation_flag", action='store_true', help= "Evaluate on original examples or caunterfactuals (generated ones).") args = parser.parse_args() processors = { "sentiment": DiscoFuseProcessor, "ci_evaluation": GeneratedDiscoFuseProcessor, } output_modes = { "sentiment": "classification", "ci_evaluation": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') print("device: {} n_gpu: {}, distributed training: {}".format( device, n_gpu, bool(args.local_rank != -1))) print("learning rate: {}, batch size: {}".format(args.learning_rate, args.train_batch_size)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train and not args.load_model: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists( args.output_dir) and args.task_name != "ci_evaluation": 
os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.train_data_dir) train_examples = train_examples[:args. N_train] if args.N_train > 0 else train_examples num_train_optimization_steps = int( len(train_examples) / train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Load a trained model and vocabulary that you have fine-tuned if args.load_model or args.load_model_path != '': # path to directory to load from fine-tuned model load_path = args.load_model_path if args.load_model_path != '' else args.output_dir cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if task_name == 'sentiment' or task_name == 'ci_evaluation': model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=args.cache_dir, num_labels=num_labels) else: print('Error! no task named: {}'.format(task_name)) exit() # load pre train modek weights if args.load_model_path is not None: print("--- Loading model:", args.load_model_path) model.load_state_dict(torch.load(args.load_model_path), strict=False) else: model.load_state_dict(torch.load( os.path.join(load_path, "pytorch_model.bin")), strict=False) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) if not tokenizer: tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) model.to(device) else: cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if task_name == "sentiment": model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=args.cache_dir, num_labels=num_labels) else: print('Error! 
        exit()
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    model.to(device)

if args.local_rank != -1:
    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Prepare optimizer
if args.do_train:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    if args.optimizer == 'adam':
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.learning_rate, weight_decay=1e-2)
    # scheduler = ReduceLROnPlateau(optimizer, 'min',
    #                               patience=hparams.reduce_lr_on_plateau_patience,
    #                               factor=hparams.reduce_lr_on_plateau_factor, verbose=True)

global_step = 0

# prepare the dev-set evaluation DataLoader
# (an illustrative sketch of make_DataLoader appears after this script)
# if do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
if task_name == 'ci_evaluation':
    eval_dataloader = make_DataLoader(data_dir=args.test_data_dir,
                                      processor=processor,
                                      tokenizer=tokenizer,
                                      label_list=label_list,
                                      max_seq_length=args.max_seq_length,
                                      batch_size=args.eval_batch_size,
                                      output_mode=output_mode,
                                      local_rank=args.local_rank,
                                      mode="dev",
                                      N=args.N_dev,
                                      data_type=args.data_type,
                                      soph_flag=args.soph_flag,
                                      generation_flag=args.generation_flag)
else:
    eval_dataloader = make_DataLoader(data_dir=args.test_data_dir,
                                      processor=processor,
                                      tokenizer=tokenizer,
                                      label_list=label_list,
                                      max_seq_length=args.max_seq_length,
                                      batch_size=args.eval_batch_size,
                                      output_mode=output_mode,
                                      local_rank=args.local_rank,
                                      mode="dev",
                                      N=args.N_dev)

if args.do_train:
    # prepare the training DataLoader
    train_dataloader = make_DataLoader(data_dir=args.train_data_dir,
                                       processor=processor,
                                       tokenizer=tokenizer,
                                       label_list=label_list,
                                       max_seq_length=args.max_seq_length,
                                       batch_size=train_batch_size,
                                       output_mode=output_mode,
                                       local_rank=args.local_rank,
                                       mode="train",
                                       N=args.N_train)
    model.train()

    # main training loop
    best_dev_acc = 0.0
    best_dev_loss = 100000.0
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        tr_acc = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch[:4]

            # compute loss values for both output_modes
            outputs = model(input_ids, segment_ids, input_mask, labels=None)
            logits = outputs[0]

            if output_mode == "classification":
                loss_fct = CrossEntropyLoss(ignore_index=-1)
                loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                preds = logits.detach().cpu().numpy()
                preds = np.argmax(preds, axis=1)
                tr_acc += compute_metrics(task_name, preds, label_ids.detach().cpu().numpy())["acc"]
            elif output_mode == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), label_ids.view(-1))

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        # run evaluation on the dev set
        # (an illustrative sketch of evaluate appears after this script)
        eval_results_dev = evaluate(eval_dataloader=eval_dataloader,
                                    model=model,
                                    device=device,
                                    tokenizer=tokenizer,
                                    output_mode=output_mode,
                                    num_labels=num_labels)
        dev_acc, dev_loss = eval_results_dev[:2]

        # train-set loss and accuracy, averaged over the steps of this epoch
        tr_loss /= nb_tr_steps
        tr_acc /= nb_tr_steps

        # print and save results
        result = {
            "acc": tr_acc,
            "loss": tr_loss,
            "dev_acc": dev_acc,
            "dev_loss": dev_loss
        }
        print('Epoch {}'.format(epoch + 1))
        for key, val in result.items():
            print("{}: {}".format(key, val))
        print("***** Evaluation results *****")
        for key in sorted(result.keys()):
            print("  {} = {}".format(key, result[key]))

        # Save the configuration and tokenizer on the first epoch.
        # If we save using the predefined names, we can load using `from_pretrained`.
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        if epoch == 0:
            output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
            model_to_save.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)

        if args.save_on_epoch_end:
            # Save a trained model
            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '.Epoch_{}'.format(epoch + 1))
            torch.save(model_to_save.state_dict(), output_model_file)

        # save the model with the best performance on the dev set
        if args.save_best_weights and dev_acc > best_dev_acc:
            print("Saving model, accuracy improved from {} to {}".format(best_dev_acc, dev_acc))
            output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
            torch.save(model_to_save.state_dict(), output_model_file)
            best_dev_acc = dev_acc

        if args.save_according_to == 'acc':
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
        elif args.save_according_to == 'loss':
            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss

    if args.save_according_to == 'acc':
        print('Best results: Acc - {}'.format(best_dev_acc))
    elif args.save_according_to == 'loss':
        print('Best results: Loss - {}'.format(best_dev_loss))

    if args.model_name is not None:
        final_output_eval_file = os.path.join(args.output_dir, args.model_name + "-final_eval_results.txt")
    else:
        final_output_eval_file = os.path.join(args.output_dir, "final_eval_results.txt")

elif args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    # dev-set evaluation only
    eval_results_dev = evaluate(eval_dataloader=eval_dataloader,
                                model=model,
                                device=device,
                                tokenizer=tokenizer,
                                output_mode=output_mode,
                                num_labels=num_labels)
    print("eval_results_dev -", eval_results_dev)
    dev_acc, dev_loss = eval_results_dev[:2]

    # print results
    print('Accuracy: {}'.format(dev_acc))
    print('Loss: {}'.format(dev_loss))
else:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
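
The script above calls a project-local make_DataLoader helper that is not shown in this file. The sketch below is only an illustration of what such a helper might look like, inferred from the keyword arguments used at the call sites; the example fields (text_a, text_b, label), the [CLS]/[SEP] padding and truncation details, and the handling of the extra task flags are assumptions, not the project's actual implementation.

import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset


def make_DataLoader(data_dir, processor, tokenizer, label_list, max_seq_length,
                    batch_size, output_mode, local_rank=-1, mode="train", N=-1, **task_kwargs):
    # Read the requested split; a real implementation would forward task_kwargs
    # (data_type, soph_flag, generation_flag) to the processor, and might use a
    # DistributedSampler when local_rank != -1.
    examples = processor.get_train_examples(data_dir) if mode == "train" else processor.get_dev_examples(data_dir)
    if N > 0:
        examples = examples[:N]

    label_map = {label: i for i, label in enumerate(label_list)}
    input_ids_list, input_mask_list, segment_ids_list, label_ids_list = [], [], [], []
    for ex in examples:
        tokens_a = tokenizer.tokenize(ex.text_a)
        tokens_b = tokenizer.tokenize(ex.text_b) if ex.text_b else []
        # Reserve room for [CLS]/[SEP] tokens and truncate the longer segment first.
        limit = max_seq_length - (3 if tokens_b else 2)
        while len(tokens_a) + len(tokens_b) > limit:
            (tokens_a if len(tokens_a) > len(tokens_b) else tokens_b).pop()
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + (tokens_b + ["[SEP]"] if tokens_b else [])
        segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1 if tokens_b else 0)
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids_list.append(input_ids + padding)
        input_mask_list.append(input_mask + padding)
        segment_ids_list.append(segment_ids + padding)
        label_ids_list.append(label_map[ex.label] if output_mode == "classification" else float(ex.label))

    label_dtype = torch.long if output_mode == "classification" else torch.float
    dataset = TensorDataset(torch.tensor(input_ids_list, dtype=torch.long),
                            torch.tensor(input_mask_list, dtype=torch.long),
                            torch.tensor(segment_ids_list, dtype=torch.long),
                            torch.tensor(label_ids_list, dtype=label_dtype))
    # Shuffle only for training; keep the dev set in a fixed order.
    sampler = RandomSampler(dataset) if mode == "train" else SequentialSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)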
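
evaluate is likewise project-local. The sketch below is an assumption about its behaviour, consistent only with how its return value is unpacked above (accuracy first, loss second); it mirrors the positional model-call convention used in the training loop, and the tokenizer argument is accepted purely to match the call sites.

import numpy as np
import torch
from torch.nn import CrossEntropyLoss, MSELoss


def evaluate(eval_dataloader, model, device, tokenizer, output_mode, num_labels):
    # tokenizer is unused here; it is kept to match the signature used above.
    model.eval()
    eval_loss, nb_eval_steps = 0.0, 0
    all_preds, all_labels = [], []
    for batch in eval_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch[:4]
        with torch.no_grad():
            # mirrors the call convention used in the training loop above
            logits = model(input_ids, segment_ids, input_mask, labels=None)[0]
        if output_mode == "classification":
            loss = CrossEntropyLoss(ignore_index=-1)(logits.view(-1, num_labels), label_ids.view(-1))
        else:
            loss = MSELoss()(logits.view(-1), label_ids.view(-1))
        eval_loss += loss.item()
        nb_eval_steps += 1
        all_preds.append(logits.detach().cpu().numpy())
        all_labels.append(label_ids.detach().cpu().numpy())
    eval_loss /= max(nb_eval_steps, 1)
    preds = np.concatenate(all_preds)
    labels = np.concatenate(all_labels)
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
        acc = float((preds == labels).mean())
    else:
        acc = 0.0  # accuracy is not meaningful for regression
    model.train()  # restore training mode for the caller's training loop
    return acc, eval_loss, preds, labels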
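
compute_metrics is also assumed; the training loop only reads the "acc" key, so a minimal stand-in could look like this (the project's real helper may add task-specific metrics).

import numpy as np


def compute_metrics(task_name, preds, labels):
    # Plain accuracy over the batch; task_name is kept for API parity.
    assert len(preds) == len(labels)
    return {"acc": float((np.asarray(preds) == np.asarray(labels)).mean())}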