def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--model_dir", default=None, type=str, required=True, help="") parser.add_argument("--my_config", default=None, type=str, required=True) parser.add_argument("--feature_path", default=None, type=str, required=True) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_pattern", default=None, type=str, help="training data path.") parser.add_argument("--valid_pattern", default=None, type=str, help="validation data path.") parser.add_argument("--test_pattern", default=None, type=str, help="test data path.") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--report_steps", default=100, type=int, help="report steps when training.") parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_steps", default=-1, type=int) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() print(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') n_gpu = 1 logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_pattern: raise ValueError( "If `do_train` is True, then `train_pattern` must be specified." ) if args.do_predict: if not args.test_pattern: raise ValueError( "If `do_predict` is True, then `test_pattern` must be specified." ) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Prepare model my_config = Config(args.my_config) my_config.num_edge_types = sum(EdgePosition.max_edge_types) my_config.forward_edges = [ EdgeType.TOKEN_TO_SENTENCE, EdgeType.SENTENCE_TO_PARAGRAPH, EdgeType.PARAGRAPH_TO_DOCUMENT ] print(my_config) if args.do_train: pretrained_config_file = os.path.join(args.model_dir, CONFIG_NAME) bert_config = BertConfig(pretrained_config_file) pretrained_model_file = os.path.join(args.model_dir, WEIGHTS_NAME) model = NqModel(bert_config=bert_config, my_config=my_config) model_dict = model.state_dict() pretrained_model_dict = torch.load( pretrained_model_file, map_location=lambda storage, loc: storage) pretrained_model_dict = { k: v for k, v in pretrained_model_dict.items() if k in model_dict.keys() } model_dict.update(pretrained_model_dict) model.load_state_dict(model_dict) else: pretrained_config_file = os.path.join(args.model_dir, CONFIG_NAME) bert_config = BertConfig(pretrained_config_file) model = NqModel(bert_config=bert_config, my_config=my_config) pretrained_model_file = os.path.join(args.model_dir, WEIGHTS_NAME) model.load_state_dict(torch.load(pretrained_model_file)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) num_train_features = None num_train_optimization_steps = None if args.do_train: num_train_features = 0 for data_path in glob(args.train_pattern): train_dataset = NqDataset(args, data_path, is_training=True) num_train_features += len(train_dataset.features) num_train_optimization_steps = int( num_train_features / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: if args.warmup_steps > 0: args.warmup_proportion = min( args.warmup_proportion, args.warmup_steps / num_train_optimization_steps) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num split examples = %d", num_train_features) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() tr_loss, report_loss = 0.0, 0.0 nb_tr_examples = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for data_path in glob(args.train_pattern): logging.info("Reading data from {}.".format(data_path)) train_dataset = NqDataset(args, data_path, is_training=True) train_features = train_dataset.features if args.local_rank == -1: train_sampler = RandomSampler(train_features) else: train_sampler = DistributedSampler(train_features) train_dataloader = DataLoader(train_features, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=batcher( device, is_training=True), num_workers=0) for step, batch in enumerate(train_dataloader): loss = model(batch.input_ids, batch.input_mask, batch.segment_ids, batch.st_mask, batch.st_index, (batch.edges_src, batch.edges_tgt, batch.edges_type, batch.edges_pos), batch.start_positions, batch.end_positions, batch.answer_type) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.local_rank != -1: loss = loss + 0 * sum( [x.sum() for x in model.parameters()]) if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 tr_loss += loss.item() nb_tr_examples += 1 if (step + 1) % args.gradient_accumulation_steps == 0 and ( global_step + 1) % args.report_steps == 0 and ( args.local_rank == -1 or torch.distributed.get_rank() == 0): lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) logging.info( "Epoch={} iter={} lr={:.6f} train_ave_loss={:.6f} ." .format( # _, global_step, lr_this_step, tr_loss / nb_tr_examples)) _, global_step, lr_this_step, (tr_loss - report_loss) / args.report_steps)) report_loss = tr_loss if args.valid_pattern and (args.local_rank == -1 or torch.distributed.get_rank() == 0): valid_result = eval_model(args, device, model, args.valid_pattern) logging.info("valid_result = {}".format(valid_result)) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) bert_config = BertConfig(output_config_file) model = NqModel(bert_config=bert_config, my_config=my_config) model.load_state_dict(torch.load(output_model_file)) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_result = eval_model(args, device, model, args.test_pattern) logging.info("test_result = {}".format(test_result))
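# The fp16 branch above adjusts the learning rate by hand through warmup_linear(),
# because FusedAdam (unlike BertAdam) does not apply the BERT warmup schedule itself.
# A minimal sketch of that schedule, matching the helper shipped with
# pytorch-pretrained-bert (assumed here; the imported version may differ):
def warmup_linear(x, warmup=0.002):
    # x is training progress in [0, 1]; ramp up linearly during warmup, then decay linearly
    if x < warmup:
        return x / warmup
    return 1.0 - x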
def train(self,data,no_cache=False,method="sum"): model =self.model device = self.device tokenizer=self.tokenizer learning_rate = self.learning_rate warmup_proportion = self.warmup_proportion num_labels = self.num_labels n_gpu = self.n_gpu # GET TRAIN SAMPLES - CACHE THE TOKENS # GET EVAL SAMPLES - CACHE THE TOKENS train_examples = self.create_examples(data["train"]) val_examples = self.create_examples(data["val"],) if "val" in data else None test_examples_list = [] for sample in data["test"]: test_examples = self.create_examples(sample,) if "test" in data else None test_examples_list.append(test_examples) train_dataloader,train_index = self.get_dataloader(train_examples,"train") test_dataloader_list = [] for index,examples in enumerate(test_examples_list): test_dataloader,test_index = self.get_dataloader(examples,"test"+str(index)) test_dataloader_list.append((test_dataloader,test_index)) val_dataloader,val_index = self.get_dataloader(val_examples,"val") num_train_steps = int( len(train_examples) / self.train_batch_size / self.gradient_accumulation_steps * self.num_of_epochs) # OPTIMIZERS MODEL INITIALIZATION # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps # optimizer = BertAdam(optimizer_grouped_parameters, # lr=learning_rate, # warmup=warmup_proportion, # t_total=t_total) if self.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=self.learning_rate, bias_correction=False, max_grad_norm=1.0) print("Optimizer: FusedAdam") if self.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=self.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=self.learning_rate, warmup=self.warmup_proportion, t_total=num_train_steps) # TRAIN FOR EPOCHS and SAVE EACH MODEL as pytorch_model.bin.{epoch} global_step = 0 nb_tr_steps = 0 tr_loss = 0 ep = 0 output_model_file = "dummy" loss_fct = CrossEntropyLoss() for _ in trange(int(self.num_of_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 ep += 1 tq = tqdm(train_dataloader, desc="Iteration") acc = 0 for step, batch in enumerate(tq): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids,unique_ids = batch logits = model(input_ids, segment_ids, input_mask) loss = loss_fct(logits, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_accuracy = accuracy(logits, label_ids) acc += tmp_accuracy if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if self.gradient_accumulation_steps > 1: loss = loss / self.gradient_accumulation_steps if self.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % self.gradient_accumulation_steps == 0: if self.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = self.learning_rate * warmup_linear(global_step/num_train_steps, self.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 tq.set_description("Loss:"+str(tr_loss/nb_tr_steps)+",Acc:"+str(acc/nb_tr_examples)) model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(self.output_dir, "pytorch_model.bin." + str(ep)) torch.save(model_to_save.state_dict(), output_model_file) # EVAL IN EACH EPOCH SAVE BEST MODEL as best_model.bin if val_dataloader: self.score_qa(val_dataloader,val_index,data["val"],model,"val",ep,method) if test_dataloader_list: for index,tup in enumerate(test_dataloader_list): self.score_qa(tup[0],tup[1],data["test"][index],model,"test"+str(index),ep,method) print("After Current-Epoch:",self.best_metric) return model,self.best_metric
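# train() above accumulates per-batch accuracy from detached numpy logits via accuracy().
# A minimal sketch consistent with those call sites (an assumption; the project's own
# helper may differ): count how many argmax predictions in the batch are correct.
import numpy as np

def accuracy(out, labels):
    preds = np.argmax(out, axis=1)  # (batch, num_labels) logits -> predicted class ids
    return np.sum(preds == labels)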
def init_tri_model_optimizer(model, args, data_loader): num_train_optimization_steps = None if args.do_train: train_examples = data_loader.dataset num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) param_optimizer = list(model.named_parameters()) no_decay1 = [ 'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'classifier2.bias', 'classifier2.weight', 'classifier3.bias', 'classifier3.weight' ] no_decay2 = [ 'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'classifier1.bias', 'classifier1.weight', 'classifier3.bias', 'classifier3.weight' ] no_decay3 = [ 'bias', 'LayerNorm.bias', 'LayerNorm.weight', 'classifier1.bias', 'classifier1.weight', 'classifier2.bias', 'classifier2.weight' ] optimizer_grouped_parameters1 = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay1) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay1)], 'weight_decay': 0.0 }] optimizer_grouped_parameters2 = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay2) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay2)], 'weight_decay': 0.0 }] optimizer_grouped_parameters3 = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay3) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay3)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer1 = FusedAdam(optimizer_grouped_parameters1, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) optimizer2 = FusedAdam(optimizer_grouped_parameters2, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) optimizer3 = FusedAdam(optimizer_grouped_parameters3, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer1 = FP16_Optimizer(optimizer1, dynamic_loss_scale=True) optimizer2 = FP16_Optimizer(optimizer2, dynamic_loss_scale=True) optimizer3 = FP16_Optimizer(optimizer3, dynamic_loss_scale=True) else: optimizer1 = FP16_Optimizer(optimizer1, static_loss_scale=args.loss_scale) optimizer2 = FP16_Optimizer(optimizer2, static_loss_scale=args.loss_scale) optimizer3 = FP16_Optimizer(optimizer3, static_loss_scale=args.loss_scale) else: optimizer1 = BertAdam(optimizer_grouped_parameters1, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) optimizer2 = BertAdam(optimizer_grouped_parameters2, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) optimizer3 = BertAdam(optimizer_grouped_parameters3, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) return optimizer1, optimizer2, optimizer3, num_train_optimization_steps
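# init_tri_model_optimizer() repeats the same decay/no-decay split three times, varying
# only which classifier heads are pushed into the zero-weight-decay group. A hypothetical
# helper capturing that repeated pattern (a refactoring sketch, not part of the original code):
def build_param_groups(named_params, skip_decay, weight_decay=0.01):
    decayed = [p for n, p in named_params if not any(k in n for k in skip_decay)]
    undecayed = [p for n, p in named_params if any(k in n for k in skip_decay)]
    return [{'params': decayed, 'weight_decay': weight_decay},
            {'params': undecayed, 'weight_decay': 0.0}]
# e.g. optimizer_grouped_parameters1 = build_param_groups(param_optimizer, no_decay1)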
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) == False: os.makedirs(args.output_dir, exist_ok=True) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_path = os.path.join(args.data_dir, 'train_merge.csv') train_examples = read_race_examples(train_path) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_choices=4) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in range(int(args.num_train_epochs)): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info("Training Epoch: {}/{}".format( ep + 1, int(args.num_train_epochs))) for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % 100 == 0: logger.info("Training loss: {}, global step: {}".format( tr_loss / nb_tr_steps, global_step)) dev_set = os.path.join(args.data_dir, 'dev_merge.csv') eval_examples = read_race_examples(dev_set) eval_features = convert_examples_to_features( eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field( eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'dev_eval_loss': eval_loss, 'dev_eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Dev results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "merge_pytorch_model_" + str(ep) + ".bin") torch.save(model_to_save.state_dict(), output_model_file) ## Load a trained model that you have fine-tuned ## use this part if you want to load the trained model # model_state_dict = torch.load(output_model_file) # model = BertForMultipleChoice.from_pretrained(args.bert_model, # state_dict=model_state_dict, # num_choices=4) # model.to(device) if args.do_eval and 
(args.local_rank == -1 or torch.distributed.get_rank() == 0): test_dir = os.path.join(args.data_dir, 'test') test_high = [test_dir + '/high'] test_middle = [test_dir + '/middle'] ## test high eval_examples = read_race_examples(test_high) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test high *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() high_eval_loss, high_eval_accuracy = 0, 0 high_nb_eval_steps, high_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) high_eval_loss += tmp_eval_loss.mean().item() high_eval_accuracy += tmp_eval_accuracy high_nb_eval_examples += input_ids.size(0) high_nb_eval_steps += 1 eval_loss = high_eval_loss / high_nb_eval_steps eval_accuracy = high_eval_accuracy / high_nb_eval_examples result = { 'high_eval_loss': eval_loss, 'high_eval_accuracy': eval_accuracy } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## test middle eval_examples = read_race_examples(test_middle) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation: test middle *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() middle_eval_loss, middle_eval_accuracy = 0, 0 middle_nb_eval_steps, middle_nb_eval_examples = 0, 0 for step, batch in enumerate(eval_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, 
segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) middle_eval_loss += tmp_eval_loss.mean().item() middle_eval_accuracy += tmp_eval_accuracy middle_nb_eval_examples += input_ids.size(0) middle_nb_eval_steps += 1 eval_loss = middle_eval_loss / middle_nb_eval_steps eval_accuracy = middle_eval_accuracy / middle_nb_eval_examples result = { 'middle_eval_loss': eval_loss, 'middle_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## all test eval_loss = (middle_eval_loss + high_eval_loss) / ( middle_nb_eval_steps + high_nb_eval_steps) eval_accuracy = (middle_eval_accuracy + high_eval_accuracy) / ( middle_nb_eval_examples + high_nb_eval_examples) result = { 'overall_eval_loss': eval_loss, 'overall_eval_accuracy': eval_accuracy } with open(output_eval_file, "a+") as writer: for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
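# The training and test blocks above repeatedly pack multiple-choice features through
# select_field(). A minimal sketch consistent with how it is called here (assumed; the
# original helper lives elsewhere in this repo): gather one field from every per-choice
# entry of each feature.
def select_field(features, field):
    return [[choice[field] for choice in feature.choices_features]
            for feature in features]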
def main(): logger.info("Running %s" % ' '.join(sys.argv)) parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--scan", default="horizontal", choices=["vertical", "horizontal"], type=str, help="The direction of linearizing table cells.") parser.add_argument( "--data_dir", default="../processed_datasets", type=str, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--output_dir", default="outputs", type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--load_dir", type=str, help= "The output directory where the model checkpoints will be loaded during evaluation" ) parser.add_argument('--load_step', type=int, default=0, help="The checkpoint step to be loaded") parser.add_argument("--fact", default="first", choices=["first", "second"], type=str, help="Whether to put fact in front.") parser.add_argument( "--test_set", default="dev", choices=["dev", "test", "simple_test", "complex_test", "small_test"], help="Which test set is used for evaluation", type=str) parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--balance", action='store_true', help="balance between + and - samples for training.") ## Other parameters parser.add_argument( "--bert_model", default="bert-base-multilingual-cased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="QQP", type=str, help="The name of the task to train.") parser.add_argument('--period', type=int, default=500) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=6, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=20.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() pprint(vars(args)) sys.stdout.flush() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "qqp": QqpProcessor, } output_modes = { "qqp": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") args.output_dir = "{}_fact-{}_{}".format(args.output_dir, args.fact, args.scan) args.data_dir = os.path.join(args.data_dir, "tsv_data_{}".format(args.scan)) logger.info( "Datasets are loaded from {}\n Outputs will be saved to {}".format( args.data_dir, args.output_dir)) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) writer = SummaryWriter(os.path.join(args.output_dir, 'events')) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = 
num_train_optimization_steps // torch.distributed.get_world_size( ) cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if args.load_dir: load_dir = args.load_dir else: load_dir = args.bert_model model = BertForSequenceClassification.from_pretrained( load_dir, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode, fact_place=args.fact, balance=args.balance) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): logger.info("Training epoch {} ...".format(epoch)) nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes 
logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() writer.add_scalar('train/loss', loss, global_step) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: total_norm = 0.0 for n, p in model.named_parameters(): if p.grad is not None: param_norm = p.grad.data.norm(2) total_norm += param_norm.item()**2 total_norm = total_norm**(1. / 2) preds = torch.argmax(logits, -1) == label_ids acc = torch.sum(preds).float() / preds.size(0) writer.add_scalar('train/gradient_norm', total_norm, global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() model.zero_grad() global_step += 1 if (step + 1) % args.period == 0: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_dir = os.path.join( args.output_dir, 'save_step_{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) output_config_file = os.path.join(output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(output_dir) model.eval() torch.set_grad_enabled(False) # turn off gradient tracking evaluate(args, model, device, processor, label_list, num_labels, tokenizer, output_mode, tr_loss, global_step, task_name, tbwriter=writer, save_dir=output_dir) model.train() # turn on train mode torch.set_grad_enabled(True) # start gradient tracking tr_loss = 0 # do eval before exit if args.do_eval: if not args.do_train: global_step = 0 output_dir = None save_dir = output_dir if output_dir is not None else args.load_dir tbwriter = SummaryWriter(os.path.join(save_dir, 'eval/events')) load_step = args.load_step if args.load_dir is not None: load_step = int( os.path.split(args.load_dir)[1].replace('save_step_', '')) print("load_step = {}".format(load_step)) evaluate(args, model, device, processor, label_list, num_labels, tokenizer, output_mode, tr_loss, global_step, task_name, tbwriter=tbwriter, save_dir=save_dir, load_step=load_step)
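# The periodic checkpoints above are written under save_step_{global_step} with the
# predefined WEIGHTS_NAME/CONFIG_NAME files plus the tokenizer vocabulary, so a saved
# step can be reloaded with from_pretrained. A usage sketch (the directory name and
# num_labels=2 are assumptions based on this script's defaults and its binary labels):
from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer

saved_dir = 'outputs_fact-first_horizontal/save_step_500'  # hypothetical checkpoint dir
reloaded_model = BertForSequenceClassification.from_pretrained(saved_dir, num_labels=2)
reloaded_tokenizer = BertTokenizer.from_pretrained(saved_dir, do_lower_case=False)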
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run evaluation.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type = float, default = 0, help = "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") #args = parser.parse_args() args = parser.parse_args(["--train_file","/home/xiongyi/Data/Corpus/small_wiki_sentence_corpus.txt","--do_eval","--bert_model",\ "bert-base-uncased","--output_dir","june10"]) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") #n_gpu = torch.cuda.device_count() device = torch.device("cuda", 1) n_gpu = 1 else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', rank = 1, world_size=2) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) model = DisentangleModel(model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) model.eval() new_model = next(model.children()) ##use probing/downstream_tasks to evaluate the model # Set params for SentEval params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 32, 'tenacity': 3, 'epoch_size': 2} params_senteval['DEbert']=new_model params_senteval['DEbert'].tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) params_senteval['DEbert'].device = device se = senteval.engine.SE(params_senteval, batcher, prepare) transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth', 'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion'] results = se.eval(transfer_tasks) print(results)
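# The SentEval run above expects two hooks: prepare(params, samples) and
# batcher(params, batch), where batcher returns one embedding per sentence as a 2-D
# numpy array. A hypothetical sketch built around the wrapped model stored in
# params['DEbert']; DisentangleModel's forward interface is not shown in this file,
# so the encode() call below is an assumption.
import numpy as np
import torch

def prepare(params, samples):
    return  # nothing to precompute in this sketch

def batcher(params, batch):
    model = params['DEbert']
    tokenizer, device = model.tokenizer, model.device
    embeddings = []
    with torch.no_grad():
        for sent in batch:
            tokens = ['[CLS]'] + tokenizer.tokenize(' '.join(sent))[:510] + ['[SEP]']
            ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], device=device)
            vec = model.encode(ids)  # assumed DisentangleModel API returning (1, hidden)
            embeddings.append(vec.squeeze(0).cpu().numpy())
    return np.vstack(embeddings)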
def main(): # The ArgumentParser object holds all the information needed to parse the command line into the corresponding Python data types parser = argparse.ArgumentParser() # required parameters # add_argument() registers command-line arguments with the ArgumentParser object and tells it how those arguments should be handled parser.add_argument( "--data_dir", default='/users4/xhu/SMP/similarity_data', #default='/home/uniphix/PycharmProjects/SMP/similarity_data', type=str, # required = True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default='bert-base-chinese', type=str, # required = True, help="choose [bert-base-chinese] mode.") parser.add_argument( "--task_name", default='MyPro', type=str, # required = True, help="The name of the task to train.") parser.add_argument( "--output_dir", default='/users4/xhu/SMP/checkpoints/', #default='/home/uniphix/PycharmProjects/SMP/checkpoints/', type=str, # required = True, help="The output directory where the model checkpoints will be written" ) parser.add_argument( "--model_save_pth", default='/users4/xhu/SMP/checkpoints/bert_classification.pth', #default='/home/uniphix/PycharmProjects/SMP/checkpoints/bert_classification.pth', type=str, # required = True, help="The file path where the best model checkpoint will be saved" ) parser.add_argument( "--finetune_save_pth", default='/users4/xhu/SMP/checkpoints_finetune/bert_classification.pth', # default='/home/uniphix/PycharmProjects/SMP/checkpoints/bert_classification.pth', type=str, # required = True, help="The file path where the fine-tuned model checkpoint will be saved" ) # other parameters parser.add_argument("--max_seq_length", default=22, type=int, help="Maximum sequence length") parser.add_argument("--do_train", default=False, action='store_true', help="Training mode") parser.add_argument("--do_interact", default=True, action='store_true', help="Interactive mode") parser.add_argument("--do_eval", default=True, action='store_true', help="Evaluation mode") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Lower-case the input text; relevant for English, has no effect on Chinese") parser.add_argument("--train_batch_size", default=128, type=int, help="Batch size for training") parser.add_argument("--eval_batch_size", default=1, type=int, help="Batch size for evaluation") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam") parser.add_argument("--num_train_epochs", default=3, type=float, help="Number of training epochs") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for." "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", default=-1, type=int, help="local_rank for distributed training on gpus.") parser.add_argument("--seed", default=777, type=int, help="Random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", default=1, type=int, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--optimize_on_cpu", default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU." ) parser.add_argument( "--fp16", default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit.") parser.add_argument( "--loss_scale", default=128, type=float, help= "Loss scaling, positive power of 2 values can improve fp16 convergence."
) parser.add_argument("--use_pretrained", default=True, action='store_true', help="是否使用预训练模型") parser.add_argument("--use_noisy", default=True, action='store_true', help="是否使用负例") parser.add_argument("--use_stop_words", default=True, action='store_true', help="是否使用负例") parser.add_argument("--self_fine_tune", default=False, action='store_true', help="是否使用self fine tune") # fixme args = parser.parse_args() print('*' * 80) print(args) print('*' * 80) # 对模型输入进行处理的processor,git上可能都是针对英文的processor processors = {'mypro': MyPro} GPUmanager = GPUManager() which_gpu = GPUmanager.auto_choice() gpu = "cuda:" + str(which_gpu) logger.info('GPU%d Seleted!!!!!!!!!!!!!!!!!!!' % which_gpu) if args.local_rank == -1 or args.no_cuda: device = torch.device( gpu if torch.cuda.is_available() and not args.no_cuda else "cpu") #device = torch.device("cuda:1" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = 1 #n_gpu = torch.cuda.device_count() else: device = torch.device(gpu, args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") # 删除模型文件 # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # shutil.rmtree(args.output_dir) # #raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) # os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() #label_list = label_list label_list = [0, 1] # 31 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=len(label_list)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in 
param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) # train global_step = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, show_exp=False) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() best_score = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch #label_ids = torch.tensor([f if f<31 else 0 for f in label_ids], dtype=torch.long).to(device) loss = model(input_ids, segment_ids, input_mask, label_ids) #print ('-------------loss:',loss) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: NaN in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() f1 = val(model, processor, args, label_list, tokenizer, device) if f1 > best_score: best_score = f1 print('*f1 score = {}'.format(f1)) checkpoint = {'state_dict': model.state_dict()} torch.save(checkpoint, args.model_save_pth) else: print('f1 score = {}'.format(f1)) # test if args.use_pretrained: model.load_state_dict(torch.load(args.model_save_pth)['state_dict']) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), num_labels=2) model.to(device) if not args.do_interact: test(model, processor, args, label_list, tokenizer, device) else: interact(model, processor, args, label_list, tokenizer, device) # Used to check the F-score when a small corpus is randomly generated from the dtp corpus print(args) # fixme
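# --- Illustrative sketch (not part of the original script) ---
# The fp16 branch above scales the loss up before backward(), divides the gradients back down
# before the update, and halves args.loss_scale (skipping the step) whenever a NaN gradient
# appears. A toy, self-contained version of that static loss-scaling logic; fp16_update is a
# hypothetical helper and it omits the script's CPU master-weight bookkeeping
# (set_optimizer_params_grad / copy_optimizer_params_to_model):
import torch

def fp16_update(model, optimizer, loss, loss_scale):
    (loss * loss_scale).backward()                   # scale up to avoid fp16 underflow
    found_nan = False
    for p in model.parameters():
        if p.grad is None:
            continue
        p.grad.data.div_(loss_scale)                 # unscale gradients before the update
        if torch.isnan(p.grad.data).any():
            found_nan = True
    if found_nan:
        model.zero_grad()
        return loss_scale / 2                        # reduce the scale and skip this step
    optimizer.step()
    model.zero_grad()
    return loss_scale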
if fp16: optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale) warmup_linear = WarmupLinearSchedule( warmup=warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() global_step = 0 nb_tr_steps = 0 tr_loss = 0 for _ in trange(int(num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(
def main(config, model_times, label_list): if not os.path.exists(config.output_dir + model_times): os.makedirs(config.output_dir + model_times) if not os.path.exists(config.cache_dir + model_times): os.makedirs(config.cache_dir + model_times) # BERT model output files output_model_file = os.path.join(config.output_dir, model_times, WEIGHTS_NAME) output_config_file = os.path.join(config.output_dir, model_times, CONFIG_NAME) # Device setup gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()] device, n_gpu = get_device(gpu_ids[0]) if n_gpu > 1: n_gpu = len(gpu_ids) config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps # Set random seeds random.seed(config.seed) np.random.seed(config.seed) torch.manual_seed(config.seed) if n_gpu > 0: torch.cuda.manual_seed_all(config.seed) # Data preparation tokenizer = BertTokenizer.from_pretrained( config.bert_vocab_file, do_lower_case=config.do_lower_case) # tokenizer choice num_labels = len(label_list) # Train and dev if config.do_train: train_dataloader, train_examples_len = load_data( config.data_dir, tokenizer, config.max_seq_length, config.train_batch_size, "train", label_list) dev_dataloader, _ = load_data(config.data_dir, tokenizer, config.max_seq_length, config.dev_batch_size, "dev", label_list) num_train_optimization_steps = int( train_examples_len / config.train_batch_size / config.gradient_accumulation_steps) * config.num_train_epochs # Model setup print("model name is {}".format(config.model_name)) if config.model_name == "BertOrigin": from BertOrigin.BertOrigin import BertOrigin model = BertOrigin.from_pretrained(config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels) elif config.model_name == "BertCNN": from BertCNN.BertCNN import BertCNN filter_sizes = [int(val) for val in config.filter_sizes.split()] model = BertCNN.from_pretrained(config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels, n_filters=config.filter_num, filter_sizes=filter_sizes) elif config.model_name == "BertATT": from BertATT.BertATT import BertATT model = BertATT.from_pretrained(config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels) elif config.model_name == "BertRCNN": from BertRCNN.BertRCNN import BertRCNN model = BertRCNN.from_pretrained( config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels, rnn_hidden_size=config.hidden_size, num_layers=config.num_layers, bidirectional=config.bidirectional, dropout=config.dropout) elif config.model_name == "BertCNNPlus": from BertCNNPlus.BertCNNPlus import BertCNNPlus filter_sizes = [int(val) for val in config.filter_sizes.split()] model = BertCNNPlus.from_pretrained(config.bert_model_dir, cache_dir=config.cache_dir, num_labels=num_labels, n_filters=config.filter_num, filter_sizes=filter_sizes) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model, device_ids=gpu_ids) """ Optimizer setup """ param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=config.warmup_proportion, t_total=num_train_optimization_steps) """ Loss function setup """ criterion = nn.CrossEntropyLoss() criterion = criterion.to(device) train(config.num_train_epochs, n_gpu, model, train_dataloader, dev_dataloader, optimizer, criterion,
config.gradient_accumulation_steps, device, label_list, output_model_file, output_config_file, config.log_dir, config.print_step, config.early_stop) """ Test """ # test data test_dataloader, _ = load_data(config.data_dir, tokenizer, config.max_seq_length, config.test_batch_size, "test", label_list) # Load the trained model bert_config = BertConfig(output_config_file) if config.model_name == "BertOrigin": from BertOrigin.BertOrigin import BertOrigin model = BertOrigin(bert_config, num_labels=num_labels) elif config.model_name == "BertCNN": from BertCNN.BertCNN import BertCNN filter_sizes = [int(val) for val in config.filter_sizes.split()] model = BertCNN(bert_config, num_labels=num_labels, n_filters=config.filter_num, filter_sizes=filter_sizes) elif config.model_name == "BertATT": from BertATT.BertATT import BertATT model = BertATT(bert_config, num_labels=num_labels) elif config.model_name == "BertRCNN": from BertRCNN.BertRCNN import BertRCNN model = BertRCNN(bert_config, num_labels, config.hidden_size, config.num_layers, config.bidirectional, config.dropout) elif config.model_name == "BertCNNPlus": from BertCNNPlus.BertCNNPlus import BertCNNPlus filter_sizes = [int(val) for val in config.filter_sizes.split()] model = BertCNNPlus(bert_config, num_labels=num_labels, n_filters=config.filter_num, filter_sizes=filter_sizes) model.load_state_dict(torch.load(output_model_file)) model.to(device) # Loss function setup criterion = nn.CrossEntropyLoss() criterion = criterion.to(device) # test the model test_loss, test_acc, test_report, test_auc, all_idx, all_labels, all_preds = evaluate_save( model, test_dataloader, criterion, device, label_list) print("-------------- Test -------------") print( f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} % | AUC:{test_auc}' ) for label in label_list: print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format( label, test_report[label]['precision'], test_report[label]['recall'], test_report[label]['f1-score'])) print_list = ['macro avg', 'weighted avg'] for label in print_list: print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format( label, test_report[label]['precision'], test_report[label]['recall'], test_report[label]['f1-score']))
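# --- Illustrative sketch (not part of the original script) ---
# Nearly every snippet in this file builds the same two BertAdam parameter groups: ordinary
# weights receive weight decay, while biases and LayerNorm parameters do not. A minimal
# reusable version of that grouping (build_param_groups is a hypothetical helper, not an
# API of any of the scripts above):
import torch.nn as nn

def build_param_groups(model: nn.Module, weight_decay: float = 0.01):
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decay, skip = [], []
    for name, param in model.named_parameters():
        # route each parameter by whether its name matches a no-decay pattern
        (skip if any(nd in name for nd in no_decay) else decay).append(param)
    return [
        {'params': decay, 'weight_decay': weight_decay},
        {'params': skip, 'weight_decay': 0.0},
    ]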
def run_aug(args, save_every_epoch=False): # Augment the dataset with your own choice of Processer processors = {"toxic": AugProcessor} task_name = args.task_name if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) args.data_dir = os.path.join(args.data_dir, task_name) args.output_dir = os.path.join(args.output_dir, task_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) processor = processors[task_name]() label_list = processor.get_labels(task_name) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForMaskedLM.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) model.cuda() # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_masked_lm_labels = torch.tensor( [f.masked_lm_labels for f in train_features], dtype=torch.long) train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) print(train_data) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) for e in trange(int(args.num_train_epochs), desc="Epoch"): avg_loss = 0. for step, batch in enumerate(train_dataloader): batch = tuple(t.cuda() for t in batch) _, input_ids, input_mask, segment_ids, masked_ids = batch loss = model(input_ids, segment_ids, input_mask, masked_ids) loss.backward() avg_loss += loss.item() optimizer.step() model.zero_grad() if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0 if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
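# --- Illustrative sketch (not part of the original script) ---
# convert_examples_to_features() is not shown in run_aug() above; the masked_lm_labels it
# produces normally follow the standard BERT convention: the label keeps the original token
# id at masked positions and is -1 everywhere else, positions the masked-LM loss ignores.
# A toy sketch of that convention for a 1-D sequence of token ids (mask_for_mlm is a
# hypothetical helper; real pipelines usually also leave some selected tokens unchanged or
# swap in random tokens instead of always using [MASK]):
import random
import torch

def mask_for_mlm(input_ids: torch.Tensor, mask_token_id: int, mlm_prob: float = 0.15):
    """input_ids: 1-D LongTensor of token ids."""
    input_ids = input_ids.clone()
    labels = torch.full_like(input_ids, -1)          # -1 = ignored by the MLM loss
    for i in range(input_ids.numel()):
        if random.random() < mlm_prob:
            labels[i] = input_ids[i]                 # remember the original token
            input_ids[i] = mask_token_id             # replace it with the mask token
    return input_ids, labels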
from apex.optimizers import FusedAdam except ImportError: raise ImportError('please install apex') optimizer = FusedAdam(optimizer_grouped_parameters, lr=getattr(args, 'lr'), bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) loss_fct = CrossEntropyLoss() # train global_step = 0 last_val_loss = 100 epochs = getattr(args, 'num_train_epochs') for i in range(1, epochs + 1): training_loss = 0 model.train() for step, batch in enumerate( tqdm(train_dataloader, desc='train', total=len(train_dataloader))): if torch.cuda.is_available(): batch = tuple(item.cuda() for item in batch) input_ids, input_mask, segment_ids, label_ids = batch
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument( "--test_set", default='story', type=str, #choices=['story', 'news', 'chat', 'train'], help="Choose the test set.") parser.add_argument("--no_logit_mask", action='store_true', help="Whether not to use logit mask") parser.add_argument("--eval_every_epoch", action='store_true', help="Whether to evaluate for every epoch") parser.add_argument("--use_weight", action='store_true', help="Whether to use class-balancing weight") parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention") parser.add_argument( "--state_dir", default="", type=str, help= "Where to load state dict instead of using Google pre-trained model") parser.add_argument( '--ratio', type=float, default=0.9, help="softmax target for the target label, 1-ratio for the abbreviation" ) args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` or 'do_test' must be True." 
) processor = DataProcessor() label_list = processor.get_labels(args.data_dir) abex = processor.get_abex(args.data_dir) num_labels = len(label_list) logger.info("num_labels:" + str(num_labels)) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = len( train_examples ) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps / torch.distributed.get_world_size( ) num_train_optimization_steps = math.ceil(num_train_optimization_steps) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) max_epoch = -1 if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) files = os.listdir(args.output_dir) for fname in files: if re.search(WEIGHTS_NAME, fname) and fname != WEIGHTS_NAME: max_epoch = max(max_epoch, int(fname.split('_')[-1])) if os.path.exists( os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(max_epoch))): output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME + '_' + str(max_epoch)) output_config_file = os.path.join(args.output_dir, CONFIG_NAME + '_0') config = BertConfig(output_config_file) model = BertForAbbr(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: raise ValueError( "Output directory ({}) already exists but no model checkpoint was found." .format(args.output_dir)) else: os.makedirs(args.output_dir, exist_ok=True) if args.state_dir and os.path.exists(args.state_dir): state_dict = torch.load(args.state_dir) if isinstance(state_dict, dict) or isinstance( state_dict, collections.OrderedDict): assert 'model' in state_dict state_dict = state_dict['model'] print("Using my own BERT state dict.") elif args.state_dir and not os.path.exists(args.state_dir): print( "Warning: the state dict does not exist, using the Google pre-trained model instead." ) state_dict = None else: state_dict = None model = BertForAbbr.from_pretrained(args.bert_model, cache_dir=cache_dir, state_dict=state_dict, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if os.path.exists( os.path.join(args.output_dir, OPTIMIZER_NAME + '_' + str(max_epoch))): output_optimizer_file = os.path.join( args.output_dir, OPTIMIZER_NAME + '_' + str(max_epoch)) optimizer.load_state_dict(torch.load(output_optimizer_file)) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features, masks, weight = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio) if args.eval_every_epoch: eval_examples = processor.get_dev_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio) if args.no_logit_mask: print("Remove logit mask") masks = None if not args.use_weight: weight = None hybrid_mask = None writer = SummaryWriter(log_dir=os.environ['HOME']) tag = str(int(time.time())) print(weight) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in train_features], dtype=torch.long) all_targets = torch.tensor([f.targets for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_targets) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, label_ids, label_poss, targets = batch # print(masks.size()) loss = model(input_ids, input_mask, label_ids, logit_masks=masks, weight=weight, hybrid_mask=None, targets=targets) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 writer.add_scalar('data/loss' + tag, loss.item(), global_step) logger.info(f'Trainging loss: {tr_loss/nb_tr_steps}') if args.eval_every_epoch: # evaluate for every epoch # save model and load for a single GPU model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME + '_' + str(ep)) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join( args.output_dir, OPTIMIZER_NAME + '_' + str(ep)) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME + '_' + str(ep)) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model_eval = BertForAbbr(config, num_labels=num_labels) model_eval.load_state_dict(torch.load(output_model_file)) model_eval.to(device) if args.hybrid_attention: hybrid_mask = hybrid_mask.to(device) else: hybrid_mask = None if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor( [f.label_pos for f in eval_features], dtype=torch.long) all_targets = torch.tensor([f.targets for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_targets) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model_eval.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] for input_ids, input_mask, label_ids, label_poss, targets in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) targets = targets.to(device) with torch.no_grad(): tmp_eval_loss = model_eval(input_ids, input_mask, label_ids, logit_masks=masks, hybrid_mask=None, targets=targets) logits = model_eval(input_ids, input_mask, label_ids, logit_masks=masks, cal_loss=False, hybrid_mask=None, targets=targets) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() res_list += 
accuracy_list(logits, label_ids, label_poss) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'epoch': ep + 1, 'eval_loss': eval_loss, 'eval_accuracy': acc, 'global_step': global_step, 'loss': loss } logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) output_eval_file = os.path.join( args.output_dir, "epoch_" + str(ep + 1) + ".txt") with open(output_eval_file, 'w') as f: f.write( json.dumps(result, ensure_ascii=False) + '\n' + json.dumps(char_acc, ensure_ascii=False)) # multi processing # if n_gpu > 1: # model = torch.nn.DataParallel(model) if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_optimizer_file = os.path.join(args.output_dir, OPTIMIZER_NAME) torch.save(optimizer.state_dict(), output_optimizer_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForAbbr(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: # model = BertForPolyphonyMulti.from_pretrained(args.bert_model, num_labels = num_labels) pass model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForAbbr(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) eval_examples = processor.get_dev_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio) hybrid_mask = None if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in eval_features], dtype=torch.long) all_targets = torch.tensor([f.targets for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_targets) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 
nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] # masks = masks.to(device) for input_ids, input_mask, label_ids, label_poss, targets in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) targets = targets.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, input_mask, label_ids, logit_masks=masks, hybrid_mask=None, targets=targets) logits = model(input_ids, input_mask, label_ids, logit_masks=masks, cal_loss=False, hybrid_mask=None, targets=targets) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) res_list += accuracy_list(logits, label_ids, label_poss) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss, 'acc': acc } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for key in sorted(char_acc.keys()): logger.info(" %s = %s", key, str(char_acc[key])) writer.write("%s = %s\n" % (key, str(char_acc[key]))) print("mean accuracy", sum(char_acc[c] for c in char_acc) / len(char_acc)) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForAbbr(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) eval_examples = processor.get_test_examples(args.data_dir) eval_features, masks, weight = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, abex, args.ratio, is_test=True) hybrid_mask = None if args.no_logit_mask: print("Remove logit mask") masks = None else: masks = masks.to(device) chars = [f.char for f in eval_features] print(len(set(chars)), sorted(list(set(chars)))) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_label_poss = torch.tensor([f.label_pos for f in eval_features], dtype=torch.long) all_targets = torch.tensor([f.targets for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids, all_label_poss, all_targets) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, 
batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 res_list = [] out_list = [] # masks = masks.to(device) for input_ids, input_mask, label_ids, label_poss, targets in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) label_ids = label_ids.to(device) label_poss = label_poss.to(device) targets = targets.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, input_mask, label_ids, logit_masks=masks, hybrid_mask=None, targets=targets) logits = model(input_ids, input_mask, label_ids, logit_masks=masks, cal_loss=False, hybrid_mask=None, targets=targets) # print(logits.size()) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) tmp_out, tmp_rlist = result_list(logits, label_ids, label_poss, label_list) #res_list += accuracy_list(logits, label_ids, label_poss) res_list += tmp_rlist out_list += tmp_out eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None acc = sum(res_list) / len(res_list) char_count = {k: [] for k in list(set(chars))} for i, c in enumerate(chars): char_count[c].append(res_list[i]) char_acc = { k: sum(char_count[k]) / len(char_count[k]) for k in char_count } result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss, 'acc': acc } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for key in sorted(char_acc.keys()): logger.info(" %s = %s", key, str(char_acc[key])) writer.write("%s = %s\n" % (key, str(char_acc[key]))) print("mean accuracy", sum(char_acc[c] for c in char_acc) / len(char_acc)) output_acc_file = os.path.join(args.output_dir, "res.json") output_reslist_file = os.path.join(args.output_dir, "outlist.json") with open(output_acc_file, "w") as f: f.write(json.dumps(char_acc, ensure_ascii=False, indent=2)) with open(output_reslist_file, "w") as f: f.write(json.dumps(out_list, ensure_ascii=False, indent=2))
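# --- Illustrative sketch (not part of the original script) ---
# The eval and test blocks above repeat the same aggregation: collect a 0/1 correctness flag
# per example (res_list), bucket the flags by the character each example is keyed on (chars),
# and average within each bucket to obtain char_acc. A compact equivalent
# (per_char_accuracy is a hypothetical helper):
def per_char_accuracy(chars, correct_flags):
    buckets = {}
    for ch, flag in zip(chars, correct_flags):
        buckets.setdefault(ch, []).append(flag)
    return {ch: sum(flags) / len(flags) for ch, flags in buckets.items()}

# e.g. per_char_accuracy(['行', '行', '长'], [1, 0, 1]) -> {'行': 0.5, '长': 1.0}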
def main(): parser = argparse.ArgumentParser() # General parser.add_argument( "--bert_model", default="bert-base-cased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased." ) parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--output_dir", default='tmp', type=str, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--log_file", default="training.log", type=str, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument( "--do_train", action='store_true', help="Whether to run training. This should ALWAYS be set to True.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=30, type=int, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--global_rank", type=int, default=-1, help="global_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp32_embedding', action='store_true', help= "Whether to use 32-bit float precision instead of 32-bit for embeddings" ) parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument( '--from_scratch', action='store_true', help= "Initialize parameters with random values (i.e., training from scratch)." ) parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument( '--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A. 
0 means none.") parser.add_argument('--max_len_b', type=int, default=20, help="Truncate_config: maximum length of segment B.") parser.add_argument( '--trunc_seg', default='b', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument( '--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument( "--mask_prob", default=0.15, type=float, help= "Number of prediction is sometimes less than max_pred when sequence is short." ) parser.add_argument('--max_pred', type=int, default=3, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=4, type=int, help="Number of workers for the data loader.") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") # Others for VLP parser.add_argument( "--src_file", default=['/mnt/dat/COCO/annotations/dataset_coco.json'], type=str, nargs='+', help="The input data file name.") parser.add_argument('--len_vis_input', type=int, default=100) parser.add_argument('--enable_visdom', action='store_true') parser.add_argument('--visdom_port', type=int, default=8888) # parser.add_argument('--resnet_model', type=str, default='imagenet_weights/resnet101.pth') parser.add_argument('--image_root', type=str, default='/mnt/dat/COCO/images') parser.add_argument('--dataset', default='coco', type=str, help='coco | flickr30k | cc') parser.add_argument('--split', type=str, nargs='+', default=['train', 'restval']) parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist_url', default='file://[PT_OUTPUT_DIR]/nonexistent_file', type=str, help='url used to set up distributed training') parser.add_argument( '--file_valid_jpgs', default='/mnt/dat/COCO/annotations/coco_valid_jpgs.json', type=str) parser.add_argument('--sche_mode', default='warmup_linear', type=str, help="warmup_linear | warmup_constant | warmup_cosine") parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--use_num_imgs', default=-1, type=int) parser.add_argument('--vis_mask_prob', default=0, type=float) parser.add_argument('--max_drop_worst_ratio', default=0, type=float) parser.add_argument('--drop_after', default=6, type=int) parser.add_argument( '--s2s_prob', default=1, type=float, help="Percentage of examples that are bi-uni-directional LM (seq2seq)." 
) parser.add_argument( '--bi_prob', default=0, type=float, help="Percentage of examples that are bidirectional LM.") parser.add_argument( '--l2r_prob', default=0, type=float, help= "Percentage of examples that are unidirectional (left-to-right) LM.") parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument( '--region_bbox_file', default= 'coco_detection_vg_thresh0.2_feat_gvd_checkpoint_trainvaltest.h5', type=str) parser.add_argument( '--region_det_file_prefix', default= 'feat_cls_1000/coco_detection_vg_100dets_gvd_checkpoint_trainval', type=str) parser.add_argument('--tasks', default='img2txt', help='img2txt | vqa2') parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--scst', action='store_true', help='Self-critical sequence training') args = parser.parse_args() print('global_rank: {}, local rank: {}'.format(args.global_rank, args.local_rank)) args.max_seq_length = args.max_len_b + args.len_vis_input + 3 # +3 for 2x[SEP] and [CLS] args.mask_image_regions = (args.vis_mask_prob > 0 ) # whether to mask out image regions args.dist_url = args.dist_url.replace('[PT_OUTPUT_DIR]', args.output_dir) # arguments inspection assert (args.tasks in ('img2txt', 'vqa2')) assert args.enable_butd == True, 'only support region attn! featmap attn deprecated' assert ( not args.scst) or args.dataset == 'coco', 'scst support on coco only!' if args.scst: assert args.dataset == 'coco', 'scst support on coco only!' assert args.max_pred == 0 and args.mask_prob == 0, 'no mask for scst!' rl_crit = RewardCriterion() if args.enable_butd: assert (args.len_vis_input == 100) args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file) args.region_det_file_prefix = os.path.join( args.image_root, args.region_det_file_prefix) if args.dataset in ( 'cc', 'coco') and args.region_det_file_prefix != '' else '' # output config os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join(args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) logging.basicConfig( filename=os.path.join(args.output_dir, args.log_file), filemode='w', format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO) logger = logging.getLogger(__name__) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method=args.dist_url, world_size=args.world_size, rank=args.global_rank) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) # plotting loss, optional if args.enable_visdom: import visdom vis = visdom.Visdom(port=args.visdom_port, env=args.output_dir) vis_window = {'iter': None, 'score': None} tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank)) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer( ) if args.tokenized_input else tokenizer if args.do_train: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="s2s", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2')) ] bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="bi", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2'))) bi_uni_pipeline.append( seq2seq_loader.Preprocess4Seq2seq( args.max_pred, args.mask_prob, list(tokenizer.vocab.keys()), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={ 'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail }, mask_image_regions=args.mask_image_regions, mode="l2r", len_vis_input=args.len_vis_input, vis_mask_prob=args.vis_mask_prob, enable_butd=args.enable_butd, region_bbox_file=args.region_bbox_file, region_det_file_prefix=args.region_det_file_prefix, local_rank=args.local_rank, load_vqa_ann=(args.tasks == 'vqa2'))) train_dataset = seq2seq_loader.Img2txtDataset( args.src_file, args.image_root, args.split, args.train_batch_size, data_tokenizer, args.max_seq_length, file_valid_jpgs=args.file_valid_jpgs, bi_uni_pipeline=bi_uni_pipeline, use_num_imgs=args.use_num_imgs, s2s_prob=args.s2s_prob, bi_prob=args.bi_prob, l2r_prob=args.l2r_prob, enable_butd=args.enable_butd, tasks=args.tasks) if args.world_size == 1: train_sampler = RandomSampler(train_dataset, replacement=False) else: train_sampler = DistributedSampler(train_dataset) 
train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=batch_list_to_batch_tensors, pin_memory=True) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) t_total = int( len(train_dataloader) * args.num_train_epochs * 1. / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 if args.new_segment_ids else 2 relax_projection = 4 if args.relax_projection else 0 task_idx_proj = 3 if args.tasks == 'img2txt' else 0 mask_word_id, eos_word_ids, pad_word_ids = tokenizer.convert_tokens_to_ids( ["[MASK]", "[SEP]", "[PAD]"]) # index in BERT vocab: 103, 102, 0 if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init assert args.scst == False, 'must init from maximum likelihood training' _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load( os.path.join(args.output_dir, "model.{0}.bin".format(recover_step))) # recover_step == number of epochs global_step = math.floor(recover_step * t_total * 1. 
/ args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load(args.model_recover_path) global_step = 0 if not args.scst: model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, relax_projection=relax_projection, config_path=args.config_path, task_idx=task_idx_proj, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, cache_dir=args.output_dir + '/.pretrained_model_{}'.format(args.global_rank), drop_prob=args.drop_prob, enable_butd=args.enable_butd, len_vis_input=args.len_vis_input, tasks=args.tasks) else: model = BertForSeq2SeqDecoder.from_pretrained( args.bert_model, max_position_embeddings=args.max_position_embeddings, config_path=args.config_path, state_dict=model_recover, num_labels=cls_num_labels, type_vocab_size=type_vocab_size, task_idx=task_idx_proj, mask_word_id=mask_word_id, search_beam_size=1, eos_id=eos_word_ids, mode='s2s', enable_butd=args.enable_butd, len_vis_input=args.len_vis_input) del model_recover torch.cuda.empty_cache() # deprecated # from vlp.resnet import resnet # cnn = resnet(args.resnet_model, _num_layers=101, _fixed_block=4, pretrained=True) # no finetuning if args.fp16: model.half() # cnn.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) # cnn.to(device) if args.local_rank != -1: try: # from apex.parallel import DistributedDataParallel as DDP from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) # cnn = DDP(cnn) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # cnn = DataParallelImbalance(cnn) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from pytorch_pretrained_bert.optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, schedule=args.sche_mode, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load( os.path.join(args.output_dir, "optim.{0}.bin".format(recover_step))) if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) logger.info(" Loader length = %d", len(train_dataloader)) model.train() if recover_step: start_epoch = recover_step + 1 else: start_epoch = 1 for i_epoch in trange(start_epoch, args.num_train_epochs + 1, desc="Epoch"): if args.local_rank >= 0: train_sampler.set_epoch(i_epoch - 1) iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') nbatches = len(train_dataloader) train_loss = [] pretext_loss = [] vqa2_loss = [] scst_reward = [] for step, batch in enumerate(iter_bar): batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, img, vis_masked_pos, vis_pe, ans_labels = batch if args.fp16: img = img.half() vis_pe = vis_pe.half() if args.enable_butd: conv_feats = img.data # Bx100x2048 vis_pe = vis_pe.data else: conv_feats, _ = cnn(img.data) # Bx2048x7x7 conv_feats = conv_feats.view(conv_feats.size(0), conv_feats.size(1), -1).permute(0, 2, 1).contiguous() if not args.scst: loss_tuple = model( conv_feats, vis_pe, input_ids, segment_ids, input_mask, lm_label_ids, ans_labels, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, vis_masked_pos=vis_masked_pos, mask_image_regions=args.mask_image_regions, drop_worst_ratio=args.max_drop_worst_ratio if i_epoch > args.drop_after else 0) mean_reward = loss_tuple[0].new(1).fill_(0) else: # scst training model.eval() position_ids = torch.arange( input_ids.size(1), dtype=input_ids.dtype, device=input_ids.device).unsqueeze(0).expand_as( input_ids) input_dummy = input_ids[:, :args.len_vis_input + 2] # +2 for [CLS] and [SEP] greedy_res = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) gen_result = input_ids.new( input_ids.size(0), input_ids.size(1) - args.len_vis_input - 2).fill_(0) with torch.no_grad(): greedy_res_raw, _ = model(conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='greedy') for b in range(greedy_res_raw.size(0)): for idx in range(greedy_res_raw.size(1)): if greedy_res_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: greedy_res[b][idx] = greedy_res_raw[b][idx] else: if greedy_res_raw[b][idx] == eos_word_ids: greedy_res[b][idx] = eos_word_ids break model.train() gen_result_raw, sample_logprobs = model( conv_feats, vis_pe, input_dummy, segment_ids, position_ids, input_mask, task_idx=task_idx, sample_mode='sample') for b in range(gen_result_raw.size(0)): for idx 
in range(gen_result_raw.size(1)): if gen_result_raw[b][idx] not in [ eos_word_ids, pad_word_ids ]: gen_result[b][idx] = gen_result_raw[b][idx] else: if gen_result_raw[b][idx] == eos_word_ids: gen_result[b][idx] = eos_word_ids break gt_ids = input_ids[:, args.len_vis_input + 2:] reward = get_self_critical_reward(greedy_res, gt_ids, gen_result, gt_ids.size(0)) reward = torch.from_numpy(reward).float().to( gen_result.device) mean_reward = reward.mean() loss = rl_crit(sample_logprobs, gen_result.data, reward) loss_tuple = [ loss, loss.new(1).fill_(0.), loss.new(1).fill_(0.) ] # disable pretext_loss_deprecated for now masked_lm_loss, pretext_loss_deprecated, ans_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. For dist, this is done through gradient addition. masked_lm_loss = masked_lm_loss.mean() pretext_loss_deprecated = pretext_loss_deprecated.mean() vqa2_loss = ans_loss.mean() loss = masked_lm_loss + pretext_loss_deprecated + ans_loss # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) train_loss.append(loss.item()) pretext_loss.append(pretext_loss_deprecated.item()) vqa2_loss.append(ans_loss.item()) scst_reward.append(mean_reward.item()) if step % 100 == 0: logger.info( "Epoch {}, Iter {}, Loss {:.2f}, Pretext {:.2f}, VQA2 {:.2f}, Mean R {:.3f}\n" .format(i_epoch, step, np.mean(train_loss), np.mean(pretext_loss), np.mean(vqa2_loss), np.mean(scst_reward))) if args.enable_visdom: if vis_window['iter'] is None: vis_window['iter'] = vis.line( X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total'])) else: vis.line(X=np.tile( np.arange((i_epoch - 1) * nbatches + step, (i_epoch - 1) * nbatches + step + 1), (1, 1)).T, Y=np.column_stack( (np.asarray([np.mean(train_loss)]), )), opts=dict(title='Training Loss', xlabel='Training Iteration', ylabel='Loss', legend=['total']), win=vis_window['iter'], update='append') # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "model.{0}.bin".format(i_epoch)) output_optim_file = os.path.join(args.output_dir, "optim.{0}.bin".format(i_epoch)) if args.global_rank in ( -1, 0): # save model if the first device or no dist torch.save( copy.deepcopy(model_to_save).cpu().state_dict(), output_model_file) # torch.save(optimizer.state_dict(), output_optim_file) # disable for now, need to sanitize state and ship everthing back to cpu logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.world_size > 1: torch.distributed.barrier()
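# The fp16 branch above rescales the learning rate by hand with
# warmup_linear(global_step / t_total, args.warmup_proportion). A minimal sketch of
# that schedule, assuming the pytorch_pretrained_bert convention (linear ramp-up to
# the peak LR, then linear decay toward zero); the helper actually imported by this
# script may differ in detail.
def warmup_linear_sketch(progress, warmup=0.002):
    # progress = global_step / t_total; warmup = fraction of steps used for warm-up
    if progress < warmup:
        return progress / warmup
    return max(0.0, 1.0 - progress)

# e.g. lr_this_step = args.learning_rate * warmup_linear_sketch(
#          global_step / t_total, args.warmup_proportion)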
discriminator.to(device, non_blocking=True) param_optimizer = list(discriminator.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] opt = BertAdam(optimizer_grouped_parameters, lr=Lr, warmup=0.1, t_total=Epoch * TrainSet.TtrainNum) bst = 0.0 def test(e, dataset): discriminator.eval() preds = [] labels = [] for words, inMask, maskL, maskR, label in dataset.batchs(): words, inMask, maskL, maskR, label = words.to(device), inMask.to( device), maskL.to(device), maskR.to(device), label.to(device) loss, scores, pred = discriminator(words, inMask, maskL, maskR, label) preds.append(pred[1].cpu().numpy()) labels.append(label.cpu().numpy())
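# The test() helper above collects per-batch predictions and gold labels but is cut
# off before any metric is computed. A minimal, assumed continuation that reduces the
# collected arrays to a single accuracy figure (summarize_eval is hypothetical, not
# part of the original script):
import numpy as np

def summarize_eval(preds, labels):
    preds = np.hstack(preds)      # list of per-batch arrays -> one flat array
    labels = np.hstack(labels)
    return float((preds == labels).mean())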
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) processor = DataProcessor() label_list = processor.get_labels(args.data_dir, args.negative_label) label2id = {label: i for i, label in enumerate(label_list)} id2label = {i: label for i, label in enumerate(label_list)} num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) special_tokens = {} if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) logger.info("***** Dev *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = all_label_ids if args.do_train: train_examples = processor.get_train_examples(args.data_dir) train_features = convert_examples_to_features(train_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", 
num_train_optimization_steps) best_result = None eval_step = max(1, len(train_batches) // args.eval_per_epoch) lrs = [args.learning_rate] if args.learning_rate else \ [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5] for lr in lrs: model = BertForSequenceClassification.from_pretrained( args.model, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE), num_labels=num_labels) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) start_time = time.time() global_step = 0 tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) for step, batch in enumerate(train_batches): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_batches), time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size logger.info("First 20 predictions:") for pred, label in zip( preds[:20], eval_label_ids.numpy()[:20]): sign = u'\u2713' if pred == label else u'\u2718' logger.info( "pred = %s, label = %s %s" % (id2label[pred], id2label[label], sign)) if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( "!!! 
Best dev %s (lr=%s, epoch=%d): %.2f" % (args.eval_metric, str(lr), epoch, result[args.eval_metric] * 100.0)) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: output_eval_file = os.path.join( args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(result.keys()): writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_eval: if args.eval_test: eval_examples = processor.get_test_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label2id, args.max_seq_length, tokenizer, special_tokens, args.feature_mode) logger.info("***** Test *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) eval_label_ids = all_label_ids model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels) with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f: for ex, pred in zip(eval_examples, preds): f.write("%s\t%s\n" % (ex.guid, id2label[pred])) with open(os.path.join(args.output_dir, "test_results.txt"), "w") as f: for key in sorted(result.keys()): f.write("%s = %s\n" % (key, str(result[key])))
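# Nearly every script in this file builds the same two optimizer parameter groups:
# weight decay on most weights, no decay on biases and LayerNorm parameters. A
# self-contained sketch of that idiom in plain PyTorch (the AdamW usage line is only
# an illustration, not what these scripts call):
import torch

def grouped_parameters(model, weight_decay=0.01):
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decay = [p for n, p in model.named_parameters()
             if not any(nd in n for nd in no_decay)]
    no_decay_params = [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)]
    return [{'params': decay, 'weight_decay': weight_decay},
            {'params': no_decay_params, 'weight_decay': 0.0}]

# usage (illustrative): optimizer = torch.optim.AdamW(grouped_parameters(model), lr=5e-5)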
def main(): if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer(vocab_file=args.vocab_file) train_examples = None num_train_optimization_steps = None vocab_list = [] with open(args.vocab_file, 'r') as fr: for line in fr: vocab_list.append(line.strip("\n")) if args.do_train: train_examples = create_examples( data_path=args.pretrain_train_path, max_seq_length=args.max_seq_length, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, vocab_list=vocab_list) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) model = BertForMaskedLM( config=BertConfig.from_json_file(args.bert_config_json)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 best_loss = 100000 if args.do_train: train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for e in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # masked_lm_loss loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if nb_tr_steps > 0 and nb_tr_steps % 100 == 0: logger.info( "===================== -epoch %d -train_step %d -train_loss %.4f\n" % (e, nb_tr_steps, tr_loss / nb_tr_steps)) if nb_tr_steps > 0 and nb_tr_steps % 2000 == 0: eval_examples = create_examples( data_path=args.pretrain_dev_path, max_seq_length=args.max_seq_length, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, vocab_list=vocab_list) eval_features = convert_examples_to_features( eval_examples, args.max_seq_length, tokenizer) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 for input_ids, 
input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): loss = model(input_ids, segment_ids, input_mask, label_ids) eval_loss += loss.item() nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps if eval_loss < best_loss: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) best_loss = eval_loss logger.info( "============================ -epoch %d -train_loss %.4f -eval_loss %.4f\n" % (e, tr_loss / nb_tr_steps, eval_loss))
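# create_examples() above is driven by masked_lm_prob and max_predictions_per_seq.
# A minimal sketch of the standard BERT-style masking those arguments usually imply
# (80% [MASK], 10% random token, 10% unchanged); the project's own helper is not
# shown here and may differ.
import random

def mask_tokens(tokens, vocab_list, masked_lm_prob=0.15, max_predictions_per_seq=20):
    tokens = list(tokens)
    candidates = [i for i, t in enumerate(tokens) if t not in ('[CLS]', '[SEP]')]
    random.shuffle(candidates)
    n_to_mask = min(max_predictions_per_seq,
                    max(1, int(round(len(candidates) * masked_lm_prob))))
    masked_positions, masked_labels = [], []
    for pos in sorted(candidates[:n_to_mask]):
        masked_positions.append(pos)
        masked_labels.append(tokens[pos])
        roll = random.random()
        if roll < 0.8:
            tokens[pos] = '[MASK]'
        elif roll < 0.9:
            tokens[pos] = random.choice(vocab_list)
        # else: keep the original token as its own prediction target
    return tokens, masked_positions, masked_labels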
def main(): parser = argparse.ArgumentParser() # arguments parser.add_argument("--input_dir", default="./data/", type=str, help="The input data dir." ) parser.add_argument("--output_dir", default="./ss/tmp/", type=str, help="The output dir where the model predictions will be stored." ) parser.add_argument("--checkpoints_dir", default="./ss/checkpoints/", type=str, help="Where checkpoints will be stored." ) parser.add_argument("--cache_dir", default="./data/models/", type=str, help="Where do you want to store the pre-trained models" "downloaded from pytorch pretrained model." ) parser.add_argument("--batchsize", default=4, type=int, help="Batch size for (positive) training examples." ) parser.add_argument("--negative_batchsize", default=4, type=int, help="Batch size for (negative) training examples." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate." ) parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--num_train_epochs", default=5, type=int, help="Total number of training epochs." ) parser.add_argument("--seed", default=42, type=int, help="random seed." ) parser.add_argument("--max_length", default=512, type=int, help="The maximum total input sequence length after tokenized." "If longer than this, it will be truncated, else will be padded." ) args = parser.parse_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.DEBUG) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if not os.path.exists(args.checkpoints_dir): os.makedirs(args.checkpoints_dir) if not os.path.exists(args.cache_dir): os.makedirs(args.cache_dir) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) num_labels = 2 criterion = CrossEntropyLoss() # Make sure to pass do_lower_case=False when use multilingual-cased model. # See https://github.com/google-research/bert/blob/master/multilingual.md tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False) model = build_ss_model(args.cache_dir, num_labels) # prepare the dataset train_dataset = get_dataset(args.input_dir, 'train') val_dataset = get_dataset(args.input_dir, 'test') # convert dataset into BERT's input formats train_examples_pos, train_examples_neg = split_pos_neg_examples(train_dataset) train_features = convert_train_dataset( train_examples_pos, tokenizer, args.max_length ) train_features_neg = convert_train_dataset( train_examples_neg, tokenizer, args.max_length ) val_features = convert_valid_dataset( val_dataset, tokenizer, args.max_length ) # prepare optimizer num_train_optimization_steps = int(len(train_examples_pos) / args.batchsize) * args.num_train_epochs optimizer = BertAdam(model.parameters(), lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) model.to(device) global_step = 0 # TRAINING !! 
logger.info("=== Running training ===") logger.info("===== Num (pos) examples : %d", len(train_examples_pos)) logger.info("===== Batch size : %d", args.batchsize) logger.info("===== Num steps : %d", num_train_optimization_steps) # prepare positive/negative train dataset all_input_ids = torch.LongTensor([x['input_ids'] for x in train_features]) all_segment_ids = torch.LongTensor([x['segment_ids'] for x in train_features]) all_input_mask = torch.LongTensor([x['input_mask'] for x in train_features]) all_label = torch.LongTensor([x['label'] for x in train_features]) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) all_input_ids_neg = torch.LongTensor([x['input_ids'] for x in train_features_neg]) all_segment_ids_neg = torch.LongTensor([x['segment_ids'] for x in train_features_neg]) all_input_mask_neg = torch.LongTensor([x['input_mask'] for x in train_features_neg]) all_label_neg = torch.LongTensor([x['label'] for x in train_features_neg]) train_data_neg = TensorDataset(all_input_ids_neg, all_input_mask_neg, all_segment_ids_neg, all_label_neg) train_sampler = RandomSampler(train_data) train_sampler_neg = RandomSampler(train_data_neg) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.batchsize, drop_last=True ) negative_dataloader = DataLoader( train_data_neg, sampler=train_sampler_neg, batch_size=args.negative_batchsize, drop_last=True ) # training max_acc = 0 for epoch in range(int(args.num_train_epochs)): model.train() tr_loss, num_tr_examples, num_tr_steps = 0, 0, 0 temp_tr_loss, temp_num_tr_exs, temp_num_tr_steps = 0, 0, 0 it = iter(negative_dataloader) for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) batch_neg = tuple(t.to(device) for t in next(it)) input_ids, input_mask, segment_ids, labels = batch input_ids_neg, input_mask_neg, segment_ids_neg, labels_neg = batch_neg # batchify input_ids_cat=torch.cat([input_ids, input_ids_neg],dim=0) segment_ids_cat=torch.cat([segment_ids, segment_ids_neg],dim=0) input_mask_cat=torch.cat([input_mask,input_mask_neg],dim=0) label_ids_cat=torch.cat([labels.view(-1), labels_neg.view(-1)], dim = 0) model.zero_grad() # compute loss and backpropagate loss, logits = model( input_ids_cat, token_type_ids=segment_ids_cat, attention_mask=input_mask_cat, labels=label_ids_cat ) loss.backward() clip_grad_norm_(model.parameters(), 1.0) optimizer.step() global_step += 1 tr_loss += loss.item() num_tr_examples += input_ids.size(0) num_tr_steps += 1 # logging every 0.05 epoch temp_tr_loss += loss.item() temp_num_tr_exs += input_ids.size(0) temp_num_tr_steps += 1 if (step + 1) % (len(train_dataloader) // 20) == 0: logger.info("Epoch %d/%d - step %d/%d" % ((epoch+1), args.num_train_epochs, step, len(train_dataloader))) logger.info("# of examples %d" % temp_num_tr_exs) logger.info("temp loss %f" % (temp_tr_loss / temp_num_tr_steps)) temp_tr_loss, temp_num_tr_exs, temp_num_tr_steps = 0, 0, 0 # logging every 1 epoch print('===== Epoch %d done.' 
% (epoch+1)) print('===== Average training loss', tr_loss / num_tr_steps) # validate every 1 epoch logger.info("=== Running validation ===") model.eval() eval_loss, eval_acc, eval_r5 = 0, 0, 0 for example in tqdm(val_features, desc="Iteration"): input_ids = torch.LongTensor(example['input_ids']).to(device) segment_ids = torch.LongTensor(example['segment_ids']).to(device) input_mask = torch.LongTensor(example['input_mask']).to(device) label = torch.LongTensor(example['label']).to(device) with torch.no_grad(): loss, logits = model( input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label ) eval_loss += loss.item() temp_acc, temp_r5 = calculate_metric(logits, label) eval_acc += temp_acc eval_r5 += temp_r5 eval_acc_ = eval_acc / len(val_features) if max_acc < eval_acc_ : max_acc = eval_acc_ torch.save({'epoch': epoch + 1, 'model_state': model.state_dict(), 'optimizer_state' : optimizer.state_dict()}, os.path.join(args.checkpoints_dir, 'best_ckpt.pth')) # logging validation results print('===== Validation loss', eval_loss / len(val_features)) print('===== Validation accuracy', eval_acc / len(val_features)) print('===== Validation R@5', eval_r5 / len(val_features))
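# calculate_metric() used in the validation loop above is not shown. A plausible
# sketch consistent with how it is called (per-example logits over candidate
# sentences, labels marking the gold ones), returning top-1 accuracy and recall@5;
# this is an assumption about its behaviour, not the original implementation.
import torch

def calculate_metric_sketch(logits, labels, k=5):
    # logits: (num_candidates, 2) classification scores; labels: (num_candidates,)
    scores = torch.softmax(logits, dim=-1)[:, 1]          # P(candidate is relevant)
    gold = set(torch.nonzero(labels, as_tuple=False).flatten().tolist())
    if not gold:
        return 0.0, 0.0
    top1 = scores.argmax().item()
    topk = torch.topk(scores, min(k, scores.numel())).indices.tolist()
    acc = 1.0 if top1 in gold else 0.0
    r_at_k = 1.0 if any(i in gold for i in topk) else 0.0
    return acc, r_at_k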
def main(args): device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = \ args.train_batch_size // args.gradient_accumulation_steps if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train: assert (args.train_file is not None) and (args.dev_file is not None) if args.eval_test: assert args.test_file is not None else: assert args.dev_file is not None if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.do_train: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "train.log"), 'w')) else: logger.addHandler( logging.FileHandler(os.path.join(args.output_dir, "eval.log"), 'w')) logger.info(args) tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case) if args.do_train or (not args.eval_test): # with gzip.GzipFile(args.test_file, 'r') as reader: with open(args.dev_file, 'r', encoding='utf-8') as f: # skip header # content = reader.read().decode('utf-8').strip().split('\n')[1:] # input_data = [json.loads(line) for line in content] content = f.read().strip().split('\n') eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=args.dev_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Dev *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) if args.do_train: train_examples = read_mrqa_examples(input_file=args.train_file, is_training=True) train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = 
torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs logger.info("***** Train *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) eval_step = max(1, len(train_batches) // args.eval_per_epoch) best_result = None lrs = [args.learning_rate] if args.learning_rate else [ 1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5 ] for lr in lrs: model = None if not args.finetuning_dir is None: # load model model = BertForQuestionAnswering.from_pretrained( args.finetuning_dir) if args.fp16: model.half() model.to(device) else: model = BertForQuestionAnswering.from_pretrained( args.model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE) if args.fp16: model.half() model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) param_optimizer = [ n for n in param_optimizer if 'pooler' not in n[0] ] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex" "to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=lr, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 global_step = 0 start_time = time.time() for epoch in range(int(args.num_train_epochs)): model.train() logger.info("Start epoch #{} (lr = {})...".format(epoch, lr)) for step, batch in enumerate(train_batches): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = lr * \ warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if (step + 1) % eval_step == 0: logger.info( 'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}' .format(epoch, step + 1, len(train_dataloader), 
time.time() - start_time, tr_loss / nb_tr_steps)) save_model = False if args.do_eval: result, _, _ = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) model.train() result['global_step'] = global_step result['epoch'] = epoch result['learning_rate'] = lr result['batch_size'] = args.train_batch_size if (best_result is None) or (result[args.eval_metric] > best_result[args.eval_metric]): best_result = result save_model = True logger.info( "!!! Best dev %s (lr=%s, epoch=%d): %.2f" % (args.eval_metric, str(lr), epoch, result[args.eval_metric])) else: save_model = True if save_model: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file( output_config_file) tokenizer.save_vocabulary(args.output_dir) if best_result: with open( os.path.join(args.output_dir, EVAL_FILE), "w") as writer: for key in sorted(best_result.keys()): writer.write( "%s = %s\n" % (key, str(best_result[key]))) if args.do_eval: if args.eval_test: # load model model = BertForQuestionAnswering.from_pretrained(args.output_dir) if args.fp16: model.half() model.to(device) f_result = open(os.path.join(args.output_dir, 'test_results.txt'), "w") # list of test files # testing_files = ['dev.human', 'dev.human.bridge', 'dev.human.comparison'] # for testing_file in testing_files: test_path = args.test_file with open(test_path, 'r', encoding='utf-8') as f: content = f.read().strip().split('\n') eval_dataset = [json.loads(line) for line in content] eval_examples = read_mrqa_examples(input_file=test_path, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Test *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size) result, preds, nbest_preds = \ evaluate(args, model, device, eval_dataset, eval_dataloader, eval_examples, eval_features) with open(os.path.join(args.output_dir, 'predictions.txt'), "w") as writer: writer.write(json.dumps(preds, indent=4) + "\n") f_result.write('Evaluation for {}\n'.format(test_path)) for key in sorted(result.keys()): f_result.write("%s = %s\n" % (key, str(result[key]))) f_result.write('\n') f_result.close()
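# evaluate() above ultimately has to turn per-token start/end logits into an answer
# span. A minimal sketch of the usual selection rule (highest-scoring start/end pair
# with end >= start and a length cap); the project's evaluate() also builds n-best
# lists and maps token positions back to text, which is omitted here.
def best_span(start_logits, end_logits, max_answer_length=30):
    # start_logits, end_logits: sequences of per-token scores for one passage
    best, best_score = (0, 0), float('-inf')
    for s in range(len(start_logits)):
        for e in range(s, min(s + max_answer_length, len(end_logits))):
            score = start_logits[s] + end_logits[e]
            if score > best_score:
                best_score, best = score, (s, e)
    return best, best_score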
def interact(model, processor, args, label_list, tokenizer, device, fine_tune=True): # TODO ''' Self fine-tune using the topic set. Args: model: the model processor: data reading helper args: argument namespace label_list: all possible classes tokenizer: tokenization method device Returns: f1: F1 score ''' # modify label_list # topic_dict, query_dict = processor.get_interact_examples( args.data_dir, args.use_noisy) # obtain the two dictionaries predicts, raw_predicts, truths = [], [], [] A, B, C = 0, 0, 0 # misclassification, threshold too high, threshold too low, respectively yes = 0.0 if args.self_fine_tune: logger.info('*************fine-tune!*************') ## self fine-tune ## label_list = [0, 1] if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] # prepare data examples, reverse_topic_dict = processor._create_finetune_examples( topic_dict, args.use_stop_words) train_features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, show_exp=False) num_train_steps = int( len(examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) logger.info("***** Running training *****") logger.info("len of examples = %d", len(examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() best_score = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # label_ids = torch.tensor([f if f<31 else 0 for f in label_ids],
dtype=torch.long).to(device) loss = model(input_ids, segment_ids, input_mask, label_ids) # print ('-------------loss:',loss) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() checkpoint = {'state_dict': model.state_dict()} torch.save(checkpoint, args.finetune_save_pth) else: logger.info('*************not fine-tune!*************') ## interaction ## label_list = range(len(topic_dict) + 1) for item in query_dict.items(): # one query sentence truths.append(item[1]) # record the ground-truth label examples, query, reverse_topic_dict = processor._create_interact_examples( item, topic_dict, args.use_stop_words) # a list of len(topic) InputExamples interact_features = convert_examples_to_features( examples, label_list, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in interact_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in interact_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in interact_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in interact_features], dtype=torch.long) interact_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data interact_sampler = SequentialSampler(interact_data) interact_dataloader = DataLoader(interact_data, sampler=interact_sampler, batch_size=args.eval_batch_size) model.eval() predict = np.zeros((0, ), dtype=np.int32) #gt = np.zeros((0,), dtype=np.int32) for input_ids, input_mask, segment_ids, label_ids in interact_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) # (1, 2) #pred = logits[1][1] # the match score TODO pred = np.array(torch.nn.functional.softmax(torch.tensor(logits, \ dtype=torch.float).to(device)[0])[1].cpu()) # the predicted score predict = np.hstack((predict, pred)) #gt = np.hstack((gt, label_ids.cpu().numpy())) # gold target/ query label p_value = np.max(predict) # value of the highest score #print (type(np.where(predict==p_value))) p_label = int(list( np.where(predict == p_value)[0])[0]) # predicted label; on ties, keep only the smallest index #print (p_value, p_label) if args.use_noisy: raw_predicts.append(p_label + 1) # keep the raw prediction predicts.append(p_label + 1 if p_value > 0.83 else 0) # note: +1 so the index matches the topic numbering else: predicts.append(p_label + 1) raw_predicts.append(p_label + 1) if (predicts[-1] != truths[-1]): if truths[-1] != 0: if raw_predicts[-1] != truths[-1]: # misclassification A += 1 print('\nerror type A: misclassification when encountering: {} while the real topic is: {} \
confidence: {:.3f}, (g, p)=({},{})\n' .format(query, reverse_topic_dict[truths[-1]],\ p_value, p_label + 1, truths[-1])) # threshold too high else: B += 1 print('\nerror type B: threshold too high when encountering: {} while the real topic is: {} \ confidence: {:.3f}, (g, p)=({},{})\n' .format(query, reverse_topic_dict[truths[-1]], \ p_value, p_label + 1, truths[-1])) else: # negative example misclassified, threshold too low C += 1 print( '\nerror type C: negative example misclassified, threshold too low, when encountering: {} while real tag is negative \ confidence: {:.3f}, (g, p)=({},{})\n'.format( query, p_value, p_label + 1, truths[-1])) # misclassified else: yes += 1 #logits = logits.detach().cpu().numpy() #label_ids = label_ids.to('cpu').numpy() confuse_mat = [ 1 if p == t and p != 0 else 0 for p, t in zip(predicts, truths) ] # TP pp = 1.0 * sum(confuse_mat) / ( (sum([1 if p > 0 else 0 for p in predicts])) + 1e-10) r = 1.0 * sum(confuse_mat) / (sum([1 if p > 0 else 0 for p in truths]) + 1e-10) f = 2 * pp * r / (pp + r + 1e-10) acc = yes / len(predicts) #f1 = np.mean(metrics.f1_score(predict, gt, average=None)) print( '\rF1 score on the test set is {:.3f}; acc is {:.3f}; A,B,C={},{},{} '. format(f, acc, A, B, C), end=''), sys.stdout.flush() return
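# The interaction branch above keeps the top-scoring topic only when its softmax
# probability clears a fixed threshold (0.83) and otherwise predicts class 0
# ("no matching topic"). A compact restatement of that decision rule (pick_topic is
# an illustrative name, not part of the original code):
import numpy as np

def pick_topic(scores, threshold=0.83, use_threshold=True):
    # scores: per-topic match probabilities for one query; topics are 1-indexed
    best = int(np.argmax(scores))
    if use_threshold and scores[best] <= threshold:
        return 0                    # reject: the query matches no known topic
    return best + 1                 # +1 so the id lines up with the topic dictionary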
def train_and_test(data_dir, bert_model="bert-base-uncased", task_name=None, output_dir=None, max_seq_length=80, do_train=False, do_eval=False, do_lower_case=False, train_batch_size=24, eval_batch_size=8, learning_rate=2e-5, num_train_epochs=50, warmup_proportion=0.1, no_cuda=False, local_rank=-1, seed=42, gradient_accumulation_steps=1, optimize_on_cpu=False, fp16=False, loss_scale=128, saved_model=""): # ## Required parameters # parser.add_argument("--data_dir", # default=None, # type=str, # required=True, # help="The input data dir. Should contain the .tsv files (or other data files) for the task.") # parser.add_argument("--bert_model", default=None, type=str, required=True, # help="Bert pre-trained model selected in the list: bert-base-uncased, " # "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") # parser.add_argument("--task_name", # default=None, # type=str, # required=True, # help="The name of the task to train.") # parser.add_argument("--output_dir", # default=None, # type=str, # required=True, # help="The output directory where the model checkpoints will be written.") ## Other parameters # parser.add_argument("--max_seq_length", # default=128, # type=int, # help="The maximum total input sequence length after WordPiece tokenization. \n" # "Sequences longer than this will be truncated, and sequences shorter \n" # "than this will be padded.") # parser.add_argument("--do_train", # default=False, # action='store_true', # help="Whether to run training.") # parser.add_argument("--do_eval", # default=False, # action='store_true', # help="Whether to run eval on the dev set.") # parser.add_argument("--do_lower_case", # default=False, # action='store_true', # help="Set this flag if you are using an uncased model.") # parser.add_argument("--train_batch_size", # default=32, # type=int, # help="Total batch size for training.") # parser.add_argument("--eval_batch_size", # default=8, # type=int, # help="Total batch size for eval.") # parser.add_argument("--learning_rate", # default=5e-5, # type=float, # help="The initial learning rate for Adam.") # parser.add_argument("--num_train_epochs", # default=3.0, # type=float, # help="Total number of training epochs to perform.") # parser.add_argument("--warmup_proportion", # default=0.1, # type=float, # help="Proportion of training to perform linear learning rate warmup for. 
" # "E.g., 0.1 = 10%% of training.") # parser.add_argument("--no_cuda", # default=False, # action='store_true', # help="Whether not to use CUDA when available") # parser.add_argument("--local_rank", # type=int, # default=-1, # help="local_rank for distributed training on gpus") # parser.add_argument('--seed', # type=int, # default=42, # help="random seed for initialization") # parser.add_argument('--gradient_accumulation_steps', # type=int, # default=1, # help="Number of updates steps to accumulate before performing a backward/update pass.") # parser.add_argument('--optimize_on_cpu', # default=False, # action='store_true', # help="Whether to perform optimization and keep the optimizer averages on CPU") # parser.add_argument('--fp16', # default=False, # action='store_true', # help="Whether to use 16-bit float precision instead of 32-bit") # parser.add_argument('--loss_scale', # type=float, default=128, # help='Loss scaling, positive power of 2 values can improve fp16 convergence.') # args = parser.parse_args() processors = { # "cola": ColaProcessor, # "mnli": MnliProcessor, "mrpc": MrpcProcessor, "stance": StanceProcessor, "neg": NegProcessor } if local_rank == -1 or no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if fp16: logger.info( "16-bits training currently not supported in distributed training" ) fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(local_rank != -1)) if gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(gradient_accumulation_steps)) train_batch_size = int(train_batch_size / gradient_accumulation_steps) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) if n_gpu > 0: torch.cuda.manual_seed_all(seed) if not do_train and not do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if do_train: # if os.path.exists(output_dir) and os.listdir(output_dir): if os.path.exists(output_dir): pass # raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir)) else: os.makedirs(output_dir, exist_ok=True) task_name = task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() # tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') train_examples = None num_train_steps = None if do_train: train_df = processor.get_train_df(data_dir) new_train_df = generate_opp_dataset(train_df) new_train_df.to_csv(os.path.join(data_dir, "new_train.tsv"), sep='\t', index=False) train_examples = processor.get_train_examples(data_dir) num_train_steps = int( len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs) # Prepare model # model = BertForSequenceClassification.from_pretrained(bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank), num_labels = 2) model = BertForConsistencyCueClassification.from_pretrained( 'bert-base-uncased', num_labels=2) model.to(device) if fp16: model.half() if local_rank != -1: model = 
torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if fp16: param_optimizer = [ (n, param.clone().detach().to('cpu').float().requires_grad_()) for n, param in model.named_parameters() ] elif optimize_on_cpu: param_optimizer = [ (n, param.clone().detach().to('cpu').requires_grad_()) for n, param in model.named_parameters() ] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps # print(t_total) if local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if do_train: optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=t_total) global_step = 0 if do_train: claim_features = convert_claims_to_features(train_examples, label_list, max_seq_length, tokenizer) train_features = convert_pers_to_features(train_examples, label_list, max_seq_length, tokenizer) logger.info("perspective features done") opposite_claim_features = convert_opp_claims_to_features( train_examples, label_list, max_seq_length, tokenizer) logger.info("opposite claim features done") opposite_perspective_features = convert_opp_pers_to_features( train_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", train_batch_size) logger.info(" Num steps = %d", num_train_steps) pers_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) pers_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) pers_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) pers_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long) claims_input_mask = torch.tensor( [f.input_mask for f in claim_features], dtype=torch.long) claims_segment_ids = torch.tensor( [f.segment_ids for f in claim_features], dtype=torch.long) claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long) opp_pers_input_ids = torch.tensor( [f.input_ids for f in opposite_perspective_features], dtype=torch.long) opp_pers_input_mask = torch.tensor( [f.input_mask for f in opposite_perspective_features], dtype=torch.long) opp_pers_segment_ids = torch.tensor( [f.segment_ids for f in opposite_perspective_features], dtype=torch.long) opp_pers_label_ids = torch.tensor( [f.label_id for f in opposite_perspective_features], dtype=torch.long) # opp_pers_input_ids = torch.tensor([f.input_ids for f in opposite_perspective_features if f.input_ids], dtype=torch.long) # opp_pers_input_mask = torch.tensor([f.input_mask for f in opposite_perspective_features if f.input_mask], dtype=torch.long) # opp_pers_segment_ids = torch.tensor([f.segment_ids for f in opposite_perspective_features if f.segment_ids], dtype=torch.long) # opp_pers_label_ids = torch.tensor([f.label_id for f in opposite_perspective_features if f.label_id], dtype=torch.long) opp_claims_input_ids = torch.tensor( [f.input_ids for f in opposite_claim_features], dtype=torch.long) opp_claims_input_mask = 
torch.tensor( [f.input_mask for f in opposite_claim_features], dtype=torch.long) opp_claims_segment_ids = torch.tensor( [f.segment_ids for f in opposite_claim_features], dtype=torch.long) opp_claims_label_ids = torch.tensor( [f.label_id for f in opposite_claim_features], dtype=torch.long) # logger.info(" opp pers id: %d, opp pers mask: %d, opp pers seg: %d, opp pers label: %d, opp calims label: %d, calims label: %d ", len(opp_pers_input_ids),len(opp_pers_input_mask),len(opp_pers_segment_ids),len(opp_pers_label_ids),len(opp_claims_label_ids),len(claims_label_ids)) train_data = TensorDataset( pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids, opp_claims_input_ids, opp_claims_input_mask, opp_claims_segment_ids, opp_claims_label_ids) if local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size) model.train() for _ in trange(int(num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 process_bar = tqdm(train_dataloader) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids, opp_claim_input_ids, opp_claim_input_mask, opp_claim_segment_ids, opp_claim_label_ids = batch out_results = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask, labels4=opp_claim_label_ids) # loss = model(input_ids, segment_ids, input_mask, label_ids) # print("out_results:") # print(out_results) loss = out_results if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if fp16 and loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * loss_scale if gradient_accumulation_steps > 1: loss = loss / gradient_accumulation_steps process_bar.set_description("Loss: %0.8f" % (loss.sum().item())) loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % gradient_accumulation_steps == 0: if fp16 or optimize_on_cpu: if fp16 and loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) loss_scale = loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 print("\nLoss: {}\n".format(tr_loss / nb_tr_steps)) torch.save( model.state_dict(), output_dir + "fuse_cosloss_1111_2e5_neg_siamese_bert_epoch30.pth") if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0): test_df = processor.get_test_df(data_dir) # new_test_df = generate_opp_dataset(test_df) # new_test_df.to_csv(os.path.join(data_dir, "new_test.tsv"),sep='\t',index=False) train_df = processor.get_train_df(data_dir) # new_train_df = generate_opp_dataset(train_df) # new_train_df.to_csv(os.path.join(data_dir, "new_train.tsv"),sep='\t',index=False) dev_df = processor.get_dev_df(data_dir) # new_dev_df = generate_opp_dataset(dev_df) # new_dev_df.to_csv(os.path.join(data_dir, "new_dev.tsv"),sep='\t',index=False) eval_examples = processor.get_test_examples(data_dir) # eval_examples = processor.get_dev_examples(data_dir) claim_features = convert_claims_to_features(eval_examples, label_list, max_seq_length, tokenizer) eval_features = convert_pers_to_features(eval_examples, label_list, max_seq_length, tokenizer) opposite_claim_features = convert_opp_claims_to_features( eval_examples, label_list, max_seq_length, tokenizer) opposite_eval_features = convert_opp_pers_to_features( eval_examples, label_list, max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", eval_batch_size) pers_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) pers_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) pers_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) pers_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) claims_input_ids = torch.tensor([f.input_ids for f in claim_features], dtype=torch.long) claims_input_mask = torch.tensor( [f.input_mask for f in claim_features], dtype=torch.long) claims_segment_ids = torch.tensor( [f.segment_ids for f in claim_features], dtype=torch.long) claims_label_ids = torch.tensor([f.label_id for f in claim_features], dtype=torch.long) opp_pers_input_ids = torch.tensor( [f.input_ids for f in opposite_eval_features], dtype=torch.long) opp_pers_input_mask = torch.tensor( [f.input_mask for f in opposite_eval_features], dtype=torch.long) opp_pers_segment_ids = torch.tensor( [f.segment_ids for f in opposite_eval_features], dtype=torch.long) opp_pers_label_ids = torch.tensor( [f.label_id for f in opposite_eval_features], 
dtype=torch.long) opp_claims_input_ids = torch.tensor( [f.input_ids for f in opposite_claim_features], dtype=torch.long) opp_claims_input_mask = torch.tensor( [f.input_mask for f in opposite_claim_features], dtype=torch.long) opp_claims_segment_ids = torch.tensor( [f.segment_ids for f in opposite_claim_features], dtype=torch.long) opp_claims_label_ids = torch.tensor( [f.label_id for f in opposite_claim_features], dtype=torch.long) # logger.info("%d%d%d%d", len(pers_input_ids),len(claims_input_ids),len(opp_pers_input_ids),len(opp_claims_input_ids)) eval_data = TensorDataset(pers_input_ids, pers_input_mask, pers_segment_ids, pers_label_ids, claims_input_ids, claims_input_mask, claims_segment_ids, claims_label_ids, opp_pers_input_ids, opp_pers_input_mask, opp_pers_segment_ids, opp_pers_label_ids, opp_claims_input_ids, opp_claims_input_mask, opp_claims_segment_ids, opp_claims_label_ids) # logger.info(eval_data) # Run prediction for full data # eval_sampler = SequentialSampler(eval_data) eval_sampler = SequentialSampler(eval_data) # logger.info("1") eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) # print('all_input_ids:') # print(all_input_ids) # logger.info("2") # model.load_state_dict(torch.load(saved_model)) model_state_dict = torch.load(saved_model) # logger.info("3") model = BertForConsistencyCueClassification.from_pretrained( 'bert-base-uncased', num_labels=2, state_dict=model_state_dict) # logger.info("4") model.to(device) # logger.info("5") model.eval() # logger.info("6") # eval_loss, eval_accuracy = 0, 0 eval_tp, eval_pred_c, eval_gold_c = 0, 0, 0 eval_loss, eval_accuracy, eval_macro_p, eval_macro_r = 0, 0, 0, 0 raw_score = [] predicted_labels = [] predicted_prob = [] gold_labels = [] nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids, claim_input_ids, claim_input_mask, claim_segment_ids, claim_label_ids, opp_input_ids, opp_input_mask, opp_segment_ids, opp_label_ids, opp_claim_input_ids, opp_claim_input_mask, opp_claim_segment_ids, opp_claim_label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) claim_input_ids = claim_input_ids.to(device) claim_input_mask = claim_input_mask.to(device) claim_segment_ids = claim_segment_ids.to(device) claim_label_ids = claim_label_ids.to(device) opp_input_ids = opp_input_ids.to(device) opp_input_mask = opp_input_mask.to(device) opp_segment_ids = opp_segment_ids.to(device) opp_label_ids = opp_label_ids.to(device) opp_claim_input_ids = opp_claim_input_ids.to(device) opp_claim_input_mask = opp_claim_input_mask.to(device) opp_claim_segment_ids = opp_claim_segment_ids.to(device) opp_claim_label_ids = opp_claim_label_ids.to(device) # print("start") # print(input_ids) # print(input_mask) # print(segment_ids) # print(label_ids) # print(claim_input_ids) # print(claim_input_mask) # print(claim_segment_ids) # print(claim_label_ids) # print("end") with torch.no_grad(): tmp_eval_loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, labels2=claim_label_ids, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, labels3=opp_label_ids, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask, labels4=opp_claim_label_ids) logits = 
model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask) predicted_prob.extend( torch.nn.functional.softmax(logits, dim=1)) # logits_grid = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, input_ids2=claim_input_ids, token_type_ids2=claim_segment_ids, attention_mask2=claim_input_mask, input_ids3=opp_input_ids, token_type_ids3=opp_segment_ids, attention_mask3=opp_input_mask, input_ids4=opp_claim_input_ids, token_type_ids4=opp_claim_segment_ids, attention_mask4=opp_claim_input_mask) # print(logits) # print(logits[0]) logits = logits.detach().cpu().numpy() # print(logits) label_ids = label_ids.to('cpu').numpy() # print(label_ids) tmp_eval_accuracy = accuracy(logits, label_ids) tmp_predicted = np.argmax(logits, axis=1) predicted_labels.extend(tmp_predicted.tolist()) gold_labels.extend(label_ids.tolist()) # Micro F1 (aggregated tp, fp, fn counts across all examples) tmp_tp, tmp_pred_c, tmp_gold_c = tp_pcount_gcount( logits, label_ids) eval_tp += tmp_tp eval_pred_c += tmp_pred_c eval_gold_c += tmp_gold_c pred_label = np.argmax(logits, axis=1) raw_score += zip(logits, pred_label, label_ids) # Macro F1 (averaged P, R across mini batches) tmp_eval_p, tmp_eval_r, tmp_eval_f1 = p_r_f1(logits, label_ids) eval_macro_p += tmp_eval_p eval_macro_r += tmp_eval_r eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 # Micro F1 (aggregated tp, fp, fn counts across all examples) eval_micro_p = eval_tp / eval_pred_c eval_micro_r = eval_tp / eval_gold_c eval_micro_f1 = 2 * eval_micro_p * eval_micro_r / (eval_micro_p + eval_micro_r) # Macro F1 (averaged P, R across mini batches) eval_macro_p = eval_macro_p / nb_eval_steps eval_macro_r = eval_macro_r / nb_eval_steps eval_macro_f1 = 2 * eval_macro_p * eval_macro_r / (eval_macro_p + eval_macro_r) eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'eval_micro_p': eval_micro_p, 'eval_micro_r': eval_micro_r, 'eval_micro_f1': eval_micro_f1, 'eval_macro_p': eval_macro_p, 'eval_macro_r': eval_macro_r, 'eval_macro_f1': eval_macro_f1, # 'global_step': global_step, # 'loss': tr_loss/nb_tr_steps } output_eval_file = os.path.join( output_dir, "fuse_cosloss_1033033033_2e5_neg_siamese_bert_epoch50_eval_results.txt" ) output_raw_score = os.path.join( output_dir, "fuse_cosloss_1033033033_2e5_neg_siamese_bert_epoch50_raw_score.csv" ) # logger.info(classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4)) with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write( classification_report(gold_labels, predicted_labels, target_names=label_list, digits=4)) with open(output_raw_score, 'w') as fout: fields = [ "undermine_score", "support_score", "predict_label", "gold" ] writer = csv.DictWriter(fout, fieldnames=fields) writer.writeheader() for score, pred, gold in raw_score: writer.writerow({ "undermine_score": str(score[0]), "support_score": str(score[1]), 
"predict_label": str(pred), "gold": str(gold) })
def main(): parser = train_opts() args, _ = parser.parse_known_args() label_list = [ "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "X", "[CLS]", "[SEP]" ] num_labels = len(label_list) + 1 # Load features train_features = pd.read_parquet(os.path.join(args.train_feature_dir, "feature.parquet"), engine='pyarrow') input_ids_list = train_features['input_ids'].tolist() input_mask_list = train_features['input_mask'].tolist() segment_ids_list = train_features['segment_ids'].tolist() label_ids_list = train_features['label_ids'].tolist() all_input_ids = torch.tensor(input_ids_list, dtype=torch.long) all_input_mask = torch.tensor(input_mask_list, dtype=torch.long) all_segment_ids = torch.tensor(segment_ids_list, dtype=torch.long) all_label_ids = torch.tensor(label_ids_list, dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not os.path.exists(args.output_model_dir): os.makedirs(args.output_model_dir) num_train_optimization_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_model_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_model_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i: label for i, label in enumerate(label_list, 1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": len(label_list) + 1, "label_map": label_map } json.dump( model_config, open(os.path.join(args.output_model_dir, "model_config.json"), "w")) # Dump data_type.json as a work around until SMT deploys dct = { "Id": "ILearnerDotNet", "Name": "ILearner .NET file", "ShortName": "Model", "Description": "A .NET serialized ILearner", "IsDirectory": False, "Owner": "Microsoft Corporation", "FileExtension": "ilearner", "ContentType": "application/octet-stream", "AllowUpload": False, "AllowPromotion": False, "AllowModelPromotion": True, "AuxiliaryFileExtension": None, "AuxiliaryContentType": None } with open(os.path.join(args.output_model_dir, 'data_type.json'), 'w') as f: json.dump(dct, f) # Dump data.ilearner as a work around until data type design visualization = os.path.join(args.output_model_dir, "data.ilearner") with open(visualization, 'w') as file: file.writelines('{}')
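# Rough sketch of the schedule the fp16 branch above applies by hand via warmup_linear:
# a linear ramp-up over the first `warmup` fraction of steps, then (in the older
# pytorch_pretrained_bert releases this code targets) a linear decay toward zero.
# Treat the exact post-warmup shape as an assumption, not a guarantee of the library.
def warmup_linear_sketch(progress, warmup=0.1):
    """progress: fraction of total optimization steps already taken (0..1)."""
    if warmup <= 0:
        return 1.0
    if progress < warmup:
        return progress / warmup          # linear ramp-up
    return max(0.0, 1.0 - progress)       # linear decay afterwards

base_lr = 5e-5
for step in (0, 5, 10, 50, 90):
    print(step, base_lr * warmup_linear_sketch(step / 100, warmup=0.1))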
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=8, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = {"ner":NerProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) + 1 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) print(train_examples) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels = num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 
0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) label_map = {i : label for i, label in enumerate(label_list,1)} model_config = {"bert_model":args.bert_model,"do_lower":args.do_lower_case,"max_seq_length":args.max_seq_length,"num_labels":len(label_list)+1,"label_map":label_map} json.dump(model_config,open(os.path.join(args.output_dir,"model_config.json"),"w")) # Load a trained model and config that you have fine-tuned else: output_config_file = os.path.join(args.output_dir, CONFIG_NAME) output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) config = BertConfig(output_config_file) model = BertForTokenClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file, map_location='cpu')) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] label_map = {i : label for i, label in enumerate(label_list,1)} for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() input_mask = input_mask.to('cpu').numpy() for i,mask in enumerate(input_mask): temp_1 = [] temp_2 = [] for j, m in enumerate(mask): if j 
== 0: continue if m: if label_map[label_ids[i][j]] != "X": temp_1.append(label_map[label_ids[i][j]]) temp_2.append(label_map[logits[i][j]]) else: temp_1.pop() temp_2.pop() break y_true.append(temp_1) y_pred.append(temp_2) report = classification_report(y_true, y_pred,digits=4) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") logger.info("\n%s", report) writer.write(report)
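# Simplified, standalone sketch of the alignment step in the evaluation loop above:
# position 0 ([CLS]) and padding are skipped, "X" sub-word positions are ignored, and the
# sequence ends at [SEP] (the script instead pops the trailing [SEP] when it hits padding).
# The label map below is a toy stand-in for the task's real label list.
label_map = {1: "O", 2: "B-PER", 3: "I-PER", 4: "X", 5: "[CLS]", 6: "[SEP]"}

def align_predictions(pred_ids, gold_ids, mask):
    gold, pred = [], []
    for j, m in enumerate(mask):
        if j == 0 or not m:                      # skip [CLS] and padding
            continue
        if label_map[gold_ids[j]] == "[SEP]":
            break
        if label_map[gold_ids[j]] == "X":        # sub-word continuation: keep first piece only
            continue
        gold.append(label_map[gold_ids[j]])
        pred.append(label_map[pred_ids[j]])
    return gold, pred

gold_ids = [5, 2, 4, 3, 6, 1]   # [CLS] B-PER X I-PER [SEP] pad
pred_ids = [5, 2, 4, 1, 6, 1]
mask     = [1, 1, 1, 1, 1, 0]
print(align_predictions(pred_ids, gold_ids, mask))   # (['B-PER', 'I-PER'], ['B-PER', 'O'])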
def main(): logger = logger_factory(log_name=config['model']['arch'], log_dir=config['output']['log_dir']) logger.info(f"seed is {config['train']['seed']}") n_gpu = torch.cuda.device_count() logger.info(f"Cuda device count:{n_gpu}") device = f"cuda: {config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'}" seed_everything(seed=config['train']['seed'], device=device) logger.info('starting to load data from disk') torch.cuda.empty_cache() model_state_dict = None processor = MultiLabelTextProcessor(config['data']['data_path']) label_list, num_labels = load_labels(processor) logger.info(f"Labels loaded. Count: {num_labels}") print(label_list) tokenizer = BertTokenizer.from_pretrained( config['bert']['path'], do_lower_case=config['train']['do_lower_case']) train_examples = None num_train_steps = None if config['train']['do_train']: train_examples = processor.get_train_examples( config['data']['data_path'], logger=logger, size=config['train']['train_size']) num_train_steps = int( len(train_examples) / config['train']['train_batch_size'] / config['train']['gradient_accumulation_steps'] * config['train']['num_train_epochs']) logger.info(f"Training examples:{len(train_examples)}") logger.info(f"Training steps:{num_train_steps}") model = get_model(model_state_dict, num_labels) logger.info(f"fp16: {config['train']['fp16']}") if config['train']['fp16']: model.half() model.to(device) logger.info(f"Model loaded: {config['bert']['path']}") # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if config['train']['fp16']: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=config['train']['learning_rate'], bias_correction=False, max_grad_norm=1.0) if config['train']['loss_scale'] == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=config['train']['loss_scale']) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=config['train']['learning_rate'], warmup=config['train']['warmup_proportion'], t_total=t_total) scheduler = CyclicLR(optimizer, base_lr=2e-5, max_lr=5e-5, step_size=2500, last_batch_iteration=0) eval_examples = processor.get_dev_examples( config['data']['data_path'], filename='training.csv', size=config['train']['val_size']) logger.info(f"Evaluation data loaded. 
Len: {len(eval_examples)}") train_features = convert_examples_to_features( train_examples, label_list, config['train']['max_seq_length'], tokenizer, logger) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", config['train']['train_batch_size']) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=config['train']['train_batch_size']) # Freeze BERT layers for 1 epoch # model.module.freeze_bert_encoder() # fit(1) model.unfreeze_bert_encoder() fit(model, device, n_gpu, optimizer, train_dataloader, logger, t_total, eval_examples, label_list, num_labels, tokenizer) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(config['bert']['cache'], "finetuned_pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) logger.info(f"Model saved! Location: {output_model_file}") if None: # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForMultiLabelSequenceClassification.from_pretrained( config['bert']['path'], num_labels=num_labels, state_dict=model_state_dict) model.to(device) eval(model, device, logger, eval_examples, label_list, num_labels, config['train']['max_seq_length'], tokenizer) result = predict(model, device, config['data']['data_path'], logger, label_list, tokenizer) print(result.shape) result.to_csv(config['data']['data_path'] / 'prediction.csv', index=None)
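# Compact sketch of the optimizer parameter grouping used throughout this file:
# most weights get weight decay, while bias and LayerNorm parameters do not. The toy
# model and AdamW below are placeholders; the scripts above pass equivalent groups to BertAdam.
import torch

def grouped_parameters(model, weight_decay=0.01):
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")
    decay, no_dec = [], []
    for name, param in model.named_parameters():
        (no_dec if any(nd in name for nd in no_decay) else decay).append(param)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_dec, "weight_decay": 0.0},
    ]

toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
optimizer = torch.optim.AdamW(grouped_parameters(toy), lr=5e-5)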
def train(config, model, train_iter, dev_iter): start_time = time.time() if os.path.exists(config.save_path): model.load_state_dict(torch.load(config.save_path)['model_state_dict']) model.train() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate) optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=0.05, t_total=len(train_iter) * config.num_epochs) scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) if os.path.exists(config.save_path): optimizer.load_state_dict( torch.load(config.save_path)['optimizer_state_dict']) total_batch = 0 dev_best_loss = float('inf') dev_last_loss = float('inf') no_improve = 0 flag = False model.train() # plot_model(model, to_file= config.save_dic+'.png') for epoch in range(config.num_epochs): print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs)) for i, (trains, labels) in enumerate(train_iter): outputs = model(trains) model.zero_grad() loss = F.cross_entropy(outputs, labels) loss.backward() optimizer.step() if total_batch % 100 == 0: true = labels.data.cpu() predic = torch.max(outputs.data, 1)[1].cpu() train_acc = metrics.accuracy_score(true, predic) train_loss = loss.item() dev_acc, dev_loss = evaluate(config, model, dev_iter) if dev_loss < dev_best_loss: state = { 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), } dev_best_loss = dev_loss torch.save(state, config.save_dic + str(total_batch) + '.pth') improve = '*' del state else: improve = '' if dev_last_loss > dev_loss: no_improve = 0 elif no_improve % 2 == 0: no_improve += 1 scheduler.step() else: no_improve += 1 dev_last_loss = dev_loss time_dif = get_time_dif(start_time) msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}' print( msg.format(total_batch, train_loss, train_acc, dev_loss, dev_acc, time_dif, improve)) model.train() total_batch += 1 if no_improve > config.require_improvement: print("No optimization for a long time, auto-stopping...") flag = True break if flag: break
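# Stripped-down sketch of the control flow in train() above: track the best validation
# loss (the script checkpoints there), step the ExponentialLR scheduler (gamma=0.5, i.e.
# halve the LR) on every second evaluation without improvement, and stop once the stall
# counter exceeds the patience threshold (config.require_improvement in the original).
import torch

class EarlyStopper:
    def __init__(self, scheduler, patience=3):
        self.scheduler, self.patience = scheduler, patience
        self.best = self.last = float("inf")
        self.stalled = 0

    def step(self, dev_loss):
        if dev_loss < self.best:
            self.best = dev_loss              # the script saves a checkpoint here
        if dev_loss < self.last:
            self.stalled = 0
        else:
            if self.stalled % 2 == 0:
                self.scheduler.step()         # halve the learning rate
            self.stalled += 1
        self.last = dev_loss
        return self.stalled > self.patience   # True -> stop training

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)
stopper = EarlyStopper(scheduler, patience=3)
for dev_loss in [0.9, 0.8, 0.85, 0.86, 0.87, 0.88, 0.90]:
    if stopper.step(dev_loss):
        print("early stop, lr is now", optimizer.param_groups[0]["lr"])
        break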
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--train_file", default=None, type=str, # required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, # required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, # required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() # configuration args.do_train = True args.train_file = "../glue_data/RITS/corpus.txt" args.fp16 = False args.bert_model = "../model/" args.do_lower_case = False args.max_seq_length = 128 args.train_batch_size = 32 args.learning_rate = 3e-5 args.num_train_epochs = 2000.0 args.output_dir = "../model/" if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." ) if not (os.path.exists(args.output_dir) and os.listdir(args.output_dir)): # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir) model_file = os.path.join(args.bert_model, "wiki-ja.model") vocab_file = os.path.join(args.bert_model, "wiki-ja.vocab") if os.path.exists(model_file) and os.path.exists(vocab_file): import tokenization_sentencepiece as tokenization tokenizer = tokenization.FullTokenizer( model_file=model_file, vocab_file=vocab_file, do_lower_case=args.do_lower_case) else: tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) # train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = int( len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if os.path.exists(os.path.join(args.bert_model, "pytorch_model.bin")): logger.info("Loading pretrained model from {}".format( os.path.join(args.bert_model, "pytorch_model.bin"))) model = BertForPreTraining.from_pretrained(args.bert_model) else: logger.info( "Create pretrained model from scratch with config {}".format( os.path.join(args.bert_model, "bert_config.json"))) bert_config = BertConfig(vocab_size_or_config_json_file=os.path.join( args.bert_model, "bert_config.json")) model = BertForPreTraining(config=bert_config) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: # TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 masked_lm_accuracy, next_sentence_accuracy = 0, 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 masked_lm_log_probs, next_sentence_log_probs = model( input_ids, segment_ids, input_mask) lm_label_ids = lm_label_ids.detach().cpu().numpy() is_next = is_next.to('cpu').numpy() masked_lm_log_probs = masked_lm_log_probs.detach().cpu().numpy( ) next_sentence_log_probs = next_sentence_log_probs.detach().cpu( ).numpy() tmp_masked_lm_accuracy = masked_lm_accuracy_fn( masked_lm_log_probs, lm_label_ids) tmp_next_sentence_accuracy = next_sentence_accuracy_fn( next_sentence_log_probs, is_next) masked_lm_accuracy += tmp_masked_lm_accuracy next_sentence_accuracy += tmp_next_sentence_accuracy result = { 'epoch': epoch, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'masked_lm_accuracy': masked_lm_accuracy / nb_tr_steps, 'next_sentence_accuracy': next_sentence_accuracy / nb_tr_examples } logger.info("***** Train results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self num_saved = epoch % 3 output_model_file = os.path.join(args.output_dir, f"pytorch_model-{num_saved}.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) # Save the final trained model logger.info("** ** * Saving 
fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
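# Small illustration of the save pattern used above: unwrap DataParallel /
# DistributedDataParallel via .module before calling state_dict(), so checkpoint keys
# are stored without the "module." prefix and load cleanly into a bare model.
# The output file name below is a placeholder.
import torch

net = torch.nn.Linear(4, 2)
wrapped = torch.nn.DataParallel(net)

print(list(wrapped.state_dict())[:1])        # ['module.weight', ...]
model_to_save = wrapped.module if hasattr(wrapped, "module") else wrapped
print(list(model_to_save.state_dict())[:1])  # ['weight', ...]

torch.save(model_to_save.state_dict(), "pytorch_model.bin")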
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model PYTORCH_PRETRAINED_BERT_CACHE = str(Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', Path.home() / '.pytorch_pretrained_bert'))) model = BertForQuestionAnsweringNew.from_pretrained(args.bert_model, cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model
= DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 if args.do_train: cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) # indices = torch.randperm(len(train_data)) # train_indices = indices[:1000] # train_dataloader = DataLoader(train_data, sampler=SubsetRandomSampler(train_indices), batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) 
# multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForQuestionAnsweringNew(config) model.load_state_dict(torch.load(output_model_file)) else: model = BertForQuestionAnsweringNew.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = 
int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
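# The SQuAD script above caches train_features with pickle behind a bare `except:`, which also
# swallows unrelated errors. Below is a hedged sketch of the same load-or-build pattern with
# narrower exception handling and an atomic write; load_or_build_features and its cache layout
# are illustrative names, not part of this repository, and the builder callback is assumed to
# wrap the script's own convert_examples_to_features(...) call.
import os
import pickle

def load_or_build_features(cache_file, build_fn):
    """Return cached features if the pickle loads cleanly, otherwise build, cache, and return them."""
    try:
        with open(cache_file, "rb") as reader:
            return pickle.load(reader)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        features = build_fn()
        tmp_file = cache_file + ".tmp"
        with open(tmp_file, "wb") as writer:
            pickle.dump(features, writer)
        os.replace(tmp_file, cache_file)  # atomic rename so a crash never leaves a truncated cache
        return features

# Usage in the script would look roughly like:
# train_features = load_or_build_features(cached_train_features_file,
#     lambda: convert_examples_to_features(examples=train_examples, tokenizer=tokenizer, ...))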
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') args.device = device logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = processor.get_train_examples(args.data_dir) cached_train_features_file = os.path.join( args.data_dir, 'train_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), 
str(task_name))) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() ### Example: if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: model = 
BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) ### Evaluation if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) cached_eval_features_file = os.path.join( args.data_dir, 'dev_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name))) try: with open(cached_eval_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving eval features into cached file %s", cached_eval_features_file) with open(cached_eval_features_file, "wb") as writer: pickle.dump(eval_features, writer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data: TensorDataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler( eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key 
in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
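# Every training loop in this file divides the loss by gradient_accumulation_steps and only calls
# optimizer.step()/zero_grad() on accumulation boundaries. The self-contained toy below demonstrates
# just that mechanic; the linear model, random data, and SGD optimizer are stand-ins for the BERT
# models, feature tensors, and BertAdam/FusedAdam used by the scripts.
import torch

gradient_accumulation_steps = 4
model = torch.nn.Linear(8, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fct = torch.nn.CrossEntropyLoss()

global_step = 0
for step in range(16):  # 16 micro-batches -> 4 optimizer updates
    inputs = torch.randn(4, 8)
    labels = torch.randint(0, 2, (4,))
    loss = loss_fct(model(inputs), labels)
    loss = loss / gradient_accumulation_steps  # keep the effective update on the same scale
    loss.backward()  # gradients accumulate in .grad across micro-batches
    if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1
print("optimizer updates:", global_step)  # expected: 4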
param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=LEARNING_RATE, warmup=WARMUP_PROPORTION, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", train_examples_len) logger.info(" Batch size = %d", TRAIN_BATCH_SIZE) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--model_type", default=None, type=str, required=True, help="The name of the model type, gru or transformer.") ## Other parameters parser.add_argument( "--max_src_length", default=400, type=int, help= "The maximum total src sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_tgt_length", default=100, type=int, help= "The maximum total tgt sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run train.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval.") parser.add_argument("--do_infer", action='store_true', help="Whether to run eval.") parser.add_argument("--checkpoint", action='store_true', help="Whether to save checkpoint every epoch.") parser.add_argument("--checkpoint_id", default=-1, type=int, help="the checkpoint to eval or infer") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=20, type=int, help="Total batch size for evaling.") parser.add_argument("--infer_batch_size", default=20, type=int, help="Total batch size for infering.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--infer_max_steps", default=20, type=int, help="max step for inference.") parser.add_argument("--infer_min_steps", default=0, type=int, help="min step for inference.") args = parser.parse_args() # data processor processors = { "giga": GigaProcessor, "cnndm": CNNDMProcessor, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_infer: raise ValueError( "At least one of `do_train`, `do_eval` or `do_infer` must be True." ) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) # instantiate the processor class for this task processor = processors[task_name]() # instantiate the source and target tokenizers src_tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) tgt_tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = Seq2Seq.from_pretrained(args.bert_model, model_type=args.model_type) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) ## do train global_step = 0 nb_tf_steps = 0 tf_loss = 0 if args.do_train: train_features = covert_examples_to_features( examples=train_examples, max_src_length=args.max_src_length, max_tgt_length=args.max_tgt_length, src_tokenizer=src_tokenizer, tgt_tokenizer=tgt_tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_src_ids = torch.tensor([f.src_ids for f in train_features], dtype=torch.long) all_src_mask = torch.tensor([f.src_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_tgt_ids = torch.tensor([f.tgt_ids for f in train_features], dtype=torch.long) all_tgt_mask = torch.tensor([f.tgt_mask for f in train_features], dtype=torch.long) train_data = TensorDataset(all_src_ids, all_src_mask, all_segment_ids, all_tgt_ids, all_tgt_mask) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) batch = batch_sort(batch) src_ids, src_mask, segment_ids, tgt_ids, tgt_mask = batch loss, _, _ = model(src_ids, src_mask, segment_ids, tgt_ids, tgt_mask) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += src_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.checkpoint: model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join( args.output_dir, args.task_name + "_" + args.model_type + "_" + str(epoch) + "_pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) # Save a trained model if args.do_train: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, args.task_name + "_" + args.model_type + "_pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) if args.checkpoint_id == -1: output_model_file = os.path.join( args.output_dir, args.task_name + "_" + args.model_type + "_pytorch_model.bin") else: output_model_file = os.path.join( args.output_dir, args.task_name + "_" + args.model_type + "_" + str(args.checkpoint_id) + "_pytorch_model.bin") # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = Seq2Seq.from_pretrained(args.bert_model, state_dict=model_state_dict, model_type=args.model_type) model.to(device) ## do eval if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = covert_examples_to_features( examples=eval_examples, max_src_length=args.max_src_length, max_tgt_length=args.max_tgt_length, src_tokenizer=src_tokenizer, tgt_tokenizer=tgt_tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_src_ids = torch.tensor([f.src_ids for f in eval_features], dtype=torch.long) all_src_mask = torch.tensor([f.src_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_tgt_ids = torch.tensor([f.tgt_ids for f in eval_features], dtype=torch.long) all_tgt_mask = torch.tensor([f.tgt_mask for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_src_ids, all_src_mask, all_segment_ids, all_tgt_ids, all_tgt_mask) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_rouge = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) batch = batch_sort(batch) src_ids, src_mask, segment_ids, tgt_ids, tgt_mask = batch with torch.no_grad(): tmp_eval_loss, _, _ = model(src_ids, src_mask, segment_ids, tgt_ids, tgt_mask) # print(tmp_eval_loss) tgt_ids = tgt_ids.to('cpu').numpy() tmp_eval_rouge = rouge() eval_loss += tmp_eval_loss.mean().item() eval_rouge += tmp_eval_rouge nb_eval_examples += src_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_rouge = eval_rouge / nb_eval_examples result = { 'eval_loss': 
eval_loss, 'eval_rouge': eval_rouge, 'global_step': global_step, } output_eval_file = os.path.join( args.output_dir, str(args.checkpoint_id) + "_eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) ## do infer if args.do_infer and (args.local_rank == -1 or torch.distributed.get_rank() == 0): final_output = [] infer_examples = processor.get_test_examples(args.data_dir) infer_features = covert_examples_to_features( examples=infer_examples, max_src_length=args.max_src_length, max_tgt_length=args.max_tgt_length, src_tokenizer=src_tokenizer, tgt_tokenizer=tgt_tokenizer) logger.info("***** Running inference *****") logger.info(" Num examples = %d", len(infer_examples)) logger.info(" Batch size = %d", args.infer_batch_size) all_src_ids = torch.tensor([f.src_ids for f in infer_features], dtype=torch.long) all_src_mask = torch.tensor([f.src_mask for f in infer_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in infer_features], dtype=torch.long) all_tgt_ids = torch.tensor([f.tgt_ids for f in infer_features], dtype=torch.long) all_tgt_mask = torch.tensor([f.tgt_mask for f in infer_features], dtype=torch.long) infer_data = TensorDataset(all_src_ids, all_src_mask, all_segment_ids, all_tgt_ids, all_tgt_mask) infer_sampler = SequentialSampler(infer_data) infer_dataloader = DataLoader(infer_data, sampler=infer_sampler, batch_size=args.infer_batch_size) model.eval() eval_loss, eval_rouge = 0, 0 eval_loss, eval_rouge = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(infer_dataloader, desc="infering"): batch = tuple(t.to(device) for t in batch) batch = batch_sort(batch) src_ids, src_mask, segment_ids, tgt_ids, tgt_mask = batch with torch.no_grad(): src_ids = src_ids.transpose(0, 1).unsqueeze(2) tgt_ids = tgt_ids.transpose(0, 1).unsqueeze(2) lengths = src_mask.sum(1) max_src_length = src_ids.size(0) max_tgt_length = tgt_ids.size(0) enc_state, memory_bank, lengths = model.encoder( src_ids, lengths) all_decoder_outputs = torch.zeros(args.infer_batch_size, args.infer_max_steps) all_attention_outputs = torch.zeros(args.infer_max_steps, args.infer_batch_size, max_src_length) all_decoder_outputs = all_decoder_outputs.to(device) all_attention_outputs = all_attention_outputs.to(device) model.decoder.init_state(src_ids, memory_bank, enc_state) decoder_input = torch.LongTensor([101] * args.infer_batch_size) decoder_input = decoder_input.to(device) decoder_input = decoder_input.unsqueeze(0) decoder_input = decoder_input.unsqueeze(2) for step in range(args.infer_max_steps): dec_out, dec_attn = model.decoder(decoder_input, memory_bank, memory_lengths=lengths, step=step) logits = model.generator(dec_out) if step + 1 < args.infer_min_steps: for i in range(logits.size(1)): logits[0][i][102] = -1e20 prob, idx = torch.max(logits, 2) decoder_input = idx.unsqueeze(2) all_decoder_outputs[:, step] = idx.squeeze(0) # all_attention_outputs[step, :, :] = dec_attn.squeeze(0) src_ids = src_ids.squeeze(2).transpose(0, 1) tgt_ids = tgt_ids.squeeze(2).transpose(0, 1) src_ids = src_ids.cpu().int().detach().numpy() tgt_ids = tgt_ids.cpu().int().detach().numpy() all_decoder_outputs = all_decoder_outputs.cpu().int().detach( ).numpy() for i in range(args.infer_batch_size): src_text = src_tokenizer.convert_ids_to_tokens(src_ids[i]) tgt_text = tgt_tokenizer.convert_ids_to_tokens( all_decoder_outputs[i]) ref_text = 
tgt_tokenizer.convert_ids_to_tokens(tgt_ids[i]) final_output.append((tgt_text, ref_text)) # write out the inference results output_infer_file = os.path.join( args.output_dir, str(args.checkpoint_id) + "_infer_results.txt") with open(output_infer_file, 'w', encoding='utf8') as writer: for line1, line2 in final_output: writer.write(' '.join(line1) + '\t' + ' '.join(line2) + '\n') print('inference finished')
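# The greedy decode loop above seeds the decoder with token id 101 and suppresses id 102 until
# infer_min_steps is reached. In the standard BERT vocabularies these ids are [CLS] and [SEP], so the
# script appears to reuse them as start- and end-of-sequence markers. The quick check below confirms
# the ids; it assumes pytorch_pretrained_bert is installed (as elsewhere in this file) and that the
# vocabulary can be downloaded.
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print(tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]']))  # expected: [101, 102]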
def main(bert_model='bert-base-chinese', cache_dir='/tmp/data/', \ max_seq=128, batch_size=32, num_epochs=10, lr=2e-5): processor = Processor() train_examples = processor.get_train_examples('data/hotel') label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) model = BertClassification.from_pretrained(bert_model, \ cache_dir=cache_dir,num_labels=len(label_list)) # model = BertTextCNN.from_pretrained(bert_model,\ # cache_dir=cache_dir,num_labels=len(label_list)) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params':[p for n,p in param_optimizer if not \ any(nd in n for nd in no_decay)],'weight_decay':0.01}, {'params':[p for n,p in param_optimizer if \ any(nd in n for nd in no_decay)],'weight_decay':0.00}] print('train...') num_train_steps = int(len(train_examples) / batch_size * num_epochs) optimizer = BertAdam(optimizer_grouped_parameters, lr=lr, warmup=0.1, t_total=num_train_steps) train_features = convert_examples_to_features(train_examples, label_list, max_seq, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) model.train() for _ in trange(num_epochs, desc='Epoch'): tr_loss = 0 for step, batch in enumerate(tqdm(train_dataloader, desc='Iteration')): input_ids, input_mask, label_ids = tuple( t.to(device) for t in batch) loss = model(input_ids, input_mask, label_ids) loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() print('tr_loss', tr_loss) print('eval...') eval_examples = processor.get_dev_examples('data/hotel') eval_features = convert_examples_to_features(eval_examples, label_list, max_seq, tokenizer) eval_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) eval_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) eval_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_label_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size) model.eval() preds = [] for batch in tqdm(eval_dataloader, desc='Evaluating'): input_ids, input_mask, label_ids = tuple(t.to(device) for t in batch) with torch.no_grad(): logits = model(input_ids, input_mask, None) preds.append(logits.detach().cpu().numpy()) preds = np.argmax(np.vstack(preds), axis=1) print(compute_metrics(preds, eval_label_ids.numpy())) torch.save(model, 'data/cache/model')
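# Unlike the other scripts, this one ends with torch.save(model, 'data/cache/model'), pickling the
# whole module rather than a state_dict. A hedged sketch of reloading it for inference: unpickling
# requires the BertClassification class (and its module path) to be importable, and the path below is
# simply the one the script wrote to.
import torch

model = torch.load('data/cache/model', map_location='cpu')
model.eval()

# The more portable pattern used by the other scripts would be to save the state_dict instead, e.g.:
# torch.save(model.state_dict(), 'data/cache/model_state.bin')
# model = BertClassification.from_pretrained(bert_model, num_labels=len(label_list))
# model.load_state_dict(torch.load('data/cache/model_state.bin'))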