def prepare_model_and_optimizer(self):
    # Prepare model
    self.config = BertConfig.from_json_file(self.args.config_file)

    # Padding for divisibility by 8
    if self.config.vocab_size % 8 != 0:
        self.config.vocab_size += 8 - (self.config.vocab_size % 8)
    self.model = BertForPreTraining(self.config)
    self.another_model = BertForPreTraining(self.config)
    self.model.to(self.device)
    self.another_model.to(self.device)

    param_optimizer = list(self.model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    optimizer_grouped_parameters = []
    names = []
    for n, p in param_optimizer:
        if not any(nd in n for nd in no_decay):
            optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.01, 'name': n})
            names.append({'params': [n], 'weight_decay': 0.01})
        else:
            optimizer_grouped_parameters.append({'params': [p], 'weight_decay': 0.00, 'name': n})
            names.append({'params': [n], 'weight_decay': 0.00})

    if self.args.phase2:
        max_steps = self.args.max_steps
        tmp = max_steps * 10
        r = self.args.phase1_end_step / tmp
        lr = self.args.learning_rate * (1 - r)
    else:
        max_steps = int(self.args.max_steps / 9 * 10)
        lr = self.args.learning_rate

    if self.args.optimizer == "lamb":
        self.optimizer = BertLAMB(optimizer_grouped_parameters,
                                  lr=lr,
                                  warmup=self.args.warmup_proportion if not self.args.phase2 else -1,
                                  t_total=max_steps)
    elif self.args.optimizer == "adam":
        self.optimizer = BertAdam(optimizer_grouped_parameters,
                                  lr=lr,
                                  warmup=self.args.warmup_proportion if not self.args.phase2 else -1,
                                  t_total=max_steps)
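# A small numeric check of the phase-2 branch above. The values below are
# illustrative assumptions (not taken from the source): phase 2 resumes the
# phase-1 decay, so the starting lr is scaled by the fraction of the combined
# schedule that phase 1 already consumed.
max_steps = 1563          # phase-2 steps (assumed)
phase1_end_step = 7038    # steps already run in phase 1 (assumed)
learning_rate = 4e-3      # base learning rate (assumed)

r = phase1_end_step / (max_steps * 10)
lr = learning_rate * (1 - r)
print(lr)  # ~2.2e-3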
def test_adam(self):
    w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
    target = torch.tensor([0.4, 0.2, -0.5])
    criterion = torch.nn.MSELoss()
    # No warmup, constant schedule, no gradient clipping
    optimizer = BertAdam(params=[w], lr=2e-1, weight_decay=0.0, max_grad_norm=-1)
    for _ in range(100):
        loss = criterion(w, target)
        loss.backward()
        optimizer.step()
        w.grad.detach_()  # No zero_grad() function on simple tensors. We do it ourselves.
        w.grad.zero_()
    self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2)
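# The test above relies on an assertListAlmostEqual helper defined on the test
# case but not shown here. A minimal sketch consistent with the call above
# (the class name and body are assumptions, only the signature is implied by
# the usage):
import unittest

class OptimizationTest(unittest.TestCase):
    def assertListAlmostEqual(self, list1, list2, tol):
        # Element-wise comparison with an absolute tolerance.
        self.assertEqual(len(list1), len(list2))
        for a, b in zip(list1, list2):
            self.assertAlmostEqual(a, b, delta=tol)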
def set_optimizer(self, total_step, use_bert=True):
    if use_bert:
        self.optimizer = BertAdam(params=self.parameters(), lr=self.config['lr'],
                                  warmup=0.1, t_total=total_step)
    else:
        self.optimizer = Adam(self.parameters(), lr=self.config['lr'])
def set_optimizer(self, total_step, use_bert=False):
    # Store the optimizer on a separate attribute; assigning to
    # `self.set_optimizer` would shadow this method after the first call.
    if use_bert:
        self.optimizer = BertAdam(params=self.parameters(), lr=self.lr,
                                  warmup=0.1, t_total=total_step)
    else:
        self.optimizer = Adam(self.parameters(), lr=self.lr)
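# A minimal usage sketch for the set_optimizer variants above. MyModel, the
# loader, and the step count are illustrative assumptions, not from the source:
model = MyModel(config)                                # hypothetical module defining set_optimizer
model.set_optimizer(total_step=10000, use_bert=True)   # BertAdam with 10% linear warmup
for batch in loader:                                   # hypothetical DataLoader
    loss = model(batch)
    loss.backward()
    model.optimizer.step()
    model.optimizer.zero_grad()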
def tpu_training_loop(model, loader, device, context):
    """ Called by torch_xla_py.data_parallel. This function is executed on each core of the TPU once per epoch"""
    model.zero_grad()

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        # The decay/no-decay split has to test substrings of the parameter names;
        # a plain membership test against full names such as
        # `bert.encoder.layer.0.attention.output.LayerNorm.bias` would never match.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]

    optimizer = context.getattr_or(
        'optimizer',
        BertAdam(optimizer_grouped_parameters,
                 lr=args.learning_rate,
                 warmup=args.warmup_proportion,
                 t_total=num_train_steps))

    tr_loss = None
    pbar = None
    if str(pbar_device) == str(device):
        pbar = tqdm(total=int(pbar_steps), desc="training", dynamic_ncols=True)

    tracker = tpu_xm.RateTracker()

    model.train()
    for step, batch in enumerate(loader):
        input_ids, input_mask, segment_ids, label_ids, pos_ids = batch
        loss, _ = model(input_ids, segment_ids, input_mask, label_ids, pos_ids=pos_ids)
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps
        loss.backward()
        tracker.add(args.train_batch_size)
        tr_loss = loss * args.gradient_accumulation_steps if step == 0 \
            else tr_loss + loss * args.gradient_accumulation_steps
        if pbar is not None:
            pbar.update(1)
        tpu_xm.optimizer_step(optimizer)  # optimizer.step()
        optimizer.zero_grad()
    return tr_loss.item() / step
def getOptimizer(args, model, train_steps):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    p0 = [p for n, p in param_optimizer if any(nd in n for nd in no_decay)]
    p1 = [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)]
    optimizer_grouped_parameters = [
        {'params': p1, 'weight_decay': 0.01},
        {'params': p0, 'weight_decay': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=train_steps)
    return optimizer
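# A minimal call-site sketch for getOptimizer. The hyperparameter values,
# the model, and the step count are illustrative assumptions, assuming the
# pytorch_pretrained_bert package that also provides BertAdam:
from argparse import Namespace
from pytorch_pretrained_bert import BertModel

args = Namespace(learning_rate=5e-5, warmup_proportion=0.1)  # illustrative values
model = BertModel.from_pretrained("bert-base-uncased")
train_steps = 1000  # total optimizer updates over all epochs (assumed)

optimizer = getOptimizer(args, model, train_steps)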
def prep_optimizer(args, model, num_train_optimization_steps):
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
    return optimizer
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .csv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint", default=None, type=str, required=True,
                        help="The checkpoint file from pretraining")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1.0, type=float,
                        help="Total number of training steps to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        print("WARNING: Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training=True)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForMultipleChoice.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)),
        num_choices=4)
    model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grad that breaks apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                # Terminate early for benchmarking
                if args.max_steps > 0 and global_step > args.max_steps:
                    break

                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForMultipleChoice(config, num_choices=4)
        # noinspection PyUnresolvedReferences
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=4)
        model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'), strict=False)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_swag_examples(os.path.join(args.data_dir, 'val.csv'), is_training=True)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
        all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
        all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
        all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        # noinspection PyUnboundLocalVariable
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': tr_loss / nb_tr_steps}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
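# The evaluation loop above calls an accuracy(logits, label_ids) helper that is
# not shown. A minimal sketch consistent with how it is used (numpy arrays in,
# per-batch count of correct predictions out, later divided by nb_eval_examples):
import numpy as np

def accuracy(out, labels):
    # Number of correct argmax predictions in the batch.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)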
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default='pretrained/bert-base-uncased', type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default='tasks/QuestionAnswering/squad_output', type=str, required=False,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default='tasks/QuestionAnswering/squad_data/train-v1.1.json', type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default='tasks/QuestionAnswering/squad_data/dev-v1.1.json', type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--vocab_size", default=30522, type=int, help="The size of vocabulary.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", default=1, type=int,
                        help="Whether to run training.")  # , action='store_true'
    parser.add_argument("--do_predict", default=1, type=int,
                        help="Whether to run eval on the dev set.")  # , action='store_true'
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=2.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to squad_data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=83, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=1, type=int,  # action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    args.train_batch_size = int(args.train_batch_size)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)
    train_data, dev_data = load_dataset(args)

    # Prepare model
    config = json.load(open(os.path.join(args.bert_model, BERT_CONFIG), "r"))
    model = BertQA(args.vocab_size, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grad that breaks apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    t_total = args.num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if args.do_train:
        criterion = nn.CrossEntropyLoss()
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                logits = model(x=input_ids, segment_info=segment_ids, mask=input_mask)
                logits_start = logits['pred_start']
                logits_end = logits['pred_end']
                ignored_index = logits_start.size(1)
                start_positions.clamp_(0, ignored_index)
                end_positions.clamp_(0, ignored_index)
                loss = (criterion(logits_start, start_positions) + criterion(logits_end, end_positions)) / 2
                if n_gpu > 1:
                    loss = loss.mean()
                loss.backward()
                if (step + 1) % 1 == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model.load(output_model_file)
        model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        # Run prediction for full squad_data
        eval_sampler = SequentialSampler(dev_data)
        eval_dataloader = DataLoader(dev_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                logits = model(x=input_ids, segment_info=segment_ids, mask=input_mask)
                batch_start_logits = logits['pred_start']
                batch_end_logits = logits['pred_end']
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
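# The prediction loop above collects RawResult records. RawResult is not
# defined in this excerpt; a minimal definition consistent with the keyword
# arguments used above (the namedtuple form is an assumption) would be:
import collections

RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])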
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default="../bert_pytorch/tasks/MultipleChoice/swag_data/", type=str,
                        required=False,
                        help="The input squad_data dir. Should contain the .csv files (or other squad_data files) for the task.")
    parser.add_argument("--bert_model", default='converted/base-uncased', type=str, required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default='tasks/MultipleChoice/swag_output/', type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length", default=80, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--vocab_size", default=30522, type=int, help="The size of vocabulary.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=4,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    ###### config setting ######
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    ###### fastNLP.DataSet loading ######
    train_data, dev_data = load_dataset(args)

    ###### model initializing ######
    config = json.load(open(os.path.join(args.bert_model, BERT_CONFIG), "r"))
    model = BertMC(args.vocab_size, num_choices=4, **config)
    model.load(os.path.join(args.bert_model, MODEL_NAME))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    ###### optimizer initializing ######
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.a_2', 'LayerNorm.b_2']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    t_total = args.num_train_steps
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=t_total)

    global_step = 0
    if args.do_train:
        criterion = nn.CrossEntropyLoss()
        train_dataloader = DataLoader(train_data,
                                      sampler=RandomSampler(train_data),
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(x=input_ids, segment_info=segment_ids, mask=input_mask)['pred']
                loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # modify learning rate with special warm up BERT uses
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / t_total, args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model.load(output_model_file)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_dataloader = DataLoader(dev_data,
                                     sampler=SequentialSampler(dev_data),
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                # TODO
                logits = model(x=input_ids, segment_info=segment_ids, mask=input_mask)['pred']
                tmp_eval_loss = criterion(logits, label_ids)
                if n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory", action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_file)
        train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length,
                                    corpus_lines=None, on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("***** Saving fine-tuned model *****")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        if args.do_train:
            torch.save(model_to_save.state_dict(), output_model_file)
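# Several of the fp16 branches above rescale the learning rate manually with a
# warmup_linear helper instead of relying on BertAdam's built-in schedule.
# A sketch of the linear warmup / linear decay schedule these scripts assume
# (the exact form in the optimization module may differ slightly):
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed (global_step / t_total).
    # Ramp up linearly during the warmup phase, then decay linearly toward 0.
    if x < warmup:
        return x / warmup
    return 1.0 - x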
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='./data', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-uncased', type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--pre_training_path", default='./pre_training', type=str, help="model pre training")
    parser.add_argument("--save_path", default='./output', type=str, help="model save path")
    parser.add_argument("--ngpu", default=1, type=int, help="use gpu number")
    parser.add_argument("--load_model", default=False, action='store_true', help="model load")
    parser.add_argument("--save_model", default=False, action='store_true', help="model save")
    parser.add_argument("--load_path", default='./output', type=str, help="model save path")
    parser.add_argument("--is_test", default='./output', type=str, help="model save path")
    parser.add_argument("--task_name", default='cloth', type=str, help="The name of the task to train.")
    parser.add_argument("--output_dir", default='EXP/', type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.")
    parser.add_argument("--cache_size", default=256, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--num_log_steps", default=10, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--optimize_on_cpu', default=False, action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
    args = parser.parse_args()

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    suffix = time.strftime('%Y%m%d-%H%M%S')
    args.output_dir = os.path.join(args.output_dir, suffix)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    bert_list = []
    model_list = []
    for m in args.bert_model.split('+'):
        bert_list.append(m)
        model_list.append(chose_model_model(m, args))

    logging = get_logger(os.path.join(args.output_dir, 'log.txt'))

    data_file = []
    for m in bert_list:
        data_file.append({'train': 'train', 'valid': 'dev', 'test': 'test'})
        for key in data_file[-1].keys():
            data_file[-1][key] = data_file[-1][key] + '-' + m + '.pt'

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logging("16-bits training currently not supported in distributed training")
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logging("device {} n_gpu {} distributed training {}".format(device, n_gpu, bool(args.local_rank != -1)))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    task_name = args.task_name.lower()

    num_train_steps = []
    train_data = []
    if args.do_train:
        for id, m in enumerate(bert_list):
            train_data.append(data_utils.Loader(args.data_dir, data_file[id]['train'],
                                                args.cache_size, args.train_batch_size, device))
            num_train_steps.append(int(
                train_data[-1].data_num / args.train_batch_size /
                args.gradient_accumulation_steps * args.num_train_epochs))

    # Prepare model
    # model = RobertaForCloze.from_pretrained("roberta-base",
    #     cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank),
    #     #proxies={ "socks":"127.0.0.1:1080",}
    # tokenizer = chose_model_token(args.bert_model,args)
    # model.resize_token_embeddings(len(tokenizer))
    model = torch.load()  # FIXME: the checkpoint path is missing here and must be supplied.

    if args.fp16:
        for id, model in enumerate(model_list):
            model_list[id].half()
    for id, model in enumerate(model_list):
        model_list[id].to(device)
    if args.local_rank != -1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.parallel.DistributedDataParallel(model,
                                                                       device_ids=[args.local_rank],
                                                                       output_device=args.local_rank)
    elif n_gpu > 1:
        for id, model in enumerate(model_list):
            model_list[id] = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = []
    if args.fp16:
        for model in model_list:
            # Materialize the CPU/float copies as a list so they can be reused.
            param_optimizer.append([(n, param.clone().detach().to('cpu').float().requires_grad_())
                                    for n, param in model.named_parameters()])
    elif args.optimize_on_cpu:
        for model in model_list:
            param_optimizer.append([(n, param.clone().detach().to('cpu').requires_grad_())
                                    for n, param in model.named_parameters()])
    else:
        for model in model_list:
            param_optimizer.append(list(model.named_parameters()))

    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = []
    for p_o in param_optimizer:
        optimizer_grouped_parameters.append([
            {'params': [p for n, p in p_o if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in p_o if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ])

    global_step = 0

    if args.load_model:
        if args.ngpu > 1:
            for id, m in enumerate(bert_list):
                print(' model is loading...... PATH:' + m + '/' + m + '_' + str(args.ngpu) + '.bin')
                model_list[id] = torch.load(args.load_path + '/' + m + '_' + str(args.ngpu) + '.bin')
        else:
            for id, m in enumerate(bert_list):
                print(' model is loading...... PATH:' + args.load_path + '/' + m + '.bin')
                model_list[id] = torch.load(args.load_path + '/' + m + '.bin')

    if args.do_train:
        for id, model in enumerate(model_list):
            start = time.time()
            logging("***** Running training *****")
            logging("  Batch size = {}".format(args.train_batch_size))
            logging("  Num steps = {}".format(num_train_steps[id]))

            model.train()
            loss_history = []
            acc_history = []
            t_total = num_train_steps[id]
            if args.local_rank != -1:
                t_total = t_total // torch.distributed.get_world_size()
            optimizer = BertAdam(optimizer_grouped_parameters[id],
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=t_total)

            for _ in range(int(args.num_train_epochs)):
                tr_loss = 0
                tr_acc = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for inp, tgt in train_data[id].data_iter():
                    loss, acc = model(inp, tgt)
                    if n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                        acc = acc.sum()
                    if args.fp16 and args.loss_scale != 1.0:
                        # rescale loss for fp16 training
                        # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                        loss = loss * args.loss_scale
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps
                    loss.backward()

                    tr_loss += loss.item()
                    tr_acc += acc.item()
                    nb_tr_examples += inp[-1].sum()
                    nb_tr_steps += 1

                    if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16 or args.optimize_on_cpu:
                            if args.fp16 and args.loss_scale != 1.0:
                                # scale down gradients for fp16 training
                                for param in model.parameters():
                                    if param.grad is not None:
                                        param.grad.data = param.grad.data / args.loss_scale
                            is_nan = set_optimizer_params_grad(param_optimizer, model.named_parameters(),
                                                               test_nan=True)
                            if is_nan:
                                logging("FP16 TRAINING: Nan in gradients, reducing loss scaling")
                                args.loss_scale = args.loss_scale / 2
                                model.zero_grad()
                                continue
                            optimizer.step()
                            copy_optimizer_params_to_model(model.named_parameters(), param_optimizer)
                        else:
                            optimizer.step()
                        model.zero_grad()
                        global_step += 1

                    if global_step % args.num_log_steps == 0:
                        logging('step: {} | train loss: {} | train acc {}'.format(
                            global_step, tr_loss / nb_tr_examples, tr_acc / nb_tr_examples))
                        loss_history.append([global_step, tr_loss])
                        acc_history.append([global_step, tr_acc])
                        tr_loss = 0
                        tr_acc = 0
                        nb_tr_examples = 0

            save_history_path = "./Cord_Pic"
            end = time.time()
            print(end - start)
            loss_history = np.array(loss_history)
            acc_history = np.array(acc_history)
            np.save(save_history_path + '/' + bert_list[id] + '.loss_history.npy', loss_history)  # save as .npy
            np.save(save_history_path + '/' + bert_list[id] + '.acc_history.npy', acc_history)    # save as .npy
            # To read them back:
            # a = np.load('a.npy')
            # a = a.tolist()

            if args.save_model:
                if args.ngpu > 1:
                    print(' model is saving...... PATH:' + args.load_path + '/' + bert_list[id] + '_' +
                          str(args.ngpu) + '.bin')
                    torch.save(model, args.load_path + '/' + bert_list[id] + '_' + str(args.ngpu) + '.bin')
                else:
                    print(' model is saving...... PATH:' + args.load_path + '/' + bert_list[id] + '.bin')
                    torch.save(model, args.load_path + '/' + bert_list[id] + '.bin')

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        logging("***** Running evaluation *****")
        logging("  Batch size = {}".format(args.eval_batch_size))
        valid_data = []
        for id, m in enumerate(bert_list):
            valid_data.append(data_utils.Loader(args.data_dir, data_file[id]['valid'],
                                                args.cache_size, args.eval_batch_size, device))
        # Run prediction for full data
        for id, model in enumerate(model_list):
            out = []
            for inp, tgt in valid_data[id].data_iter(shuffle=False):
                with torch.no_grad():
                    one_out = model(inp, tgt)
                    out.append(one_out)
            output = torch.tensor([valid_data[id].data_num, 4])
            for batch in range(int(valid_data[id].data_num / args.eval_batch_size)):
                output[batch * args.eval_batch_size : (batch + 1) * args.eval_batch_size] = out[batch]
            torch.save(output, bert_list[id] + '_out.pt')
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--word_embedding_file", default='emb/crawl-300d-2M.vec', type=str,
                        help="The input directory of word embeddings.")
    parser.add_argument("--index_path", default='emb/p_index.bin', type=str,
                        help="The input directory of word embedding index.")
    parser.add_argument("--word_embedding_info", default='emb/vocab_info.txt', type=str,
                        help="The input directory of word embedding info.")
    parser.add_argument("--data_file", default='', type=str, help="The input directory of input data file.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--max_ngram_length", default=16, type=int,
                        help="The maximum total ngram sequence")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--embedding_size", default=300, type=int, help="Total batch size for embeddings.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--num_eval_epochs', type=int, default=0,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--single', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    logger.info("loading embeddings ...")
    if args.do_train:
        emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(args.word_embedding_file)
        write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)
    if args.do_eval:
        emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info)
        #emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors(args.word_embedding_file)
        #write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list)

    logger.info("loading p index ...")
    if not os.path.exists(args.index_path):
        p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec, args.index_path)
    else:
        p = load_embedding_index(args.index_path, emb_vocab_size, num_dim=args.embedding_size)

    train_examples = None
    num_train_optimization_steps = None
    w2i, i2w, vocab_size = {}, {}, 1
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

        train_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_train(
            train_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, emb_dict)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Num token vocab = %d", vocab_size)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_ngram_ids = torch.tensor([f.ngram_ids for f in train_features], dtype=torch.long)
        all_ngram_labels = torch.tensor([f.ngram_labels for f in train_features], dtype=torch.long)
        all_ngram_masks = torch.tensor([f.ngram_masks for f in train_features], dtype=torch.long)
        all_ngram_embeddings = torch.tensor([f.ngram_embeddings for f in train_features], dtype=torch.float)

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = BertForNgramClassification.from_pretrained(
        args.bert_model,
        cache_dir=cache_dir,
        num_labels=num_labels,
        embedding_size=args.embedding_size,
        max_seq_length=args.max_seq_length,
        max_ngram_length=args.max_ngram_length)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 #if args.do_train: train_data = TensorDataset(all_ngram_ids, all_ngram_labels, all_ngram_masks, all_ngram_embeddings) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ind in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) ngram_ids, ngram_labels, ngram_masks, ngram_embeddings = batch loss = model(ngram_ids, ngram_masks, ngram_embeddings) if n_gpu > 1: loss = loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'loss': loss, } output_eval_file = os.path.join(args.output_dir, "train_results.txt") with open(output_eval_file, "a") as writer: #logger.info("***** Training results *****") writer.write("epoch" + str(ind) + '\n') for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('\n') model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "epoch" + str(ind) + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_gnrt_dev_examples(args.data_file) eval_features, w2i, i2w, vocab_size = convert_examples_to_features_gnrt_eval( eval_examples, label_list, args.max_seq_length, args.max_ngram_length, tokenizer, w2i, i2w, vocab_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Num token vocab = %d", vocab_size) logger.info(" Batch size = %d", args.eval_batch_size) all_token_ids = torch.tensor([f.token_ids for f in eval_features], dtype=torch.long) # all_flaw_labels: indexes of wrong words predicted by disc all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features], dtype=torch.long) all_ngram_ids = torch.tensor([f.ngram_ids for f in eval_features], dtype=torch.long) all_ngram_mask = torch.tensor([f.ngram_mask for f in eval_features], dtype=torch.long) all_ngram_labels = torch.tensor( [f.ngram_labels for f in eval_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in eval_features], 
dtype=torch.long) eval_data = TensorDataset(all_token_ids, all_ngram_ids, all_ngram_mask, all_ngram_labels, all_label_id, all_flaw_labels) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) if args.single: eval_range = trange(int(args.num_eval_epochs), int(args.num_eval_epochs + 1), desc="Epoch") else: eval_range = trange(int(args.num_eval_epochs), desc="Epoch") for epoch in eval_range: output_file = os.path.join( args.data_dir, "epoch" + str(epoch) + "gnrt_outputs.tsv") with open(output_file, "w") as csv_file: writer = csv.writer(csv_file, delimiter='\t') writer.writerow(["sentence", "label"]) output_model_file = os.path.join( args.output_dir, "epoch" + str(epoch) + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) config = BertConfig(output_config_file) model = BertForNgramClassification( config, num_labels=num_labels, embedding_size=args.embedding_size, max_seq_length=args.max_seq_length, max_ngram_length=args.max_ngram_length) model.load_state_dict(torch.load(output_model_file)) model.to(device) model.eval() for token_ids, ngram_ids, ngram_mask, ngram_labels, label_id, flaw_labels in tqdm( eval_dataloader, desc="Evaluating"): ngram_ids = ngram_ids.to(device) ngram_mask = ngram_mask.to(device) with torch.no_grad(): logits = model(ngram_ids, ngram_mask) logits = logits.detach().cpu().numpy() flaw_labels = flaw_labels.to('cpu').numpy() label_id = label_id.to('cpu').numpy() token_ids = token_ids.to('cpu').numpy() masks = ngram_mask.to('cpu').numpy() with open(output_file, "a") as csv_file: for i in range(len(label_id)): correct_tokens = look_up_words(logits[i], masks[i], vocab_list, p) token_new = replace_token(token_ids[i], flaw_labels[i], correct_tokens, i2w) token_new = ' '.join(token_new) label = str(label_id[i]) writer = csv.writer(csv_file, delimiter='\t') writer.writerow([token_new, label])
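The evaluation loop above relies on replace_token to splice the corrected words back into the original sentence. A rough, self-contained sketch of that idea follows; replace_token_sketch is a hypothetical stand-in that works on plain string lists instead of the project's token ids and vocab structures.

def replace_token_sketch(tokens, flaw_positions, corrections):
    """Return a copy of `tokens` where each flagged position is replaced by
    its correction; positions without a correction are left untouched."""
    repaired = list(tokens)
    for pos, new_tok in zip(flaw_positions, corrections):
        if 0 <= pos < len(repaired):
            repaired[pos] = new_tok
    return repaired

# Toy example: position 2 was flagged by the discriminator and corrected.
print(replace_token_sketch(["the", "cat", "sta", "on", "the", "mat"], [2], ["sat"]))
# -> ['the', 'cat', 'sat', 'on', 'the', 'mat']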
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--dev_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--dev_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--ks_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--ks_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--ks_dev_src_file", default=None, type=str, help="The input data file name.") parser.add_argument("--ks_dev_tgt_file", default=None, type=str, help="The output data file name.") parser.add_argument("--predict_input_file", default=None, type=str, help="predict_input_file") parser.add_argument("--predict_output_file", default=None, type=str, help="predict_output_file") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--log_dir", default='', type=str, required=True, help="The output directory where the log will be written.") parser.add_argument("--model_recover_path", default=None, type=str, required=True, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") parser.add_argument("--predict_bleu", default=0.5, type=float, help="The Predicted Bleu for KS Predict ") parser.add_argument("--train_vae", action='store_true', help="Whether to train vae.") # Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. 
\n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run ks predict.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--label_smoothing", default=0, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.01, type=float, help="The weight decay rate for Adam.") parser.add_argument("--finetune_decay", action='store_true', help="Weight decay to the original weights.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--hidden_dropout_prob", default=0.1, type=float, help="Dropout rate for hidden states.") parser.add_argument("--attention_probs_dropout_prob", default=0.1, type=float, help="Dropout rate for attention probabilities.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--fp32_embedding', action='store_true', help="Whether to use 32-bit float precision instead of 16-bit for embeddings") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument('--from_scratch', action='store_true', help="Initialize parameters with random values (i.e., training from scratch).") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--new_pos_ids', action='store_true', help="Use new position ids for LMs.") parser.add_argument('--tokenized_input', action='store_true', help="Whether the input is tokenized.") parser.add_argument('--max_len_a', type=int, default=0, help="Truncate_config: maximum length of segment A.") parser.add_argument('--max_len_b', type=int, default=0, help="Truncate_config: maximum length of segment B.") parser.add_argument('--trunc_seg', default='', help="Truncate_config: first truncate segment A/B (option: a, b).") parser.add_argument('--always_truncate_tail', action='store_true', help="Truncate_config: Whether we should always truncate tail.") parser.add_argument("--mask_prob", default=0.15, type=float, help="Number of prediction is sometimes less than max_pred when sequence is short.") parser.add_argument("--mask_prob_eos", default=0, type=float, help="Number of prediction is sometimes less than max_pred when sequence is short.") parser.add_argument('--max_pred', type=int, default=20, help="Max tokens of prediction.") parser.add_argument("--num_workers", default=0, type=int, help="Number of workers for the data loader.") parser.add_argument('--mask_source_words', action='store_true', help="Whether to mask source words for training") parser.add_argument('--skipgram_prb', type=float, default=0.0, help='prob of ngram mask') parser.add_argument('--skipgram_size', type=int, default=1, help='the max size of ngram mask') parser.add_argument('--mask_whole_word', action='store_true', help="Whether masking a whole word.") parser.add_argument('--do_l2r_training', action='store_true', help="Whether to do left to right training") parser.add_argument('--has_sentence_oracle', action='store_true', help="Whether to have sentence level oracle for training. 
" "Only useful for summary generation") parser.add_argument('--max_position_embeddings', type=int, default=None, help="max position embeddings") parser.add_argument('--relax_projection', action='store_true', help="Use different projection layers for tasks.") parser.add_argument('--ffn_type', default=0, type=int, help="0: default mlp; 1: W((Wx+b) elem_prod x);") parser.add_argument('--num_qkv', default=0, type=int, help="Number of different <Q,K,V>.") parser.add_argument('--seg_emb', action='store_true', help="Using segment embedding for self-attention.") parser.add_argument('--s2s_special_token', action='store_true', help="New special tokens ([S2S_SEP]/[S2S_CLS]) of S2S.") parser.add_argument('--s2s_add_segment', action='store_true', help="Additional segmental for the encoder of S2S.") parser.add_argument('--s2s_share_segment', action='store_true', help="Sharing segment embeddings for the encoder of S2S (used with --s2s_add_segment).") parser.add_argument('--pos_shift', action='store_true', help="Using position shift for fine-tuning.") args = parser.parse_args() assert Path(args.model_recover_path).exists( ), "--model_recover_path doesn't exist" args.output_dir = args.output_dir.replace( '[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) args.log_dir = args.log_dir.replace( '[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) handler = logging.FileHandler(os.path.join(args.log_dir, "train.log"), encoding='UTF-8') handler.setLevel(logging.INFO) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) console = logging.StreamHandler() console.setLevel(logging.DEBUG) logger.addHandler(handler) logger.addHandler(console) json.dump(args.__dict__, open(os.path.join( args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs dist.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int( args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) if args.max_position_embeddings: tokenizer.max_len = args.max_position_embeddings data_tokenizer = WhitespaceTokenizer() if args.tokenized_input else tokenizer if args.local_rank == 0: dist.barrier() print("Loading QKR Train Dataset", args.data_dir) bi_uni_pipeline = [seq2seq_loader.Preprocess4Seq2seq(args.max_pred, args.mask_prob, list(tokenizer.vocab.keys( )), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'max_len_a': 
args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail}, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)] file_oracle = None if args.has_sentence_oracle: file_oracle = os.path.join(args.data_dir, 'train.oracle') fn_src = os.path.join( args.data_dir, args.src_file if args.src_file else 'train.src') fn_tgt = os.path.join( args.data_dir, args.tgt_file if args.tgt_file else 'train.tgt') train_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset, replacement=False) _batch_size = args.train_batch_size else: train_sampler = DistributedSampler(train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=_batch_size, sampler=train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) print("Loading KS Train Dataset", args.data_dir) ks_fn_src = os.path.join( args.data_dir, args.ks_src_file) ks_fn_tgt = os.path.join( args.data_dir, args.ks_tgt_file) ks_train_dataset = seq2seq_loader.Seq2SeqDataset( ks_fn_src, ks_fn_tgt, args.train_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: ks_train_sampler = RandomSampler(ks_train_dataset, replacement=False) _batch_size = args.train_batch_size else: ks_train_sampler = DistributedSampler(ks_train_dataset) _batch_size = args.train_batch_size // dist.get_world_size() ks_train_dataloader = torch.utils.data.DataLoader(ks_train_dataset, batch_size=_batch_size, sampler=ks_train_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) logger.info("Loading QKR Eval Dataset from {}".format(args.data_dir)) fn_src = os.path.join( args.data_dir, args.dev_src_file) fn_tgt = os.path.join( args.data_dir, args.dev_tgt_file) dev_reddit_dataset = seq2seq_loader.Seq2SeqDataset( fn_src, fn_tgt, args.eval_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: dev_reddit_sampler = RandomSampler(dev_reddit_dataset, replacement=False) _batch_size = args.eval_batch_size else: dev_reddit_sampler = DistributedSampler(dev_reddit_dataset) _batch_size = args.eval_batch_size // dist.get_world_size() dev_reddit_dataloader = torch.utils.data.DataLoader(dev_reddit_dataset, batch_size=_batch_size, sampler=dev_reddit_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) logger.info("Loading KS Eval Dataset from {}".format(args.data_dir)) ks_dev_fn_src = os.path.join( args.data_dir, args.ks_dev_src_file) ks_dev_fn_tgt = os.path.join( args.data_dir, args.ks_dev_tgt_file) ks_dev_reddit_dataset = seq2seq_loader.Seq2SeqDataset( ks_dev_fn_src, ks_dev_fn_tgt, args.eval_batch_size, data_tokenizer, args.max_seq_length, file_oracle=file_oracle, bi_uni_pipeline=bi_uni_pipeline) if args.local_rank == -1: ks_dev_reddit_sampler = 
RandomSampler(ks_dev_reddit_dataset, replacement=False) _batch_size = args.eval_batch_size else: ks_dev_reddit_sampler = DistributedSampler(ks_dev_reddit_dataset) _batch_size = args.eval_batch_size // dist.get_world_size() ks_dev_reddit_dataloader = torch.utils.data.DataLoader(ks_dev_reddit_dataset, batch_size=_batch_size, sampler=ks_dev_reddit_sampler, num_workers=args.num_workers, collate_fn=seq2seq_loader.batch_list_to_batch_tensors, pin_memory=False) # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) # t_total = int(math.ceil(len(train_dataset.ex_list) / args.train_batch_size) t_total = int(len(train_dataloader) * args.num_train_epochs / args.gradient_accumulation_steps) amp_handle = None if args.fp16 and args.amp: from apex import amp amp_handle = amp.init(enable_caching=True) logger.info("enable fp16 with amp") # Prepare model recover_step = _get_max_epoch_model(args.output_dir) cls_num_labels = 2 type_vocab_size = 6 + \ (1 if args.s2s_add_segment else 0) if args.new_segment_ids else 2 num_sentlvl_labels = 2 if args.has_sentence_oracle else 0 relax_projection = 4 if args.relax_projection else 0 if args.local_rank not in (-1, 0): # Make sure only the first process in distributed training will download model & vocab dist.barrier() if (recover_step is None) and (args.model_recover_path is None): # if _state_dict == {}, the parameters are randomly initialized # if _state_dict == None, the parameters are initialized with bert-init _state_dict = {} if args.from_scratch else None model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=_state_dict, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) global_step = 0 else: if recover_step: logger.info("***** Recover model: %d *****", recover_step) model_recover = torch.load(os.path.join( args.output_dir, "model.{0}.bin".format(recover_step)), map_location='cpu') # recover_step == number of epochs global_step = math.floor( recover_step * t_total / args.num_train_epochs) elif args.model_recover_path: logger.info("***** Recover model: %s *****", args.model_recover_path) model_recover = torch.load( args.model_recover_path, map_location='cpu') global_step = 0 model = BertForPreTrainingLossMask.from_pretrained( args.bert_model, state_dict=model_recover, num_labels=cls_num_labels, num_rel=0, type_vocab_size=type_vocab_size, config_path=args.config_path, task_idx=3, num_sentlvl_labels=num_sentlvl_labels, max_position_embeddings=args.max_position_embeddings, label_smoothing=args.label_smoothing, fp32_embedding=args.fp32_embedding, relax_projection=relax_projection, new_pos_ids=args.new_pos_ids, ffn_type=args.ffn_type, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, num_qkv=args.num_qkv, seg_emb=args.seg_emb) if args.local_rank == 0: dist.barrier() if args.fp16: model.half() if args.fp32_embedding: model.bert.embeddings.word_embeddings.float() model.bert.embeddings.position_embeddings.float() model.bert.embeddings.token_type_embeddings.float() model.to(device) if 
args.local_rank != -1: try: from torch.nn.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("DistributedDataParallel") model = DDP(model, device_ids=[ args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: # model = torch.nn.DataParallel(model) model = DataParallelImbalance(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from optimization_fp16 import FP16_Optimizer_State from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer_State( optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer_State( optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if recover_step: logger.info("***** Recover optimizer: %d *****", recover_step) optim_recover = torch.load(os.path.join( args.output_dir, "optim.{0}.bin".format(recover_step)), map_location='cpu') if hasattr(optim_recover, 'state_dict'): optim_recover = optim_recover.state_dict() optimizer.load_state_dict(optim_recover) if args.loss_scale == 0: logger.info("***** Recover optimizer: dynamic_loss_scale *****") optimizer.dynamic_loss_scale = True logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_train: KL_weight = 0.0 logger.info("***** Running training *****") logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) model.train() if recover_step: start_epoch = recover_step+1 else: start_epoch = 1 for i_epoch in trange(start_epoch, int(args.num_train_epochs)+1, desc="Epoch", disable=args.local_rank not in (-1, 0)): if args.local_rank != -1: train_sampler.set_epoch(i_epoch) step = 0 for batch, ks_batch in zip(train_dataloader,ks_train_dataloader): batch = [ t.to(device) if t is not None else None for t in batch] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx,labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv,labels=labels,ks_labels=ks_labels,train_vae=args.train_vae) if args.train_vae: masked_lm_loss, next_sentence_loss, KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_loss = KL_loss.mean() else: masked_lm_loss, next_sentence_loss, _ = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_weight += 1.0 / float(len(ks_train_dataloader)) if args.train_vae: loss = masked_lm_loss + next_sentence_loss + KL_weight * KL_loss else: loss = masked_lm_loss + next_sentence_loss logger.info("In{}step, masked_lm_loss:{}".format(step, masked_lm_loss)) logger.info("In{}step, KL_weight:{}".format(step, KL_weight)) #logger.info("In{}step, KL_loss:{}".format(step, KL_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step/t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if random.randint(0,0) == 0: ks_batch = [ t.to(device) if t is not None else None for t in ks_batch] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = ks_batch oracle_pos, oracle_weights, oracle_labels = None, None, None loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=labels, ks_labels=ks_labels,train_ks=True,train_vae=args.train_vae) if args.train_vae: ks_loss, KS_KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. ks_loss = ks_loss.mean() KS_KL_loss = KS_KL_loss.mean() loss = ks_loss + KL_weight * KS_KL_loss else: ks_loss, _ = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
ks_loss = ks_loss.mean() loss = ks_loss logger.info("In{}step, ks_loss:{}".format(step, ks_loss)) #logger.info("In{}step, KS_KL_loss:{}".format(step, KS_KL_loss)) logger.info("******************************************* ") # ensure that accumlated gradients are normalized if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: lr_this_step = args.learning_rate * \ warmup_linear(global_step / t_total, args.warmup_proportion) if args.fp16: # modify learning rate with special warm up BERT uses for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 step += 1 if (step + 1) % 200 == 0: logger.info("***** Running QKR evaling *****") logger.info(" Batch size = %d", args.eval_batch_size) if args.local_rank != -1: train_sampler.set_epoch(i_epoch) dev_iter_bar = tqdm(dev_reddit_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) total_lm_loss = 0 total_kl_loss = 0 for qkr_dev_step, batch in enumerate(dev_iter_bar): batch = [ t.to(device) if t is not None else None for t in batch] if args.has_sentence_oracle: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, oracle_pos, oracle_weights, oracle_labels = batch else: input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv,labels=labels,ks_labels=ks_labels,train_vae=args.train_vae) masked_lm_loss, next_sentence_loss, KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
# loss = loss.mean() masked_lm_loss = masked_lm_loss.mean() next_sentence_loss = next_sentence_loss.mean() KL_loss = KL_loss.mean() # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) dev_iter_bar.set_description('Iter (loss=%5.3f)' % masked_lm_loss.item()) total_lm_loss += masked_lm_loss.item() total_kl_loss += KL_loss.item() # ensure that accumlated gradients are normalized total_mean_lm_loss = total_lm_loss /(qkr_dev_step + 1) total_mean_kl_loss = total_kl_loss / (qkr_dev_step + 1) logger.info("** ** * Evaling mean loss ** ** * ") logger.info("In{}epoch,dev_lm_loss:{}".format(i_epoch, total_mean_lm_loss)) logger.info("In{}epoch,dev_kl_loss:{}".format(i_epoch, total_mean_kl_loss)) logger.info("******************************************* ") logger.info("***** Running KS evaling *****") logger.info(" Batch size = %d", args.eval_batch_size) ks_dev_iter_bar = tqdm(ks_dev_reddit_dataloader, desc='Iter (loss=X.XXX)', disable=args.local_rank not in (-1, 0)) total_ks_loss = 0 total_ks_kl_loss = 0 for ks_dev_step, batch in enumerate(ks_dev_iter_bar): batch = [ t.to(device) if t is not None else None for t in batch] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx, labels, ks_labels = batch oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): loss_tuple = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv,labels=labels,ks_labels=ks_labels, train_ks=True,train_vae=args.train_vae) ks_loss, KS_KL_loss = loss_tuple if n_gpu > 1: # mean() to average on multi-gpu. 
# loss = loss.mean() ks_loss = ks_loss.mean() KS_KL_loss = KS_KL_loss.mean() # logging for each step (i.e., before normalization by args.gradient_accumulation_steps) ks_dev_iter_bar.set_description('Iter (loss=%5.3f)' % ks_loss.item()) total_ks_loss += ks_loss.item() total_ks_kl_loss += KS_KL_loss.item() total_mean_ks_loss = total_ks_loss / (ks_dev_step + 1) total_mean_ks_kl_loss = total_ks_kl_loss / (ks_dev_step + 1) logger.info("** ** * Evaling mean loss ** ** * ") logger.info("In{}epoch,dev_ks_loss:{}".format(i_epoch, total_mean_ks_loss)) logger.info("In{}epoch,dev_ks_kl_loss:{}".format(i_epoch, total_mean_ks_kl_loss)) total_mean_loss = total_mean_lm_loss + total_mean_kl_loss + total_mean_ks_loss + total_mean_ks_kl_loss logger.info("In{}epoch,dev_loss:{}".format(i_epoch, total_mean_loss)) logger.info("******************************************* ") # Save a trained model if (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info( "** ** * Saving fine-tuned model and optimizer ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{}_{}_{}.bin".format(i_epoch,step,round(total_mean_loss,4))) torch.save(model_to_save.state_dict(), output_model_file) output_optim_file = os.path.join( args.output_dir, "optim.bin") torch.save(optimizer.state_dict(), output_optim_file) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.do_predict: bi_uni_pipeline = [ seq2seq_loader.Preprocess4Seq2seq_predict(args.max_pred, args.mask_prob, list(tokenizer.vocab.keys( )), tokenizer.convert_tokens_to_ids, args.max_seq_length, new_segment_ids=args.new_segment_ids, truncate_config={'max_len_a': args.max_len_a, 'max_len_b': args.max_len_b, 'trunc_seg': args.trunc_seg, 'always_truncate_tail': args.always_truncate_tail}, mask_source_words=args.mask_source_words, skipgram_prb=args.skipgram_prb, skipgram_size=args.skipgram_size, mask_whole_word=args.mask_whole_word, mode="s2s", has_oracle=args.has_sentence_oracle, num_qkv=args.num_qkv, s2s_special_token=args.s2s_special_token, s2s_add_segment=args.s2s_add_segment, s2s_share_segment=args.s2s_share_segment, pos_shift=args.pos_shift)] next_i = 0 model.eval() with open(os.path.join(args.data_dir, args.predict_input_file), "r", encoding="utf-8") as file: src_file = file.readlines() with open("train_tgt_pad.empty", "r", encoding="utf-8") as file: tgt_file = file.readlines() with open(os.path.join(args.data_dir, args.predict_output_file), "w", encoding="utf-8") as out: while next_i < len(src_file): print(next_i) batch_src = src_file[next_i:next_i + args.eval_batch_size] batch_tgt = tgt_file[next_i:next_i + args.eval_batch_size] next_i += args.eval_batch_size ex_list = [] for src, tgt in zip(batch_src, batch_tgt): src_tk = data_tokenizer.tokenize(src.strip()) tgt_tk = data_tokenizer.tokenize(tgt.strip()) ex_list.append((src_tk, tgt_tk)) batch = [] for idx in range(len(ex_list)): instance = ex_list[idx] for proc in bi_uni_pipeline: instance = proc(instance) batch.append(instance) batch_tensor = seq2seq_loader.batch_list_to_batch_tensors(batch) batch = [ t.to(device) if t is not None else None for t in batch_tensor] input_ids, segment_ids, input_mask, mask_qkv, lm_label_ids, masked_pos, masked_weights, is_next, task_idx = batch predict_bleu = args.predict_bleu * torch.ones([input_ids.shape[0]], device=input_ids.device) # B oracle_pos, oracle_weights, oracle_labels = None, None, None with torch.no_grad(): logits = 
model(input_ids, segment_ids, input_mask, lm_label_ids, is_next, masked_pos=masked_pos, masked_weights=masked_weights, task_idx=task_idx, masked_pos_2=oracle_pos, masked_weights_2=oracle_weights, masked_labels_2=oracle_labels, mask_qkv=mask_qkv, labels=predict_bleu, train_ks=True,train_vae=args.train_vae) logits = torch.nn.functional.softmax(logits, dim=1) labels = logits[:, 1].cpu().numpy() # print(labels) for i in range(len(labels)): line = batch_src[i].strip() line += "\t" line += str(labels[i]) out.write(line) out.write("\n")
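In the fp16 branches of the training loop above, the learning rate is set by hand from warmup_linear(global_step / t_total, args.warmup_proportion) before each optimizer step. The sketch below assumes the linear warmup-then-decay shape of the classic pytorch-pretrained-bert helper; the warmup_linear actually imported by this script may differ in detail.

def warmup_linear_sketch(progress, warmup=0.1):
    """progress = global_step / t_total; returns the LR multiplier.
    Assumed shape: linear ramp from 0 to 1 over the warmup fraction,
    then linear decay back toward 0 over the rest of training."""
    if progress < warmup:
        return progress / warmup
    return max(0.0, 1.0 - progress)

# e.g. with learning_rate = 5e-5 and warmup_proportion = 0.1:
# warmup_linear_sketch(0.05) -> 0.5, so lr_this_step = 2.5e-05
# warmup_linear_sketch(0.60) -> 0.4, so lr_this_step = 2.0e-05
lr_this_step = 5e-5 * warmup_linear_sketch(0.05)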
def init_optimizer_and_amp(model, learning_rate, loss_scale, warmup_proportion, num_train_optimization_steps, use_fp16): param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ { 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }, ] optimizer, scheduler = None, None if use_fp16: logger.info("using fp16") try: from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex to use " "distributed and fp16 training.") if num_train_optimization_steps is not None: optimizer = FusedAdam( optimizer_grouped_parameters, lr=learning_rate, bias_correction=False, ) amp_inits = amp.initialize( model, optimizers=optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic" if loss_scale == 0 else loss_scale, ) model, optimizer = (amp_inits if num_train_optimization_steps is not None else (amp_inits, None)) if num_train_optimization_steps is not None: scheduler = LinearWarmUpScheduler( optimizer, warmup=warmup_proportion, total_steps=num_train_optimization_steps, ) else: logger.info("using fp32") if num_train_optimization_steps is not None: optimizer = BertAdam( optimizer_grouped_parameters, lr=learning_rate, warmup=warmup_proportion, t_total=num_train_optimization_steps, ) return model, optimizer, scheduler
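A possible call site for init_optimizer_and_amp, sketched with placeholder hyperparameters; the surrounding model and train_dataloader are assumed to exist in the caller, and the fp32 path is chosen here, so BertAdam applies its own warmup and scheduler comes back as None.

# Hypothetical usage of the helper above; the numbers are placeholders.
model, optimizer, scheduler = init_optimizer_and_amp(
    model,
    learning_rate=5e-5,
    loss_scale=0,                      # dynamic scaling if fp16 were enabled
    warmup_proportion=0.1,
    num_train_optimization_steps=10000,
    use_fp16=False,                    # fp32 path: BertAdam with built-in warmup
)

for batch in train_dataloader:         # assumed to be defined by the caller
    loss = model(*batch)
    loss.backward()
    if scheduler is not None:          # only set on the fp16 path
        scheduler.step()
    optimizer.step()
    optimizer.zero_grad()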
def main(): parser = argparse.ArgumentParser() # # Required parameters parser.add_argument('--task', default='multi', type=str, help='Task affecting load data and vectorize feature') parser.add_argument( '--loss_type', default='double', type=str, help='Select loss double or single, only for multi task' ) # only effective for the multi task parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help= "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, " "bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-chinese, " "bert-base-multilingual-cased.") # choose the pre-trained model parser.add_argument("--debug", default=False, help="Whether to run on a small dataset") # should normally be False parser.add_argument( "--output_dir", default="./SQuAD/output/", type=str, help= "The output directory where the model checkpoints and predictions will be written." ) # # Other parameters parser.add_argument("--train_file", default="./SQuAD/version/train.json", type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default="./SQuAD/version/prediction.json", type=str, help= "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will be " "truncated to this length.") # # Control parameters parser.add_argument("--do_train", default=True, help="Whether to run training.") parser.add_argument("--do_predict", default=True, help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=18, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=18, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json file." ) parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, help= "If true, all of the warnings related to data processing will be printed. A number of " "warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass."
) parser.add_argument( "--do_lower_case", default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling. Positive power of 2: static loss scaling value.\n" ) parser.add_argument( '--version_2_with_negative', default=False, help= 'If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument( '--null_score_diff_threshold', type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null." ) args = parser.parse_args() # The if branch is the single-machine setup, the else branch the distributed one; since no distributed system is available here, training uses a single machine with multiple GPUs (10.24) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') # The next few statements only configure the logging output format (10.24) logging.basicConfig( format='%(asctime)s-%(levelname)s-%(name)s-%(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) # The following lines set up the run parameters (10.24) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) # set the random seed np.random.seed(args.seed) # set the random seed torch.manual_seed(args.seed) # seed the CPU RNG so the results are deterministic if n_gpu > 0: # with multiple GPUs, torch.cuda.manual_seed_all() seeds every GPU torch.cuda.manual_seed_all(args.seed) # Basic sanity checks on the flags (10.24) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified."
) # The next two lines check whether output_dir exists and create it if not (arguably awkward, since an empty folder is then required) (10.24) # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: # raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Initialize the tokenizer from the tokenization module (10.24) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # How the data is read: either the single-queue (squad) reader or the multi-channel (multi) reader (10.24) if args.task == 'squad': read_examples = read_squad_examples elif args.task == 'multi': read_examples = read_multi_examples # Load the training examples and compute the number of optimization steps (10.24) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) if args.debug: train_examples = train_examples[:100] num_train_optimization_steps = \ int(len(train_examples)/args.train_batch_size/args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare the model (10.24) model = BertForQuestionAnswering.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) # model = torch.nn.DataParallel(model).cuda() # Decide whether to use float16 weights (10.24) if args.fp16: # model.half().cuda() model.half() # Move the model to the CPU or GPU (10.24) model.to(device) # Configure the optimizer (10.24) if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: # from apex.optimizers import FP16_Optimizer from apex.fp16_utils import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=True) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # Fit (train) the model (10.24) global_step = 0 if args.do_train: # Extract features from the training corpus train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) all_start_vector = torch.tensor( [f.start_vector for f in train_features], dtype=torch.float) all_end_vector = torch.tensor([f.end_vector for f in train_features], dtype=torch.float) all_content_vector = torch.tensor( [f.content_vector for f in train_features], dtype=torch.float) # # Alternative construction of all_start_positions and all_end_positions # all1_start_positions = [] # for i in range(len(train_features)): # for j in range(len(train_features[i].start_position)): # all1_start_positions.append(train_features[i].start_position[j]) # all_start_positions = torch.tensor([k for k in all1_start_positions], dtype=torch.long) # all1_end_positions = [] # for i in range(len(train_features)): # for j in range(len(train_features[i].end_position)): # all1_end_positions.append(train_features[i].end_position[j]) # all_end_positions = torch.tensor([k for k in all1_end_positions], dtype=torch.long) # #################################################################### train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions, all_start_vector, all_end_vector, all_content_vector) if args.local_rank == -1: train_sampler = RandomSampler(train_data) # random sampler else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ep in trange(int(args.num_train_epochs), desc="Epoch"): # Re-wrap the model every epoch so batches are scattered across multiple GPUs model = torch.nn.DataParallel(model).cuda() for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering itself input_ids, input_mask, segment_ids, start_positions, end_positions, start_vector, end_vector, content_vector = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions, start_vector, end_vector, content_vector, args.loss_type) if n_gpu > 1: loss = loss.mean() # mean() to average on
multi-gpu. print("loss: {}".format(loss)) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 print("\n") print(ep) output_model_file = os.path.join(args.output_dir, str(ep) + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, str(ep) + CONFIG_NAME) torch.save(model.state_dict(), output_model_file) if isinstance(model, torch.nn.DataParallel): model = model.module model.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Reload the fine-tuned model so it can be used for prediction (10.25) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) # Move the model back onto the GPU (10.25) model.to(device) # Run prediction and write the prediction files if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = \ read_examples(input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) if args.debug: eval_examples = eval_examples[:100] eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating",
disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) middle_result = os.path.join(args.output_dir, 'middle_result.pkl') pickle.dump([eval_examples, eval_features, all_results], open(middle_result, 'wb')) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") if (args.loss_type == 'double'): write_predictions_couple_labeling( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) elif (args.loss_type == 'single'): write_predictions_single_labeling( eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) elif (args.loss_type == 'origin') or (args.task == 'multi' and args.loss_type == 'squad'): write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) else: raise ValueError('{} dataset and {} loss are not supported'.format( args.task, args.loss_type))
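# ---------------------------------------------------------------------------
# Illustration only: a minimal sketch of the linear warmup-then-decay factor
# that the manual `param_group['lr'] = lr_this_step` update above relies on
# when FusedAdam/FP16_Optimizer is used (BertAdam applies its own schedule
# internally). The function name and the exact decay shape are assumptions
# for this sketch, not the library's exact implementation.
def warmup_linear_factor(progress, warmup=0.1):
    """progress = global_step / t_total, expected in [0, 1]."""
    if progress < warmup:
        return progress / warmup                      # ramp up linearly to 1.0
    return max((1.0 - progress) / (1.0 - warmup), 0.0)  # then decay linearly to 0.0

# Usage sketch: scale the base learning rate at each optimizer update.
# lr_this_step = args.learning_rate * warmup_linear_factor(global_step / t_total,
#                                                          args.warmup_proportion)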
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--word_embedding_file", default='./emb/wiki-news-300d-1M.vec', type=str, help="The input directory of word embeddings.") parser.add_argument("--index_path", default='./emb/p_index.bin', type=str, help="The input directory of word embedding index.") parser.add_argument("--word_embedding_info", default='./emb/vocab_info.txt', type=str, help="The input directory of word embedding info.") parser.add_argument("--data_file", default='', type=str, help="The input directory of input data file.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--max_ngram_length", default=16, type=int, help="The maximum total ngram sequence") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--embedding_size", default=300, type=int, help="Total batch size for embeddings.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--num_eval_epochs", default=3.0, type=float, help="Total number of eval epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--single', action='store_true', help="Whether only evaluate a single epoch") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Comment the if else block for no CUDA if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") #device = torch.device("cpu") # uncomment this for no GPU logger.info( "device: {} , distributed training: {}, 16-bits training: {}".format( device, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: # Comment this to No GPU torch.cuda.manual_seed_all(args.seed) # Comment this for No GPU if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None w2i, i2w, vocab_size = {}, {}, 1 if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) train_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_train( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Num token vocab = %d", vocab_size) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_tokens = torch.tensor([f.token_ids for f in 
train_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # load embeddings sa if args.do_train: logger.info("Loading word embeddings ... ") emb_dict, emb_vec, vocab_list, emb_vocab_size = load_vectors( args.word_embedding_file) if not os.path.exists(args.index_path): write_vocab_info(args.word_embedding_info, emb_vocab_size, vocab_list) p = load_embeddings_and_save_index(range(emb_vocab_size), emb_vec, args.index_path) else: #emb_vocab_size, vocab_list = load_vocab_info(args.word_embedding_info) p = load_embedding_index(args.index_path, emb_vocab_size, num_dim=args.embedding_size) #emb_dict, emb_vec, vocab_list, emb_vocab_size, p = None, None, None, None, None # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForDiscriminator.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: # Comment this for NO GPU model = torch.nn.DataParallel(model) # Comment this for NO GPU # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 1 tr_loss = 0 if args.do_train: train_data = TensorDataset(all_tokens, all_label_id) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for ind in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 flaw_eval_f1 = [] flaw_eval_recall = [] flaw_eval_precision = [] for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) tokens, _ = batch #, label_id, ngram_ids, ngram_labels, ngram_masks # module1: learn a discriminator tokens = tokens.to('cpu').numpy() #print("PRINTING TOKENS!!!!!!!!! 
", len(tokens[0])) train_features = convert_examples_to_features_flaw( tokens, args.max_seq_length, args.max_ngram_length, tokenizer, i2w, emb_dict, p, vocab_list) flaw_mask = torch.tensor([f.flaw_mask for f in train_features], dtype=torch.long).to( device) # [1, 1, 1, 1, 0,0,0,0] flaw_ids = torch.tensor([f.flaw_ids for f in train_features], dtype=torch.long).to( device) # [12,25,37,54,0,0,0,0] flaw_labels = torch.tensor( [f.flaw_labels for f in train_features], dtype=torch.long).to(device) # [0, 1, 1, 1, 0,0,0,0] loss, logits = model(flaw_ids, flaw_mask, flaw_labels) logits = logits.detach().cpu().numpy() if n_gpu > 1: # Comment this for NO GPU loss = loss.mean() # Comment this for NO GPU if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += flaw_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # eval during training flaw_labels = flaw_labels.to('cpu').numpy() flaw_tmp_eval_f1, flaw_tmp_eval_recall, flaw_tmp_eval_precision = f1_3d( logits, flaw_labels) flaw_eval_f1.append(flaw_tmp_eval_f1) flaw_eval_recall.append(flaw_tmp_eval_recall) flaw_eval_precision.append(flaw_tmp_eval_precision) nb_eval_examples += flaw_ids.size(0) nb_eval_steps += 1 flaw_f1 = sum(flaw_eval_f1) / len(flaw_eval_f1) flaw_recall = sum(flaw_eval_recall) / len(flaw_eval_recall) flaw_precision = sum(flaw_eval_precision) / len( flaw_eval_precision) loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'flaw_f1': flaw_f1, "flaw_recall": flaw_recall, "flaw_precision": flaw_precision, 'loss': loss, } output_eval_file = os.path.join(args.output_dir, "train_results.txt") with open(output_eval_file, "a") as writer: #logger.info("***** Training results *****") writer.write("epoch" + str(ind) + '\n') for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('\n') model_to_save = model.module if hasattr(model, 'module') else model output_model_file = os.path.join(args.output_dir, "epoch" + str(ind) + WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) os.rename( output_model_file, os.path.join(args.output_dir, "disc_trained_" + WEIGHTS_NAME)) current_path = os.path.join(args.output_dir, "disc_trained_" + WEIGHTS_NAME) new_path = os.path.join('./models', "disc_trained_" + WEIGHTS_NAME) new_path_config = os.path.join('./models' + CONFIG_NAME) shutil.move(current_path, new_path) shutil.move(output_config_file, new_path_config) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # for trouble-shooting eval_examples = processor.get_disc_dev_examples(args.data_file) eval_features, w2i, i2w, vocab_size = convert_examples_to_features_disc_eval( eval_examples, label_list, args.max_seq_length, tokenizer, w2i, i2w, vocab_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Num token vocab = %d", vocab_size) logger.info(" Batch size = %d", args.eval_batch_size) all_token_ids = torch.tensor([f.token_ids for f in eval_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], 
dtype=torch.long) all_flaw_labels = torch.tensor([f.flaw_labels for f in eval_features], dtype=torch.long) all_flaw_ids = torch.tensor([f.flaw_ids for f in eval_features], dtype=torch.long) all_label_id = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) all_chunks = torch.tensor([f.chunks for f in eval_features], dtype=torch.long) #print("flaw ids in eval_features: ", all_flaw_ids) eval_data = TensorDataset(all_token_ids, all_input_ids, all_input_mask, all_flaw_ids, all_flaw_labels, all_label_id, all_chunks) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Load a trained model and config that you have fine-tuned if args.single: eval_range = trange(int(args.num_eval_epochs), int(args.num_eval_epochs + 1), desc="Epoch") else: eval_range = trange(int(args.num_eval_epochs), desc="Epoch") attack_type = 'rand' for epoch in eval_range: output_file = os.path.join( args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" + attack_type + ".tsv") with open(output_file, "w") as csv_file: writer = csv.writer(csv_file, delimiter='\t') writer.writerow(["sentence", "label", "ids"]) #output_model_file = os.path.join(args.output_dir, "epoch"+str(epoch)+WEIGHTS_NAME) output_model_file = os.path.join(args.output_dir, "disc_trained_" + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) #print("output_model_file: ", output_model_file) config = BertConfig(output_config_file) model = BertForDiscriminator(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) model.eval() predictions, truths = [], [] eval_loss, nb_eval_steps, nb_eval_examples = 0, 0, 0 eval_accuracy = 0 for token_ids, input_ids, input_mask, flaw_ids, flaw_labels, label_id, chunks in tqdm( eval_dataloader, desc="Evaluating"): token_ids = token_ids.to(device) input_ids = input_ids.to(device) input_mask = input_mask.to(device) flaw_labels = flaw_labels.to(device) flaw_ids = flaw_ids.to(device) #print("flaw ids in eval_dataloader: ", flaw_ids) with torch.no_grad(): tmp_eval_loss, s = model(input_ids, input_mask, flaw_labels) # print("tmp_eval_loss: ",tmp_eval_loss) # print("s: ",s) logits = model(input_ids, input_mask) print("len of logits: ", len(logits)) print("shape of logits: ", logits.size()) print("type of logits: ", type(logits)) print("type of logits: ", logits) flaw_logits = torch.argmax(logits, dim=2) print("Type of flaw_logits: ", type(flaw_logits)) print("shape of flaw_logits: ", flaw_logits.size()) print("Length of flaw_logits: ", len(flaw_logits)) print("flaw_logits: ", flaw_logits) logits = logits.detach().cpu().numpy() flaw_logits = flaw_logits.detach().cpu().numpy() flaw_ids = flaw_ids.to('cpu').numpy() label_id = label_id.to('cpu').numpy() chunks = chunks.to('cpu').numpy() token_ids = token_ids.to('cpu').numpy() flaw_logits = logit_converter( flaw_logits, chunks) # each word only has one '1' print("Type of flaw_logits logit_converter: ", type(flaw_logits)) #print("shape of flaw_logits logit_converter : ",flaw_logits.size()) print("Length of flaw_logits logit_converter : ", len(flaw_logits)) print("flaw_logits logit_converter : ", flaw_logits) true_logits = [] #print("length of flaw_ids: ",len(flaw_ids)) for i in range(len(flaw_ids)): tmp = [0] * len(flaw_logits[i]) #print("tmp: ",tmp) # ne line #print("printing i:",i) #print("len of tmp: ",len(tmp)) #print("length of flaw_ids of i : ",len(flaw_ids[i])) 
#print("flaw_ids[i]: ",flaw_ids[i]) for j in range(len(flaw_ids[0])): #print("flaw_ids[i][j] : ",flaw_ids[i][j]) #print("tmp value: ", tmp) #print("tmp len: ", len(tmp)) if flaw_ids[i][j] == 0: break if flaw_ids[i][j] >= len(tmp): continue tmp[flaw_ids[i][j]] = 1 true_logits.append(tmp) #print('true_logits: ', true_logits) tmp_eval_accuracy = accuracy_2d(flaw_logits, true_logits) eval_accuracy += tmp_eval_accuracy predictions += true_logits # Original truths += flaw_logits # Original #predictions += flaw_logits # for trouble-shooting #truths += true_logits # for trouble-shooting eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 with open(output_file, "a") as csv_file: for i in range(len(label_id)): #print("i in write output file:",i) token = ' '.join( [i2w[x] for x in token_ids[i] if x != 0]) flaw_logit = flaw_logits[i] #print("flaw_logit in write output file: ",flaw_logit) label = str(label_id[i]) logit = ','.join([ str(i) for i, x in enumerate(flaw_logit) if x == 1 ]) # for trouble-shooting logit = '-1' if logit == '' else logit # for trouble-shooting writer = csv.writer(csv_file, delimiter='\t') writer.writerow([token, label, logit]) # Renaming and moving the file for Embedding Estimator eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_steps eval_f1_score, eval_recall_score, eval_precision_score = f1_2d( truths, predictions) loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_f1': eval_f1_score, 'eval_recall': eval_recall_score, 'eval_precision': eval_precision_score, 'eval_acc': eval_accuracy } output_eval_file = os.path.join( args.output_dir, "disc_eval_results_" + attack_type + "_attacks.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) #attack_type='drop' new_path = os.path.join( args.data_dir, "disc_eval_outputs_" + attack_type + ".tsv") current_path = os.path.join( args.data_dir, "epoch" + str(epoch) + "disc_eval_outputs_" + attack_type + ".tsv") os.rename(current_path, new_path)
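# ---------------------------------------------------------------------------
# Sketch: the weight-decay grouping that the scripts above build inline,
# factored into a small helper. The `no_decay` substrings and the 0.01 decay
# value mirror this file; the helper name itself is illustrative.
def group_parameters_for_decay(model,
                               no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight'),
                               weight_decay=0.01):
    named = list(model.named_parameters())
    return [
        # regular weights get weight decay
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        # biases and LayerNorm parameters are excluded from decay
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

# Usage sketch:
# optimizer = BertAdam(group_parameters_for_decay(model),
#                      lr=args.learning_rate,
#                      warmup=args.warmup_proportion,
#                      t_total=num_train_optimization_steps)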
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=16, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." 
) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--pretrain', action='store_true', help="Whether to load a pre-trained model for continuing training") parser.add_argument('--pretrained_model_file', type=str, help="The path of the pretrained_model_file") parser.add_argument( "--percent", default=100, type=float, help="The percentage of examples used in the training data.\n") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) train_examples = train_examples[:int( len(train_examples) * args.percent / 100)] num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = ConditionalSemanticBertForQuestionAnswering.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) model.QABert = BertModel.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)) if args.pretrain: # Load a pre-trained model print('load a pre-trained model from ' + args.pretrained_model_file) pretrained_state_dict = torch.load(args.pretrained_model_file) model_state_dict = model.state_dict() print( 'QABert', model_state_dict[ 'QABert.encoder.layer.4.attention.self.value.weight']) print( 'bert', model_state_dict[ 'bert.encoder.layer.4.attention.self.value.weight']) print('pretrained_state_dict', pretrained_state_dict.keys()) print('model_state_dict', model_state_dict.keys()) pretrained_state = { k: v for k, v in pretrained_state_dict.items() if k in model_state_dict and v.size() == model_state_dict[k].size() } print('stored pretrained dict', pretrained_state.keys()) model_state_dict.update(pretrained_state) print('updated_state_dict', model_state_dict.keys()) model.load_state_dict(model_state_dict) model.to(device) # for param in model.bert.parameters(): # param.requires_grad = False if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if t_total is None: t_total = -1 if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) train_features = None try: # with open(cached_train_features_file, "rb") as reader: with open('not load features', "rb") as reader: train_features = pickle.load(reader) print('not load stored features, but generate itself') except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): total_loss = 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps total_loss += loss if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 print('loss', total_loss) # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = ConditionalSemanticBertForQuestionAnswering.from_pretrained( args.bert_model, state_dict=model_state_dict) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
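# ---------------------------------------------------------------------------
# Sketch: the partial checkpoint loading used for `--pretrain` above, i.e.
# copy only the tensors whose names and shapes match the current model and
# leave everything else at its initialized value. The helper name and the
# generic nn.Module assumption are illustrative.
import torch

def load_matching_weights(model, checkpoint_path, map_location='cpu'):
    pretrained = torch.load(checkpoint_path, map_location=map_location)
    own = model.state_dict()
    # keep only tensors that exist in the current model with the same shape
    matched = {k: v for k, v in pretrained.items()
               if k in own and v.size() == own[k].size()}
    own.update(matched)
    model.load_state_dict(own)
    return sorted(matched.keys())  # report which tensors were actually copied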
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--pretrained_model_path", default=None, type=str, help="Pretrained basic Bert model") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="triviaqa train file") parser.add_argument("--predict_file", default=None, type=str, help="triviaqa dev or test file in SQuAD format") parser.add_argument("--predict_data_file", default=None, type=str, help="triviaqa dev or test file in Triviaqa format") # history queries parameters parser.add_argument("--use_history", default=False, action="store_true") parser.add_argument( "--append_history", default=False, action="store_true", help="Whether to append the previous queries to the current one.") parser.add_argument( "--n_history", default=-1, type=int, help="The number of previous queries used in current query.") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_validate", action='store_true', help="Whether to run validation when training") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") # supervised & reinforcement learning parser.add_argument("--supervised_pretraining", action='store_true', help="Whether to do supervised pretraining.") #parser.add_argument("--reload_model_path", type=str, help="Path of pretrained model.") parser.add_argument("--recur_type", type=str, default="gated", help="Recurrence model type.") # model parameters parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_read_times", default=6, type=int, help="Maximum read times of one document") parser.add_argument("--stop_loss_weight", default=1.0, type=float, help="The weight of stop_loss in training") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. 
E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=2, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model if args.pretrained_model_path is not None and os.path.isfile( args.pretrained_model_path): logger.info("Reloading pretrained model from {}".format( args.pretrained_model_path)) model_state_dict = torch.load(args.pretrained_model_path) model = RCMBert.from_pretrained(args.bert_model, state_dict=model_state_dict, action_num=len(stride_action_space), recur_type=args.recur_type, allow_yes_no=False) else: logger.info("Training a new model from scratch") model = RCMBert.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank), action_num=len(stride_action_space), recur_type=args.recur_type, allow_yes_no=False) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] no_decay += ['recur_network', 'stop_network', 'move_stride_network'] logger.info("Parameter without decay: {}".format(no_decay)) optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] """ examples & features """ train_examples = None dev_examples = None num_train_steps = None if args.do_train: cached_train_examples_file = args.train_file + '_train_examples' cached_dev_examples_file = args.train_file + '_dev_examples' try: with open(cached_train_examples_file, "rb") as reader: train_examples = pickle.load(reader) with open(cached_dev_examples_file, "rb") as reader: dev_examples = pickle.load(reader) logger.info("Loading train and dev examples...") except: all_train_examples = read_quac_examples( input_file=args.train_file, is_training=True, use_history=args.use_history, n_history=args.n_history) train_examples, dev_examples = split_train_dev_data( all_train_examples) with open(cached_train_examples_file, "wb") as writer: pickle.dump(train_examples, writer) with open(cached_dev_examples_file, "wb") as writer: pickle.dump(dev_examples, writer) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.do_train: cached_train_features_file = args.train_file + '_{0}_{1}_RCM_train'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_query_length)) cached_dev_features_file = args.train_file + '_{0}_{1}_RCM_dev'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_query_length)) train_features = None dev_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) with open(cached_dev_features_file, "rb") as reader: dev_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_query_length=args.max_query_length, is_training=True, append_history=args.append_history) dev_features = convert_examples_to_features( examples=dev_examples, tokenizer=tokenizer, max_query_length=args.max_query_length, is_training=True, append_history=args.append_history) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) logger.info(" Saving dev features into cached file %s", cached_dev_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) with open(cached_dev_features_file, "wb") as writer: pickle.dump(dev_features, writer) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) if args.do_validate and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("***** Dev data *****") logger.info(" Num orig dev examples = %d", len(dev_examples)) logger.info(" Num split dev examples = %d", len(dev_features)) logger.info(" Batch size = %d", args.predict_batch_size) dev_evaluator = QuACEvaluator(dev_examples) train_model(args, model, tokenizer, optimizer, train_examples, train_features, dev_examples, dev_features, dev_evaluator, device, n_gpu, t_total) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # load model output_model_file = os.path.join(args.output_dir, "best_RCM_model.bin") model_state_dict = torch.load(output_model_file) model = RCMBert.from_pretrained(args.bert_model, state_dict=model_state_dict, action_num=len(stride_action_space), recur_type=args.recur_type, allow_yes_no=False) model.to(device) # load data test_examples = read_quac_examples(input_file=args.predict_file, is_training=False, use_history=args.use_history, n_history=args.n_history) cached_test_features_file = args.predict_file + '_{0}_{1}_RCM_test'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_query_length)) test_features = None try: with open(cached_test_features_file, "rb") as reader: test_features = pickle.load(reader) except: test_features = convert_examples_to_features( examples=test_examples, tokenizer=tokenizer, max_query_length=args.max_query_length, is_training=False, append_history=args.append_history) with 
open(cached_test_features_file, "wb") as writer: pickle.dump(test_features, writer) logger.info("***** Prediction data *****") logger.info(" Num test orig examples = %d", len(test_examples)) logger.info(" Num test split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) test_model(args, model, tokenizer, test_examples, test_features, device)
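# ---------------------------------------------------------------------------
# Sketch: the cache-or-build pattern used above for train/dev/test features,
# written as a small helper. The builder callback and helper name are
# illustrative; the scripts in this file inline the same logic with try/except.
import os
import pickle

def load_or_build(cache_file, build_fn):
    if os.path.exists(cache_file):
        with open(cache_file, "rb") as reader:
            return pickle.load(reader)      # reuse previously converted features
    data = build_fn()                        # expensive conversion happens once
    with open(cache_file, "wb") as writer:
        pickle.dump(data, writer)
    return data

# Usage sketch:
# test_features = load_or_build(
#     cached_test_features_file,
#     lambda: convert_examples_to_features(examples=test_examples, tokenizer=tokenizer,
#                                          max_query_length=args.max_query_length,
#                                          is_training=False,
#                                          append_history=args.append_history))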
def run(self, repeats=1): # Loss and Optimizer criterion = nn.CrossEntropyLoss() _params = filter(lambda p: p.requires_grad, self.model.parameters()) optimizer = self.opt.optimizer(_params, lr=self.opt.learning_rate, weight_decay=self.opt.l2reg) if not os.path.exists('log/'): os.mkdir('log/') f_out = open('log/' + self.opt.model_name + '_' + self.opt.dataset + '_val.txt', 'w', encoding='utf-8') max_test_acc_avg = 0 max_test_f1_avg = 0 for i in range(repeats): print('repeat: ', (i + 1)) f_out.write('repeat: ' + str(i + 1)) self._reset_params() self.model.bert = bert_model self.model.cuda() if self.opt.mode == "train": parameters = [ p for name, p in self.model.named_parameters() if 'bert' in name ] parameters = [p for name, p in self.model.named_parameters()] named_params = [(name, p) for name, p in self.model.named_parameters()] optimizer1 = torch.optim.Adam(parameters, lr=0.00005, weight_decay=0.00001) optimizer_grouped_parameters = [{ 'params': parameters, 'weight_decay': 0.01 }] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in named_params if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in named_params if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] if self.opt.usebert: optimizer1 = BertAdam( optimizer_grouped_parameters, lr=0.00005, warmup=0.1, t_total=self.train_data_loader.batch_len * 20) else: optimizer1 = torch.optim.Adam(parameters, lr=0.001, weight_decay=0.00001) print('not use bert') max_test_acc, max_test_f1 = self._train( criterion, optimizer, optimizer1) print('max_test_acc: {0} max_test_f1: {1}'.format( max_test_acc, max_test_f1)) f_out.write('max_test_acc: {0}, max_test_f1: {1}'.format( max_test_acc, max_test_f1)) max_test_acc_avg += max_test_acc max_test_f1_avg += max_test_f1 print('#' * 100) print("max_test_acc_avg:", max_test_acc_avg / repeats) print("max_test_f1_avg:", max_test_f1_avg / repeats) # torch.save(self.model.state_dict(),self.opt.model_name+'_'+self.opt.dataset+'.pth') f_out.close() else: self.model.load_state_dict( torch.load('state_dict/' + self.opt.model_name + '_' + self.opt.dataset + '.pkl')) test_acc, test_f1 = self._evaluate_acc_f1withmore() print("max_test_acc_avg:", test_acc / repeats) print("max_test_f1_avg:", test_f1 / repeats) f_out.close()
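# ---------------------------------------------------------------------------
# Sketch: how the other entry points in this file derive `t_total`, the total
# number of optimizer updates that BertAdam uses for its warmup schedule (the
# hard-coded `self.train_data_loader.batch_len * 20` above is an ad-hoc
# variant of the same idea). The helper name is illustrative.
def compute_t_total(num_examples, train_batch_size, gradient_accumulation_steps,
                    num_train_epochs, world_size=1):
    steps_per_epoch = num_examples / train_batch_size / gradient_accumulation_steps
    t_total = int(steps_per_epoch * num_train_epochs)
    if world_size > 1:
        t_total = t_total // world_size   # each distributed worker sees a shard
    return t_total

# Usage sketch:
# optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
#                      warmup=args.warmup_proportion,
#                      t_total=compute_t_total(len(train_examples),
#                                              args.train_batch_size,
#                                              args.gradient_accumulation_steps,
#                                              args.num_train_epochs))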
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--load_finetuned_model', action='store_true', default=False, help="Load finetuned model.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "compq": COMPQProcessor, } output_modes = { "compq": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) if args.load_finetuned_model: print("Loading finetuned model....") model = BertForSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = 
BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] softmax_preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) softmax_preds.append(Softmax(1)(logits).detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) softmax_preds[0] = np.append( softmax_preds[0], Softmax(1)(logits).detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = 
preds[0] softmax_preds = softmax_preds[0] output_prediction_file = os.path.join(args.output_dir, "predictions.txt") with open(output_prediction_file, 'w') as writer: for i, pred in enumerate(softmax_preds): writer.write( str(pred[0]) + '\t' + str(pred[1]) + '\t' + eval_examples[i].text_a + '\n') if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
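# The fp16 branch above rescales the learning rate by hand because FusedAdam,
# unlike BertAdam, does not apply the warmup schedule itself. A minimal
# standalone sketch of a warmup-then-linear-decay factor in that spirit
# (an approximation for illustration, not the library's WarmupLinearSchedule):
def warmup_linear_factor(progress, warmup=0.1):
    """progress = global_step / t_total; warmup = warmup_proportion."""
    if warmup > 0 and progress < warmup:
        return progress / warmup                          # linear ramp 0 -> 1
    return max((1.0 - progress) / (1.0 - warmup), 0.0)    # linear decay 1 -> 0

# Usage sketch with hypothetical numbers (5e-5 base LR, 10% warmup, 1000 steps):
base_lr, t_total = 5e-5, 1000
lrs = [base_lr * warmup_linear_factor(step / t_total) for step in (50, 100, 500, 1000)]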
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, choices=["WSD"], help="The name of the task to train.") parser.add_argument("--train_data_dir", default=None, type=str, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--eval_data_dir", default=None, type=str, help="The label data dir. (./wordnet)") parser.add_argument("--label_data_dir", default=None, type=str, required=True, help="The label data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") parser.add_argument("--bert_model", default=None, type=str, required=True, help='''a path or url to a pretrained model archive containing: 'bert_config.json' a configuration file for the model 'pytorch_model.bin' a PyTorch dump of a BertForPreTraining instance''') ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text. True for uncased models, False for cased models.") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_test: raise ValueError("At least one of `do_train` or `do_test` must be True.") if args.do_train: assert args.train_data_dir != None, "train_data_dir can not be None" if args.do_eval: assert args.eval_data_dir != None, "eval_data_dir can not be None" if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) # prepare dataloaders processors = { "WSD":WSDProcessor } output_modes = { "WSD": "classification" } processor = processors[args.task_name]() output_mode = output_modes[args.task_name] label_list = processor.get_labels(args.label_data_dir) num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # training set train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.train_data_dir, args.label_data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForTokenClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in 
no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) # load data if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in train_features], dtype=torch.long) all_index_start = torch.tensor([f.index_start for f in train_features], dtype=torch.long) all_index_end = torch.tensor([f.index_end for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_index_start, all_index_end) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) if args.do_eval: eval_examples = processor.get_dev_examples(args.eval_data_dir, args.label_data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in eval_features], dtype=torch.float) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_label_mask) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) # train global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: model.train() epoch = 
0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): epoch += 1 tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids, target_mask, index_start, index_end = batch all_label_mask = [] for i in range(len(index_start)): label_mask = [float("-inf")] * len(label_list) for i in range(index_start[i][0].item(), index_end[i][0].item()): label_mask[i] = 0 all_label_mask.append(label_mask) all_label_mask = torch.tensor(all_label_mask, dtype=torch.float).to(device) logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + all_label_mask logits = F.softmax(logits, dim=-1) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step/num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` model_output_dir = os.path.join(args.output_dir, str(epoch)) if not os.path.exists(model_output_dir): os.makedirs(model_output_dir) output_model_file = os.path.join(model_output_dir, WEIGHTS_NAME) output_config_file = os.path.join(model_output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(model_output_dir) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results_"+str(epoch)+".txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, target_mask, label_mask in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) target_mask = target_mask.to(device) label_mask = label_mask.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + label_mask logits_ = F.softmax(logits, dim=-1) logits_ = logits_.detach().cpu().numpy() label_ids_ = label_ids.to('cpu').numpy() outputs = np.argmax(logits_, axis=1) for output_i in range(len(outputs)): f.write(str(outputs[output_i])) f.write("\n") tmp_eval_accuracy = np.sum(outputs == label_ids_) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss 
= loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None result = OrderedDict() result['eval_loss'] = eval_loss result['eval_accuracy'] = eval_accuracy result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: writer.write("epoch=%s\n"%str(epoch)) logger.info("***** Eval results *****") for key in result.keys(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.eval_data_dir, args.label_data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_target_mask = torch.tensor([f.target_mask for f in eval_features], dtype=torch.long) all_label_mask = torch.tensor([f.label_mask for f in eval_features], dtype=torch.float) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_target_mask, all_label_mask) eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size, shuffle=False) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 with open(os.path.join(args.output_dir, "results.txt"),"w") as f: for input_ids, input_mask, segment_ids, label_ids, target_mask, label_mask in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) target_mask = target_mask.to(device) label_mask = label_mask.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None, target_mask=target_mask) logits = logits + label_mask logits_ = F.softmax(logits, dim=-1) logits_ = logits_.detach().cpu().numpy() label_ids_ = label_ids.to('cpu').numpy() outputs = np.argmax(logits_, axis=1) for output_i in range(len(outputs)): f.write(str(outputs[output_i])) f.write("\n") tmp_eval_accuracy = np.sum(outputs == label_ids_) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() 
eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss/nb_tr_steps if args.do_train else None result = OrderedDict() result['eval_loss'] = eval_loss result['eval_accuracy'] = eval_accuracy result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a+") as writer: logger.info("***** Eval results *****") for key in result.keys(): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
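# Toy, self-contained illustration of the additive candidate mask used in the
# WSD loops above: candidate senses get 0, every other label gets -inf, so the
# softmax and argmax are confined to [index_start, index_end) per example.
# Sizes and values below are made up for the example.
import torch
import torch.nn.functional as F

num_labels = 6
logits = torch.randn(2, num_labels)                         # stand-in for model output
index_start = torch.tensor([[1], [3]])
index_end = torch.tensor([[4], [6]])

label_mask = torch.full((2, num_labels), float("-inf"))
for row in range(len(index_start)):
    s, e = index_start[row][0].item(), index_end[row][0].item()
    label_mask[row, s:e] = 0.0

masked = logits + label_mask
probs = F.softmax(masked, dim=-1)      # zero probability outside the candidates
pred = probs.argmax(dim=-1)            # always lands inside each example's range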
def train_and_evaluate(model, params, restore_file=None): """Train the model and evaluate every epoch.""" # load args args = parser.parse_args() # Load training data and val data dataloader = NERDataLoader(params) train_loader = dataloader.get_dataloader(data_sign='train') val_loader = dataloader.get_dataloader(data_sign='val') # number of steps in one epoch params.train_steps = len(train_loader) # Prepare optimizer # fine-tuning # fetch the model weights param_optimizer = list(model.named_parameters()) # pretrain model param param_pre = [(n, p) for n, p in param_optimizer if 'bert' in n] # middle model param param_middle = [(n, p) for n, p in param_optimizer if 'bilstm' in n or 'dym_weight' in n] # crf param param_crf = [p for n, p in param_optimizer if 'crf' in n] # weights that are not decayed no_decay = ['bias', 'LayerNorm', 'dym_weight', 'layer_norm'] # group the weights optimizer_grouped_parameters = [ # pretrain model param # with decay {'params': [p for n, p in param_pre if not any(nd in n for nd in no_decay)], 'weight_decay': params.weight_decay_rate, 'lr': params.fin_tuning_lr }, # without decay {'params': [p for n, p in param_pre if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': params.fin_tuning_lr }, # middle model # with decay {'params': [p for n, p in param_middle if not any(nd in n for nd in no_decay)], 'weight_decay': params.weight_decay_rate, 'lr': params.middle_lr }, # without decay {'params': [p for n, p in param_middle if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, 'lr': params.middle_lr }, # crf, with its own learning rate {'params': param_crf, 'weight_decay': 0.0, 'lr': params.crf_lr} ] num_train_optimization_steps = len(train_loader) // params.gradient_accumulation_steps * args.epoch_num optimizer = BertAdam(optimizer_grouped_parameters, warmup=params.warmup_prop, schedule="warmup_cosine", t_total=num_train_optimization_steps, max_grad_norm=params.clip_grad) # reload weights from restore_file if specified if restore_file is not None: restore_path = os.path.join(params.model_dir, args.restore_file + '.pth.tar') logging.info("Restoring parameters from {}".format(restore_path)) # load the checkpoint utils.load_checkpoint(restore_path, model, optimizer) # patience stage best_val_f1 = 0.0 patience_counter = 0 for epoch in range(1, args.epoch_num + 1): # Run one epoch logging.info("Epoch {}/{}".format(epoch, args.epoch_num)) # Train for one epoch on training set train(model, train_loader, optimizer, params) # Evaluate for one epoch on training set and validation set # train_metrics = evaluate(model, train_loader, params, mark='Train', # verbose=True) # Dict['loss', 'f1'] val_metrics = evaluate(args, model, val_loader, params, mark='Val', verbose=True) # Dict['loss', 'f1'] # validation-set f1-score val_f1 = val_metrics['f1'] # improvement in f1-score improve_f1 = val_f1 - best_val_f1 # Save weights of the network model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself optimizer_to_save = optimizer utils.save_checkpoint({'epoch': epoch + 1, 'state_dict': model_to_save.state_dict(), 'optim_dict': optimizer_to_save.state_dict()}, is_best=improve_f1 > 0, checkpoint=params.model_dir) params.save(params.params_path / 'params.json') # stop training based on params.patience if improve_f1 > 0: logging.info("- Found new best F1") best_val_f1 = val_f1 if improve_f1 < params.patience: patience_counter += 1 else: patience_counter = 0 else: patience_counter += 1 # Early stopping and logging best f1 if (patience_counter > params.patience_num and epoch > params.min_epoch_num) or epoch == args.epoch_num: logging.info("Best val f1: {:05.2f}".format(best_val_f1)) break
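# Self-contained sketch of the layered learning-rate grouping used above, with
# plain AdamW instead of BertAdam so it runs without pytorch_pretrained_bert;
# the module names and LR values below are placeholders, not this script's config.
import torch
from torch import nn

model = nn.ModuleDict({
    "bert": nn.Linear(8, 8),
    "bilstm": nn.LSTM(8, 8, batch_first=True),
    "crf": nn.Linear(8, 4),
})
no_decay = ("bias", "LayerNorm", "layer_norm")

def decay_split(prefix, lr, weight_decay=0.01):
    named = [(n, p) for n, p in model.named_parameters() if n.startswith(prefix)]
    return [
        {"params": [p for n, p in named if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay, "lr": lr},
        {"params": [p for n, p in named if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0, "lr": lr},
    ]

optimizer_grouped_parameters = (
    decay_split("bert", 3e-5)          # pretrained encoder: small LR
    + decay_split("bilstm", 1e-3)      # middle layers: larger LR
    + [{"params": [p for n, p in model.named_parameters() if n.startswith("crf")],
        "weight_decay": 0.0, "lr": 5e-3}]  # crf: its own LR, no decay
)
optimizer = torch.optim.AdamW(optimizer_grouped_parameters)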
def main(): print("IN NEW MAIN XD\n") parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--input_dir", default=None, type=str, required=True, help="The input data dir. Should contain .hdf5 files for the task.") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") parser.add_argument( "--bert_model", default="bert-large-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_predictions_per_seq", default=80, type=int, help="The maximum total of masked tokens in input sequence") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_steps", default=1000, type=float, help="Total number of training steps to perform.") parser.add_argument( "--warmup_proportion", default=0.01, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0.0, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) parser.add_argument('--log_freq', type=float, default=50.0, help='frequency of logging loss.') parser.add_argument('--checkpoint_activations', default=False, action='store_true', help="Whether to use gradient checkpointing") parser.add_argument("--resume_from_checkpoint", default=False, action='store_true', help="Whether to resume training from checkpoint.") parser.add_argument('--resume_step', type=int, default=-1, help="Step to resume training from.") parser.add_argument( '--num_steps_per_checkpoint', type=int, default=2000, help="Number of update steps until a model checkpoint is saved to disk." 
) args = parser.parse_args() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) assert (torch.cuda.is_available()) if args.local_rank == -1: device = torch.device("cuda") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) if args.train_batch_size % args.gradient_accumulation_steps != 0: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible" .format(args.gradient_accumulation_steps, args.train_batch_size)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if not args.resume_from_checkpoint and os.path.exists( args.output_dir) and (os.listdir(args.output_dir) and os.listdir( args.output_dir) != ['logfile.txt']): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not args.resume_from_checkpoint: os.makedirs(args.output_dir, exist_ok=True) # Prepare model config = BertConfig.from_json_file(args.config_file) model = BertForPreTraining(config) if not args.resume_from_checkpoint: global_step = 0 else: if args.resume_step == -1: model_names = [ f for f in os.listdir(args.output_dir) if f.endswith(".pt") ] args.resume_step = max([ int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names ]) global_step = args.resume_step checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu") model.load_state_dict(checkpoint['model'], strict=False) print("resume step from ", args.resume_step) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, #warmup=args.warmup_proportion, #t_total=args.max_steps, bias_correction=False, weight_decay=0.01, max_grad_norm=1.0) if args.loss_scale == 0: # optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic") else: # optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) scheduler = LinearWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=args.max_steps) if args.resume_from_checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False) if args.local_rank != -1: model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if 
os.path.isfile(os.path.join(args.input_dir, f)) ] files.sort() num_files = len(files) logger.info("***** Running training *****") # logger.info(" Num examples = %d", len(train_data)) logger.info(" Batch size = %d", args.train_batch_size) print(" LR = ", args.learning_rate) model.train() print("Training. . .") most_recent_ckpts_paths = [] print("Training. . .") tr_loss = 0.0 # total added training loss average_loss = 0.0 # averaged loss every args.log_freq steps epoch = 0 training_steps = 0 while True: if not args.resume_from_checkpoint: random.shuffle(files) f_start_id = 0 else: f_start_id = checkpoint['files'][0] files = checkpoint['files'][1:] args.resume_from_checkpoint = False for f_id in range(f_start_id, len(files)): data_file = files[f_id] logger.info("file no %s file %s" % (f_id, data_file)) train_data = pretraining_dataset( input_file=data_file, max_pred_length=args.max_predictions_per_seq) if args.local_rank == -1: train_sampler = RandomSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size * n_gpu, num_workers=4, pin_memory=True) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=4, pin_memory=True) for step, batch in enumerate( tqdm(train_dataloader, desc="File Iteration")): training_steps += 1 batch = [t.to(device) for t in batch] input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch #\ loss = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels, checkpoint_activations=args.checkpoint_activations) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: # optimizer.backward(loss) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss average_loss += loss.item() if training_steps % args.gradient_accumulation_steps == 0: if args.fp16: scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if training_steps == 1 * args.gradient_accumulation_steps: logger.info( "Step:{} Average Loss = {} Step Loss = {} LR {}". format(global_step, average_loss, loss.item(), optimizer.param_groups[0]['lr'])) if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0: logger.info( "Step:{} Average Loss = {} Step Loss = {} LR {}". 
format(global_step, average_loss / args.log_freq, loss.item(), optimizer.param_groups[0]['lr'])) average_loss = 0 if global_step >= args.max_steps or training_steps % ( args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0: if (not torch.distributed.is_initialized() or (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0)): # Save a trained model logger.info( "** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_save_file = os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step)) torch.save( { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'files': [f_id] + files }, output_save_file) most_recent_ckpts_paths.append(output_save_file) if len(most_recent_ckpts_paths) > 3: ckpt_to_be_removed = most_recent_ckpts_paths.pop(0) os.remove(ckpt_to_be_removed) if global_step >= args.max_steps: tr_loss = tr_loss * args.gradient_accumulation_steps / training_steps if (torch.distributed.is_initialized()): tr_loss /= torch.distributed.get_world_size() torch.distributed.all_reduce(tr_loss) logger.info("Total Steps:{} Final Loss = {}".format( training_steps, tr_loss.item())) return del train_dataloader del train_sampler del train_data #for obj in gc.get_objects(): # if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): # del obj torch.cuda.empty_cache() epoch += 1
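# Minimal sketch of the rolling checkpoint window in the pretraining loop above:
# write ckpt_<step>.pt, remember the path, and delete the oldest file once more
# than `keep` are on disk. The state-dict contents and paths are placeholders.
import os
import torch

def save_rolling_checkpoint(state, output_dir, global_step, recent_paths, keep=3):
    path = os.path.join(output_dir, "ckpt_{}.pt".format(global_step))
    torch.save(state, path)
    recent_paths.append(path)
    while len(recent_paths) > keep:
        os.remove(recent_paths.pop(0))   # drop the oldest checkpoint on disk
    return recent_paths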
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--init_checkpoint", default=None, type=str, required=True, help="The checkpoint file from pretraining") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--google_pretrained", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--max_steps", default=-1.0, type=float, help="Total number of training steps to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=1, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument('--fp16', default=False, action='store_true', help="Mixed precision training") parser.add_argument('--amp', default=False, action='store_true', help="Mixed precision training") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") parser.add_argument("--old", action='store_true', help="use old fp16 optimizer") parser.add_argument( '--vocab_file', type=str, default=None, required=True, help="Vocabulary mapping/file BERT was pretrainined on") parser.add_argument("--config_file", default=None, type=str, required=True, help="The BERT model config") args = parser.parse_args() args.fp16 = args.fp16 or args.amp if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". 
format(args.output_dir)) if not os.path.exists(args.output_dir) and is_main_process(): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() #tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512) # for bert large train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForSequenceClassification(config, num_labels=num_labels) print("USING CHECKPOINT from", args.init_checkpoint) model.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu')["model"], strict=False) print("USED CHECKPOINT from", args.init_checkpoint) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: print("using fp16") try: from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False) if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale="dynamic") else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", keep_batchnorm_fp32=False, loss_scale=args.loss_scale) scheduler = LinearWarmUpScheduler( optimizer, warmup=args.warmup_proportion, total_steps=num_train_optimization_steps) else: print("using fp32") optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: print("data prep") cached_train_features_file = args.data_dir + '_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.do_lower_case)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up for BERT which FusedAdam doesn't do scheduler.step() optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 preds = None out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = label_ids.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = np.argmax(preds, axis=1) eval_loss = eval_loss / nb_eval_steps loss = tr_loss / nb_tr_steps if args.do_train else None results = { 'eval_loss': eval_loss, 'global_step': global_step, 'loss': loss } result = compute_metrics(task_name, preds, out_label_ids) results.update(result) print(results) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(results.keys()): logger.info(" %s = %s", key, str(results[key])) writer.write("%s = %s\n" % (key, str(results[key])))
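# Stand-alone sketch of the gradient-accumulation pattern shared by these
# fine-tuning loops: scale each micro-batch loss by 1/accum_steps, let the
# gradients accumulate in .grad, and step the optimizer every accum_steps
# batches. The model and data below are toy placeholders.
import torch
from torch import nn

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fct = nn.CrossEntropyLoss()
accum_steps, global_step = 4, 0

batches = [(torch.randn(8, 4), torch.randint(0, 2, (8,))) for _ in range(8)]
for step, (x, y) in enumerate(batches):
    loss = loss_fct(model(x), y) / accum_steps   # average over the virtual batch
    loss.backward()                              # gradients add up across batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
        global_step += 1                         # one effective update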
def main(args): args.data_dir = os.path.join(args.data_dir, args.task_name) args.output_dir = os.path.join(args.output_dir, args.task_name) logger.info("args = %s", args) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, "emo": EmoProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", "emo": "classification" } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') # device = torch.device('cpu') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: logger.info("Output directory already exists and is not empty.") if not os.path.exists(args.output_dir): try: os.makedirs(args.output_dir) except: pass logger.info("catch a error") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) # tokenizer = BertTokenizer.from_pretrained(args.vocab_file, do_lower_case=args.do_lower_case) tokenizer = BertTokenizer.from_pretrained(args.bert_model) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) # use bert to aug train_examples ori_train_examples = processor.get_train_examples(args.data_dir) eval_examples = processor.get_dev_examples(args.data_dir) test_examples = processor.get_test_examples(args.data_dir) num_train_optimization_steps = int( len(ori_train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.use_saved == 1: bert_saved_dir = args.ckpt model = BertForNSPAug.from_pretrained(bert_saved_dir, cache_dir=args.ckpt_cache_dir, num_labels=num_labels, args=args) else: model = BertForNSPAug.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels, args=args) model.cuda() if n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 
'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 best_val_acc = 0.0 first_time = time.time() logger.info( "*********************************** Running training ***********************************" ) logger.info(" Num original examples = %d", len(ori_train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() aug_ratio = 0.0 # aug_ratio = 0.2 aug_seed = np.random.randint(0, 1000) for epoch in range(int(args.num_train_epochs)): logger.info("epoch=%d, aug_ratio = %f, aug_seed=%d", epoch, aug_ratio, aug_seed) train_examples = Aug_each_ckpt(ori_train_examples, label_list, model, tokenizer, args=args, num_show=args.num_show, output_mode=output_mode, seed=aug_seed, aug_ratio=aug_ratio, use_bert=False) if aug_ratio + args.aug_ratio_each < 1.0: aug_ratio += args.aug_ratio_each aug_seed += 1 train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, num_show=args.num_show, output_mode=output_mode, args=args) logger.info( "*********************************** Done convert features ***********************************" ) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=torch.float) token_real_label = torch.tensor( [f.token_real_label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, token_real_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) logger.info( "*********************************** begin training ***********************************" ) tr_loss, tr_seq_loss, tr_aug_loss, train_seq_accuracy, train_aug_accuracy = 0, 0, 0, 0, 0 nb_tr_examples, nb_tr_steps, nb_tr_tokens = 0, 0, 0 preds = [] all_labels = [] for step, batch in enumerate(train_dataloader): batch = tuple(t.cuda() for t in batch) input_ids, input_mask, segment_ids, label_ids, token_real_label = batch seq_logits, aug_logits, aug_loss = model( input_ids, segment_ids, input_mask, labels=None, token_real_label=token_real_label) if output_mode == "classification": # if task_name == "emo": # loss_fct = # else: loss_fct = CrossEntropyLoss() seq_loss = loss_fct(seq_logits.view(-1, num_labels), label_ids.view(-1)) # print("[classification]label_ids: {}, size: {}".format(label_ids.view(-1), label_ids.view(-1).size())) # print("[classification]seq_logits size: {}".format(seq_logits.view(-1, num_labels).size())) elif output_mode == "regression": loss_fct = MSELoss() seq_loss = loss_fct(seq_logits.view(-1), label_ids.view(-1)) 
token_real_label = token_real_label.detach().cpu().numpy() w = args.aug_loss_weight loss = (1 - w) * seq_loss + w * aug_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() total_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), 10000.0) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 batch_loss = seq_loss.mean().item() tr_seq_loss += seq_loss.mean().item() seq_logits = seq_logits.detach().cpu().numpy() label_ids = label_ids.detach().cpu().numpy() if len(preds) == 0: preds.append(seq_logits) all_labels.append(label_ids) else: preds[0] = np.append(preds[0], seq_logits, axis=0) all_labels[0] = np.append(all_labels[0], label_ids, axis=0) aug_logits = aug_logits.detach().cpu().numpy() tmp_train_aug_accuracy, tmp_tokens = accuracy(aug_logits, token_real_label, type="aug") train_aug_accuracy += tmp_train_aug_accuracy nb_tr_tokens += tmp_tokens tr_aug_loss += aug_loss.mean().item() if global_step % 20 == 0: loss = tr_loss / nb_tr_steps seq_loss = tr_seq_loss / nb_tr_steps aug_loss = tr_aug_loss / nb_tr_steps tmp_pred = preds[0] tmp_labels = all_labels[0] if output_mode == "classification": tmp_pred = np.argmax(tmp_pred, axis=1) elif output_mode == "regression": tmp_pred = np.squeeze(tmp_pred) res = accuracy(tmp_pred, tmp_labels, task_name=task_name) if nb_tr_tokens != 0: aug_avg = train_aug_accuracy / nb_tr_tokens else: aug_avg = 0.0 log_string = "" log_string += "epoch={:<5d}".format(epoch) log_string += " step={:<9d}".format(global_step) log_string += " total_loss={:<9.7f}".format(loss) log_string += " seq_loss={:<9.7f}".format(seq_loss) log_string += " aug_loss={:<9.7f}".format(aug_loss) log_string += " batch_loss={:<9.7f}".format(batch_loss) log_string += " lr={:<9.7f}".format(optimizer.get_lr()[0]) log_string += " |g|={:<9.7f}".format(total_norm) #log_string += " tr_seq_acc={:<9.7f}".format(seq_avg) log_string += " tr_aug_acc={:<9.7f}".format(aug_avg) log_string += " mins={:<9.2f}".format( float(time.time() - first_time) / 60) for key in sorted(res.keys()): log_string += " " + key + "= " + str(res[key]) logger.info(log_string) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 train_loss = tr_loss / nb_tr_steps logger.info( "*********************************** training epoch done ***********************************" ) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and epoch % 1 == 0: tot_time = float(time.time() - first_time) / 60 eval_loss, eval_seq_loss, eval_aug_loss, eval_res, eval_aug_accuracy, res_parts=\ do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, eval_examples, type="dev") eval_res["tot_time"] = tot_time if "acc" in eval_res: tmp_acc = eval_res["acc"] elif "mcc" in eval_res: tmp_acc = eval_res["mcc"] else: tmp_acc = eval_res["corr"] result = { 'eval_total_loss': eval_loss, 'eval_seq_loss': eval_seq_loss, 'eval_aug_loss': eval_aug_loss, 'eval_aug_accuracy': eval_aug_accuracy, 'global_step': global_step, 'train_loss': train_loss, 'train_batch_size': args.train_batch_size, 'args': args } if tmp_acc >= best_val_acc: best_val_acc = tmp_acc dev_test = "dev" result.update({'best_epoch': epoch}) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_dir = os.path.join(args.output_dir, "dev_" + str(tmp_acc)) if not 
os.path.exists(output_model_dir): os.makedirs(output_model_dir) output_model_file = os.path.join(output_model_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(output_model_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) result.update(eval_res) result.update(res_parts) # output_eval_file = os.path.join(args.output_dir, # dev_test + "_results_" + str(tmp_acc) + ".txt") # with open(output_eval_file, "w") as writer: logger.info( "****************************** eval results ***********************************" ) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # writer.write("%s = %s\n" % (key, str(result[key]))) else: result = { 'eval_total_loss': eval_loss, 'eval_seq_loss': eval_seq_loss, 'eval_aug_loss': eval_aug_loss, 'eval_aug_accuracy': eval_aug_accuracy, 'global_step': global_step, 'train_loss': train_loss, 'train_batch_size': args.train_batch_size, 'args': args } result.update(eval_res) result.update(res_parts) logger.info( "****************************** eval results ***********************************" ) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # write test results if args.do_test: # res_file = os.path.join(args.output_dir, # "test_" + str(tmp_acc)+".tsv") # idx, preds = do_test(args, label_list, task_name, processor, tokenizer, output_mode, model) # dataframe = pd.DataFrame({'index': range(idx), 'prediction': preds}) # dataframe.to_csv(res_file, index=False, sep='\t') # logger.info(" Num test length = %d", idx) logger.info( "*********************************** Running test ***********************************" ) logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) test_loss, test_seq_loss, test_aug_loss, test_res, test_aug_accuracy, res_parts=\ do_evaluate(args, processor, label_list, tokenizer, model, epoch, output_mode, num_labels, task_name, test_examples, type="test") result = { 'test_total_loss': test_loss, 'test_seq_loss': test_seq_loss, 'test_aug_loss': test_aug_loss, 'test_aug_accuracy': test_aug_accuracy, 'global_step': global_step, 'args': args } result.update(test_res) result.update(res_parts) logger.info( "****************************** test results ***********************************" ) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) logger.info( "*********************************** test done ***********************************" )
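# A minimal, self-contained sketch of the decay / no-decay parameter grouping
# used above, for a generic torch.nn.Module (illustrative, not this repo's
# helper). The substring test matters: full parameter names such as
# "encoder.layer.0.output.LayerNorm.bias" never equal "bias" exactly.
import torch.nn as nn

def group_parameters(model: nn.Module, weight_decay: float = 0.01):
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]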
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The train file path") parser.add_argument("--eval_file", default=None, type=str, required=True, help="The dev file path") parser.add_argument("--eval_train_file", default=None, type=str, required=True, help="The train eval file path") parser.add_argument("--predict_file", default=None, type=str, required=False, help="The predict file path") parser.add_argument("--top_n", default=5, type=float, required=True, help="higher than threshold is classify 1,") parser.add_argument("--bert_config_file", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument("--result_file", default=None, type=str, required=False, help="The result file that the BERT model was trained on.") parser.add_argument("--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written.") # Other parameters parser.add_argument("--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Whether to lower case the input text.") parser.add_argument("--max_seq_length", default=180, type=int, help="maximum total input sequence length after WordPiece tokenization.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--num_labels", default=1, type=int, help="mapping classify nums") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=6.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--reduce_dim", default=64, type=int, required=False, help="from hidden size to this dimensions, reduce dim") parser.add_argument("--gpu0_size", default=1, type=int, help="maximum total input sequence length after WordPiece tokenization.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumualte before") parser.add_argument('--optimize_on_cpu', default=False, action='store_true', help="Whether to perform optimization and averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=128, help='Loss scale, positive power of 2 can improve fp16 convergence.') args = parser.parse_args() data_processor = DataProcessor(args.num_labels) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info("16-bits training currently not supported in distributed training") args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) print(f'args.train_batch_size = {args.train_batch_size}') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not any([args.do_train, args.do_predict, args.do_eval]): raise ValueError("At least one of `do_train` or `do_eval` or `do_predict` must be True.") bert_config = BertConfig.from_json_file(args.bert_config_file) bert_config.reduce_dim = args.reduce_dim if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}" .format(args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format( args.output_dir)) if args.do_train: os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) def prepare_data(args, task_name='train'): if task_name == 'train': file_path = args.train_file elif task_name == 'eval': file_path = args.eval_file elif task_name == 'train_eval': file_path = args.eval_train_file if os.path.isdir(file_path): examples = data_processor.read_file_dir(file_path, top_n=args.top_n) else: examples, example_map_ids = data_processor.read_novel_examples(file_path, top_n=args.top_n, task_name=task_name) features = convert_examples_to_features(examples, args.max_seq_length, tokenizer) all_input_ids = torch.tensor([f.input_ids 
for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) all_example_ids = torch.tensor([f.example_id for f in features], dtype=torch.long) if task_name in ['train', 'eval', 'train_eval']: all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) datas = TensorDataset(all_example_ids, all_input_ids, all_input_mask, all_segment_ids, all_label_ids) else: datas = TensorDataset(all_example_ids, all_input_ids, all_input_mask, all_segment_ids) if task_name == 'train': if args.local_rank == -1: data_sampler = RandomSampler(datas) else: data_sampler = DistributedSampler(datas) dataloader = DataLoader(datas, sampler=data_sampler, batch_size=args.train_batch_size, drop_last=True) else: dataloader = DataLoader(datas, batch_size=args.eval_batch_size, drop_last=True) return (dataloader, example_map_ids) if task_name != 'train' else dataloader def accuracy(example_ids, logits, labels, probs=None, positive=False): if positive: # print(f'example_ids = {example_ids.shape}') # print(f'logits = {logits.shape}') # print(f'labels = {labels.shape}') # print(f'probs = {probs.shape}') logits = logits[labels > 0] example_ids = example_ids[labels > 0] probs = probs[labels > 0] labels = labels[labels > 0] if isinstance(logits, torch.Tensor): logits = logits.tolist() if isinstance(example_ids, torch.Tensor): example_ids = example_ids.tolist() if isinstance(labels, torch.Tensor): labels = labels.tolist() assert len(logits) == len(example_ids) == len(labels) classify_name = ['part_same', 'full_same'] if positive else ['dif', 'same'] text_a, text_b, novel_names, persons = [], [], [], [] for i in example_ids: example = example_map_ids[i] # labels.append(example.label) text_a.append("||".join(example.text_a)) text_b.append("||".join(example.text_b)) novel_names.append(example.name) persons.append(example.person) write_data = pd.DataFrame({ "text_a": text_a, "text_b": text_b, "labels": labels, "logits": logits, "novel_names": novel_names, "persons": persons }) write_data['yes_or_no'] = write_data['labels'] == write_data['logits'] if probs is not None: if isinstance(probs, torch.Tensor): probs = probs.tolist() write_data['logits'] = probs # write_data.to_csv(os.path.join(args.output_dir, f'{positive}.csv'), index=False) assert len(labels) == len(logits) try: result = classification_report(labels, logits, target_names=classify_name) except Exception: result = 'label is not equal to 3' print(f'\n{result}') return result def eval_model(model, eval_dataloader, device): model.eval() eval_loss = 0 all_first_logits, all_second_logits = [], [] all_example_ids = [] all_labels = [] all_first_probs, all_sencond_probs = [], [] for step, batch in enumerate(tqdm(eval_dataloader, desc="evaluating")): example_ids, input_ids, input_mask, segment_ids, label_ids = batch if not args.do_train and not args.do_eval: label_ids = None with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, labels=label_ids) first_logits, second_logits = logits first_prob, first_logits = torch.max(logits[0], dim=1) second_prob, second_logits = torch.max(logits[1], dim=1) all_labels.append(label_ids) all_first_probs.append(first_prob) all_sencond_probs.append(second_prob) all_first_logits.append(first_logits) all_second_logits.append(second_logits) all_example_ids.append(example_ids) eval_loss += tmp_eval_loss.mean().item() all_first_logits = torch.cat(all_first_logits, 
dim=0) all_second_logits = torch.cat(all_second_logits, dim=0) all_first_probs = torch.cat(all_first_probs, dim=0) all_sencond_probs = torch.cat(all_sencond_probs, dim=0) all_labels = torch.cat(all_labels, dim=0) all_first_labels, all_second_labels = [ label.view(-1) for label in torch.chunk(all_labels, dim=1, chunks=2) ] all_example_ids = torch.cat(all_example_ids, dim=0) accuracy(all_example_ids, all_first_logits, labels=all_first_labels, probs=all_first_probs, positive=False) accuracy(all_example_ids, all_second_logits, labels=all_second_labels, probs=all_sencond_probs, positive=True) eval_loss /= (step + 1) return eval_loss train_dataloader = None num_train_steps = None if args.do_train: train_dataloader = prepare_data(args, task_name='train') num_train_steps = int( len(train_dataloader) / args.gradient_accumulation_steps * args.num_train_epochs) model = ThreeCategoriesClassifier2(bert_config, num_labels=data_processor.num_labels) new_state_dict = model.state_dict() init_state_dict = torch.load(os.path.join(args.bert_model, 'pytorch_model.bin')) for k, v in init_state_dict.items(): if k in new_state_dict: print(f'k in = {k} v in shape = {v.shape}') new_state_dict[k] = v model.load_state_dict(new_state_dict) if args.fp16: model.half() if args.do_predict or args.do_eval: model_path = os.path.join(args.output_dir, WEIGHTS_NAME) new_state_dict = torch.load(model_path) new_state_dict = dict([ (k[7:], v) if k.startswith('module') else (k, v) for k, v in new_state_dict.items() ]) model.load_state_dict(new_state_dict) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: if args.gpu0_size > 0: model = BalancedDataParallel(args.gpu0_size, model, dim=0).to(device) else: model = torch.nn.DataParallel(model) if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] eval_dataloader, example_map_ids = prepare_data(args, task_name='eval') if args.do_train: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) # eval_loss = eval_model(model, eval_dataloader, device) # logger.info(f'initial dev loss: {eval_loss}') for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() torch.cuda.empty_cache() model_save_path = os.path.join(args.output_dir, f"{WEIGHTS_NAME}.{epoch}") tr_loss = 0 train_batch_count = 0 for step, batch in enumerate(tqdm(train_dataloader, desc="training")): _, input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, labels=label_ids) if n_gpu > 1: loss = loss.mean() if args.fp16 and args.loss_scale != 1.0: loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() model.zero_grad()
train_batch_count += 1 tr_loss /= train_batch_count eval_loss = eval_model(model, eval_dataloader, device) logger.info( f'train loss: {tr_loss}, dev loss: {eval_loss}, epoch: {epoch + 1}/{int(args.num_train_epochs)}' ) model_to_save = model.module if hasattr(model, 'module') else model torch.save(model_to_save.state_dict(), model_save_path) if epoch == 0: model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) elif args.do_eval: eval_model(model, eval_dataloader, device) if args.do_predict: eval_model(model, eval_dataloader, device)
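# Sketch of the "module." prefix handling this script relies on when a
# checkpoint was saved from a DataParallel/BalancedDataParallel-wrapped model
# (the loading code above strips the prefix with k[7:]); the helper name is
# illustrative.
import torch

def strip_module_prefix(state_dict):
    """Remove the 'module.' prefix DataParallel adds to parameter names so the
    weights load into an unwrapped model."""
    return {(k[len('module.'):] if k.startswith('module.') else k): v
            for k, v in state_dict.items()}

# hypothetical usage:
# state_dict = strip_module_prefix(torch.load(model_path, map_location='cpu'))
# model.load_state_dict(state_dict)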
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "commonsenseqa": CommonsenseQaProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, "commonsenseqa":4, } # if args.local_rank == -1 or args.no_cuda: # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") # n_gpu = torch.cuda.device_count() # else: # torch.cuda.set_device(args.local_rank) # device = torch.device("cuda", args.local_rank) # n_gpu = 1 # # Initializes the distributed backend which will take care of sychronizing nodes/GPUs # torch.distributed.init_process_group(backend='nccl') device = "cuda:2" n_gpu = 1 logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) print("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) print("current task is " + str(task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model # model = BertForSequenceClassification.from_pretrained(args.bert_model, # cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format( # args.local_rank), # num_labels=num_labels) model = BertForMultipleChoice.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'. 
format(args.local_rank), num_choices=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 nb_tr_steps = 0 tr_loss = 0 best_eval_accuracy = 0.0 if args.do_train: train_features = convert_examples_to_features_mc( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, logits = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_accuracy = eval_accuracy / nb_eval_examples print("the current eval accuracy is: " + str(eval_accuracy)) if eval_accuracy > best_eval_accuracy: best_eval_accuracy = eval_accuracy if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) model.train() # # Save a trained model # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") # if args.do_train: # torch.save(model_to_save.state_dict(), output_model_file) # Load a trained model that you have fine-tuned model_state_dict = torch.load(output_model_file) model = BertForMultipleChoice.from_pretrained(args.bert_model, state_dict=model_state_dict, num_choices=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], 
dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 all_pred_labels = [] all_anno_labels = [] all_logits = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) # logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() output_labels = np.argmax(logits, axis=1) all_pred_labels.extend(output_labels.tolist()) all_logits.extend(list(logits)) all_anno_labels.extend(list(label_ids)) tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'best_eval_accuracy': best_eval_accuracy, 'global_step': global_step, 'loss': loss} output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) for i in range(len(all_pred_labels)): writer.write(str(i) + "\t" + str(all_anno_labels[i]) + "\t" + str(all_pred_labels[i]) + "\t" + str(all_logits[i]) + "\n")
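# The accuracy() helper is not shown in this snippet; the division by
# nb_eval_examples above implies it returns a raw count of correct argmax
# predictions per batch, as in the classic run_classifier scripts. A minimal
# sketch under that assumption:
import numpy as np

def accuracy(out, labels):
    """Count how many argmax predictions match the labels for one batch."""
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)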
def main(): # args = parse_arguments() # del args.local_rank # print(args) # args_to_yaml(args, 'config_finetune_train_glue_mrpc.yaml') # exit(0) config_yaml, local_rank = parse_my_arguments() args = args_from_yaml(config_yaml) args.local_rank = local_rank """ Experiment Setup """ if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) """ Prepare Model """ # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get( 'model', state_dict ) # in a full checkpoint weights are saved in state_dict['model'] model.load_state_dict(state_dict, strict=False) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) plain_model = getattr(model, 'module', model) with open(args.sparsity_config, 'r') as f: raw_dict = yaml.load(f, Loader=yaml.SafeLoader) masks = dict.fromkeys(raw_dict['prune_ratios'].keys()) for param_name in list(masks.keys()): if get_parameter_by_name(plain_model, param_name) is None: print(f'[WARNING] Cannot find {param_name}') del masks[param_name] for param_name in masks: param = get_parameter_by_name(plain_model, param_name) non_zero_mask = torch.ne(param, 0).to(param.dtype) masks[param_name] = non_zero_mask """ Prepare Optimizer""" # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.fp16_utils.fp16_optimizer import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: """ Prepare Dataset """ train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) """ Training Loop """ model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 plain_model = getattr(model, 'module', model) for param_name, mask in masks.items(): get_parameter_by_name(plain_model, param_name).data *= mask """ Load Model for Evaluation """ if args.do_train: # Save a trained model and the associated configuration output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if is_main_process( ): # only the main process should save the trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get('model', state_dict) model.load_state_dict(state_dict, strict=False) model.to(device) """ Run Evaluation """ if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += 
tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
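# Minimal sketch of the mask-reapplication idea used above: record which
# weights are zero in the pruned checkpoint, then zero them again after every
# optimizer step so fine-tuning preserves the sparsity pattern. Generic
# nn.Module assumed; get_parameter_by_name is this script's own helper and is
# not reused here.
import torch

def build_masks(model, param_names):
    params = dict(model.named_parameters())
    return {name: torch.ne(params[name], 0).to(params[name].dtype)
            for name in param_names if name in params}

def reapply_masks(model, masks):
    params = dict(model.named_parameters())
    with torch.no_grad():
        for name, mask in masks.items():
            params[name].mul_(mask)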
def main(): parser = argparse.ArgumentParser() parser.add_argument("--train_file", default=None, type=str, required=True, help="The train file path") parser.add_argument("--eval_file", default=None, type=str, required=True, help="The dev file path") parser.add_argument("--predict_file", default=None, type=str, required=False, help="The predict file path") parser.add_argument("--predict_result_file", default='datas/result.csv', type=str, required=False, help="The predict result file path") parser.add_argument( "--bert_model", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. \n" "This specifies the model architecture.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=False, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument( "--max_seq_length", default=250, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--load_checkpoint", default=False, action='store_true', help="Whether to run load checkpoint.") parser.add_argument("--num_labels", default=1, type=int, help="mapping classify nums") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--epoches", default=6, type=int, help="Total epoch numbers for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=6.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() vocab_path = os.path.join(args.bert_model, VOCAB_NAME) # bert_config = BertConfig.from_json_file(vocab_path) data_processor = DataProcessor() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) # if args.do_train: # if os.path.exists(args.output_dir) and os.listdir(args.output_dir): # raise ValueError("Output directory ({}) already exists and is not empty.".format( # args.output_dir)) # else: # os.makedirs(args.output_dir, exist_ok=True) tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=args.do_lower_case) model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=3) for k, v in model.state_dict().items(): print(f'k = {k}, v.grad = {v.grad}') model.to(device) # model = torch.nn.DataParallel(model) def evaluating(model, eval_dataloader): model.eval() eval_loss = 0 logits, labels = [], [] for step, batch in enumerate(eval_dataloader): input_ids, input_mask, segment_ids, label_ids = [ b.to(device) for b in batch ] with torch.no_grad(): loss, logit = model(input_ids, segment_ids, input_mask, label_ids) loss = loss.mean() eval_loss = loss * args.gradient_accumulation_steps if step == 0 else eval_loss + loss * args.gradient_accumulation_steps logit = torch.argmax(logit, dim=-1) logits.extend(logit.tolist()) labels.extend(label_ids.tolist()) return (eval_loss.item() / (step + 1), logits, labels) def predicting(model, dataloader): model.eval() logits, example_ids = [], [] for step, batch in enumerate(dataloader): if step % 100 == 0: print(f'prediction progress: {step}/{len(dataloader)}') input_ids, input_mask, segment_ids, label_ids = [ b.to(device) for b in batch ] with torch.no_grad(): logit = model(input_ids, segment_ids, input_mask) logit = torch.argmax(logit, dim=-1) logits.extend(logit.tolist()) example_ids.extend(label_ids.tolist()) return logits, example_ids def eval_meric(model, data_loader): eval_loss, all_logits, all_labels = evaluating(model, data_loader) accuracy(all_labels, all_logits) logger.info(f'Average eval loss = {eval_loss}') return eval_loss def write_predict_file(model, data_loader, file_path): """ Write the prediction file; format follows '五彩滨云-final.csv'. """ logits, ids = predicting(model, data_loader) assert len(ids) == len(logits) logger.info( f'zero nums {logits.count(0)}, one nums {logits.count(1)}, two nums {logits.count(2)}' ) labels = [ data_processor.eval_dict[id][1] for id, logit in zip(ids, logits) ] # if not args.do_eval: # logits = [i - 1 for i in logits] # data_df = pd.DataFrame({'id': ids, 'y': logits}) # else: assert len(labels) == len(logits) # accuracy(labels, logits) passages = [ data_processor.eval_dict[id][0] for id, logit in zip(ids, logits) ] autors = [ data_processor.eval_dict[id][2] for id, logit in zip(ids, logits) ] like_counts = [ data_processor.eval_dict[id][3] for id, logit in zip(ids, logits) ] times = [ data_processor.eval_dict[id][4] for id, logit in zip(ids, logits) ] assert len(labels) == len(passages) match_array = np.array((logits)) == np.array(labels) match_list = match_array.tolist() data_df = pd.DataFrame({ 'id': ids, 'pred': logits, 'time': times, 'match': '', 'autors': autors, 'like_counts': like_counts, 'passage': passages }) data_df.to_csv(file_path, index=None) eval_examples = data_processor.get_examples(args.eval_file, data_type='eval') eval_features = convert_examples_to_features(args, eval_examples, args.max_seq_length,
tokenizer) eval_loader = ParaDataloader(eval_features) eval_loader = DataLoader(eval_loader, shuffle=False, batch_size=args.eval_batch_size) if args.do_train: # read the training data train_examples = data_processor.get_examples(args.train_file, data_type='train') # convert examples to features train_features = convert_examples_to_features(args, train_examples, args.max_seq_length, tokenizer) num_train_steps = int( len(train_features) // args.train_batch_size // args.gradient_accumulation_steps * args.num_train_epochs) # build the training loader train_loader = ParaDataloader(train_features) # input format expected by the data-parallel loader train_loader = DataLoader(train_loader, shuffle=True, batch_size=args.train_batch_size) model.zero_grad() no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) tr_loss = None min_eval_loss = float('inf') for epoch in range(args.epoches): model.train() for step, batch in enumerate(train_loader): input_ids, input_mask, segment_ids, label_ids = [ b.to(device) for b in batch ] loss, _ = model(input_ids, segment_ids, input_mask, label_ids) loss = loss.mean() print(f'loss = {loss}') if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss = loss * args.gradient_accumulation_steps if step == 0 else tr_loss + loss * args.gradient_accumulation_steps optimizer.step() optimizer.zero_grad() if step % 1000 == 1: eval_loss = eval_meric(model, eval_loader) if eval_loss < min_eval_loss: min_eval_loss = eval_loss save_checkpoint(model, epoch, args.output_dir) if args.do_predict: if args.load_checkpoint: state_dict = torch.load('output/pytorch_model-0004.bin') model.load_state_dict(state_dict) logger.info(f'Start to predict......') if args.do_eval: predict_examples = data_processor.get_eval_examples(args.eval_file) else: predict_examples = data_processor.get_predict_examples( args.predict_file) predict_features = convert_examples_to_features( args, predict_examples, args.max_seq_length, tokenizer) predict_loader = ParaDataloader(predict_features) predict_loader = DataLoader(predict_loader, shuffle=False, batch_size=args.eval_batch_size) write_predict_file(model, predict_loader, args.predict_result_file)
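# save_checkpoint() is called above but not defined in this snippet. A
# plausible minimal implementation, matching the 'pytorch_model-0004.bin'
# naming that the predict path loads, might look like the following
# (an assumption, not the repo's actual helper):
import os
import torch

def save_checkpoint(model, epoch, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    model_to_save = model.module if hasattr(model, 'module') else model
    path = os.path.join(output_dir, f'pytorch_model-{epoch:04d}.bin')
    torch.save(model_to_save.state_dict(), path)
    return path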