def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--bert_model_path", default="", type=str, required=False,
                        help="Bert pretrained saved pytorch model path.")
    parser.add_argument("--experiment", default="attention", type=str, required=False,
                        help="4 types: attention, base, long, ablation. "
                             "base: original bert. "
                             "long: uses an lstm to keep track of all bert hidden representations, but backprops over only the first. "
                             "attention: uses an lstm + attention mechanism to backprop over more than the first representation. "
                             "ablation: concat all the hidden representations.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--seq_segments", default=8, type=int,
                        help="The number of sequence steps")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_shuffle", action='store_true',
                        help="Whether to shuffle the first 256 input tokens across examples in each training batch.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--super_debug", action='store_true',
                        help="hack for debugging.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=32, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    tokenizer = 
BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) cls_token = tokenizer.convert_tokens_to_ids(["[CLS]"]) sep_token = tokenizer.convert_tokens_to_ids(["[SEP]"]) '''if args.super_debug: cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name), str(args.seq_segments))) logger.info("Loading test dataset") eval_data = load_dataset(cached_eval_features_file, args, processor, tokenizer, output_mode, train = False) exit()''' #model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels = num_labels) #model = MyBertForMultiLabelSequenceClassification.from_pretrained(args.bert_model, num_labels = num_labels) model = get_model(args, num_labels) if args.bert_model_path != "": print("Loading model from: " + args.bert_model_path) if args.do_train: pretrained_dict = torch.load(args.bert_model_path) model_dict = model.state_dict() # 1. filter out unnecessary keys pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} '''if 'classifier.weight' in pretrained_dict and pretrained_dict['classifier.weight'].shape[0] != num_labels: del pretrained_dict['classifier.weight'] del pretrained_dict['classifier.bias'] if 'classifier2.weight' in pretrained_dict and pretrained_dict['classifier2.weight'].shape[0] != num_labels: del pretrained_dict['classifier2.weight'] del pretrained_dict['classifier2.bias']''' # 2. overwrite entries in the existing state dict model_dict.update(pretrained_dict) # 3. load the new state dict model.load_state_dict(model_dict) else: model.load_state_dict(torch.load(args.bert_model_path)) sig = Sigmoid() if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 loss_fct = CrossEntropyLoss() if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name), str(args.seq_segments))) # Prepare data loader logger.info("Loading training dataset") train_data = load_dataset(cached_train_features_file, args, processor, tokenizer, output_mode) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = (len(train_dataloader)) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = 
FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") #logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for i in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, t_batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): input_ids, input_mask, segment_ids, label_ids = t_batch if args.do_shuffle: shuffled_index = torch.randperm(input_ids.shape[0]) shuffled_ids = input_ids[shuffled_index][:,:256] shuffled_mask = input_mask[shuffled_index][:,:256] shuffled_seg = segment_ids[shuffled_index][:,:256] input_ids[:,:256] = shuffled_ids input_mask[:,:256] = shuffled_mask segment_ids[:,:256] = shuffled_seg input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) logits = model(input_ids, segment_ids, input_mask) loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: acc = np.sum(np.argmax(logits.cpu().detach().numpy(), axis=1) == label_ids.cpu().numpy()) / label_ids.shape[0] tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) tb_writer.add_scalar('acc', acc, global_step) ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() ### Example: output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned #model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels) 
tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) open(os.path.join(args.output_dir, 'experiment_{}.txt'.format(args.experiment)), 'a').close() else: model = get_model(args, num_labels) model.load_state_dict(torch.load(output_model_file)) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) ### Evaluation if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name), str(args.seq_segments))) logger.info("Loading test dataset") eval_data = load_dataset(cached_eval_features_file, args, processor, tokenizer, output_mode, train = False) #import pdb; pdb.set_trace() logger.info("***** Running evaluation *****") #logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) # Run prediction for full data if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if output_mode == "multi_classification": logits = sig(logits) if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) elif output_mode == "multi_classification": preds = preds > .5 result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss/global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results_final.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
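# Illustrative sketch (not part of the original scripts): the fp16 branches above
# rescale the learning rate by hand with
# `warmup_linear.get_lr(global_step, args.warmup_proportion)`. Assuming the usual
# pytorch-pretrained-bert WarmupLinearSchedule behaviour (linear warmup to the peak
# LR, then linear decay to zero over `t_total` steps), the multiplier looks roughly
# like the standalone helper below. `_linear_warmup_factor` is a hypothetical name
# used only for this sketch, not a function from this repository.
def _linear_warmup_factor(global_step, t_total, warmup=0.1):
    """Approximate LR multiplier at `global_step` out of `t_total` optimization steps."""
    progress = global_step / max(1, t_total)
    if progress < warmup:
        return progress / warmup                         # linear warmup phase
    return max(0.0, (1.0 - progress) / (1.0 - warmup))   # linear decay phase
# Usage (sketch): lr_this_step = args.learning_rate * _linear_warmup_factor(
#     global_step, num_train_optimization_steps, args.warmup_proportion)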
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument("--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.") parser.add_argument("--doc_stride", default=128, type=int, help="When splitting up a long document into chunks, how much stride to take between chunks.") parser.add_argument("--max_query_length", default=64, type=int, help="The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " "of training.") parser.add_argument("--n_best_size", default=20, type=int, help="The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument("--max_answer_length", default=30, type=int, help="The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument("--verbose_logging", action='store_true', help="If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_lower_case", action='store_true', help="Whether to lower case the input text. 
True for uncased models, False for cased models.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--version_2_with_negative', action='store_true', help='If true, the SQuAD examples contain some that do not have an answer.') parser.add_argument('--null_score_diff_threshold', type=float, default=0.0, help="If null_score - best_non_null is greater than the threshold predict null.") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = 
BertForQuestionAnswering.from_pretrained(args.bert_model) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # if args.local_rank != -1: # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, 
t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: model = BertForQuestionAnswering.from_pretrained(args.bert_model) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) 
all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append(RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
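# Illustrative sketch (not part of the original scripts): both training loops above
# follow the same gradient-accumulation pattern -- scale the loss down by
# `gradient_accumulation_steps` so the summed gradients match a single full-batch
# update, and only call optimizer.step() / zero_grad() once per accumulation window.
# `model`, `optimizer` and `dataloader` here are generic placeholders, not objects
# defined in this repository.
def _train_with_accumulation(model, optimizer, dataloader, accum_steps=2):
    model.train()
    for step, batch in enumerate(dataloader):
        loss = model(*batch)                   # assumed to return a scalar loss
        (loss / accum_steps).backward()        # gradients accumulate in .grad
        if (step + 1) % accum_steps == 0:
            optimizer.step()                   # one parameter update per window
            optimizer.zero_grad()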
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True) parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument( "--predict_dir", default=None, type=str, required=True, help="The output directory where the predictions will be written.") # Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument("--test_file", default=None, type=str) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=2.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--view_id', type=int, default=1, help="view id of multi-view co-training(two-view)") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--save_all', default=False, action='store_true') # Base setting parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--max_ctx', type=int, default=2) parser.add_argument('--task_name', type=str, default='race') parser.add_argument('--bert_name', type=str, default='pool-race') parser.add_argument('--reader_name', type=str, default='race') parser.add_argument('--per_eval_step', type=int, default=10000000) # model parameters parser.add_argument('--evidence_lambda', type=float, default=0.8) # Parameters for running labeling model parser.add_argument('--do_label', default=False, action='store_true') parser.add_argument('--sentence_id_file', nargs='*') parser.add_argument('--weight_threshold', type=float, default=0.0) parser.add_argument('--only_correct', default=False, action='store_true') parser.add_argument('--label_threshold', type=float, default=0.0) parser.add_argument('--multi_evidence', default=False, action='store_true') parser.add_argument('--metric', default='accuracy', type=str) parser.add_argument('--num_evidence', default=1, type=int) parser.add_argument('--power_length', default=1., type=float) parser.add_argument('--num_choices', default=4, type=int) parser.add_argument('--split_type', default=0, type=int) parser.add_argument('--use_gumbel', default=False, action='store_true') parser.add_argument('--sample_steps', type=int, default=10) parser.add_argument('--reward_func', type=int, default=0) parser.add_argument('--freeze_bert', default=False, action='store_true') args = parser.parse_args() logger = setting_logger(args.output_dir) logger.info('================== Program start. 
========================') logger.info( f'================== Running with seed {args.seed} ==========================' ) model_params = prepare_model_params(args) read_params = prepare_read_params(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict and not args.do_label: raise ValueError( "At least one of `do_train` or `do_predict` or `do_label` must be True." ) if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if args.do_train: if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) if args.do_predict or args.do_label: os.makedirs(args.predict_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.vocab_file) data_reader = initialize_reader(args.reader_name) num_train_steps = None if args.do_train or args.do_label: train_examples = data_reader.read(input_file=args.train_file, **read_params) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), str(args.max_ctx), str(args.task_name)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except FileNotFoundError: train_features = data_reader.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.pretrain is not None: logger.info('Load pretrained model from {}'.format(args.pretrain)) model_state_dict = torch.load(args.pretrain, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) else: model = initialize_model(args.bert_name, args.model_file, **model_params) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed 
and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # Remove frozen parameters param_optimizer = [n for n in param_optimizer if n[1].requires_grad] # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if num_train_steps is not None else -1 if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=t_total) logger.info( f"warm up linear: warmup = {warmup_linear.warmup}, t_total = {warmup_linear.t_total}." ) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) # Prepare data eval_examples = data_reader.read(input_file=args.predict_file, **read_params) eval_features = data_reader.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length) eval_tensors = data_reader.data_to_tensors(eval_features) eval_data = TensorDataset(*eval_tensors) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) if args.do_train: if args.do_label: logger.info('Training in State Wise.') sentence_label_file = args.sentence_id_file if sentence_label_file is not None: for file in sentence_label_file: train_features = data_reader.generate_features_sentence_ids( train_features, file) else: logger.info('No sentence id supervision is found.') else: logger.info('Training in traditional way.') logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Num train total optimization steps = %d", t_total) logger.info(" Batch size = %d", args.predict_batch_size) train_loss = AverageMeter() best_acc = 0.0 best_loss = 1000000 summary_writer = SummaryWriter(log_dir=args.output_dir) global_step = 0 eval_loss = AverageMeter() eval_accuracy = CategoricalAccuracy() eval_epoch = 0 train_tensors = data_reader.data_to_tensors(train_features) train_data = TensorDataset(*train_tensors) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for epoch in range(int(args.num_train_epochs)): logger.info(f'Running at Epoch {epoch}') # Train for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", 
dynamic_ncols=True)): model.train() if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, train_features, model_state=ModelState.Train) model_output = model(**inputs) loss = model_output['loss'] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically if args.fp16: lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step summary_writer.add_scalar('lr', lr_this_step, global_step) else: summary_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) optimizer.step() optimizer.zero_grad() global_step += 1 train_loss.update(loss.item(), 1) summary_writer.add_scalar('train_loss', train_loss.avg, global_step) # logger.info(f'Train loss: {train_loss.avg}') if (step + 1) % args.per_eval_step == 0 or step == len( train_dataloader) - 1: # Evaluation model.eval() logger.info("Start evaluating") for _, eval_batch in enumerate( tqdm(eval_dataloader, desc="Evaluating", dynamic_ncols=True)): if n_gpu == 1: eval_batch = batch_to_device( eval_batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( eval_batch, eval_features, model_state=ModelState.Evaluate) with torch.no_grad(): output_dict = model(**inputs) loss, choice_logits = output_dict[ 'loss'], output_dict['choice_logits'] eval_loss.update(loss.item(), 1) eval_accuracy(choice_logits, inputs["labels"]) eval_epoch_loss = eval_loss.avg summary_writer.add_scalar('eval_loss', eval_epoch_loss, eval_epoch) eval_loss.reset() current_acc = eval_accuracy.get_metric(reset=True) summary_writer.add_scalar('eval_acc', current_acc, eval_epoch) torch.cuda.empty_cache() if args.save_all: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, f"pytorch_model_{eval_epoch}.bin") torch.save(model_to_save.state_dict(), output_model_file) if current_acc > best_acc: best_acc = current_acc model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) if eval_epoch_loss < best_loss: best_loss = eval_epoch_loss model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "pytorch_loss_model.bin") torch.save(model_to_save.state_dict(), output_model_file) logger.info( 'Eval Epoch: %d, Accuracy: %.4f (Best Accuracy: %.4f)' % (eval_epoch, current_acc, best_acc)) eval_epoch += 1 logger.info( f'Epoch {epoch}: Accuracy: {best_acc}, Train Loss: {train_loss.avg}' ) summary_writer.close() for output_model_name in ["pytorch_model.bin", "pytorch_loss_model.bin"]: # Loading trained model output_model_file = os.path.join(args.output_dir, output_model_name) model_state_dict = torch.load(output_model_file, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) model.to(device) # Write Yes/No predictions if args.do_predict and 
(args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = data_reader.read(args.test_file) test_features = data_reader.convert_examples_to_features( test_examples, tokenizer, args.max_seq_length) test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] test_acc = CategoricalAccuracy() logger.info("Start predicting yes/no on Dev set.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, test_features, model_state=ModelState.Evaluate) with torch.no_grad(): batch_choice_logits = model(**inputs)['choice_logits'] test_acc(batch_choice_logits, inputs['labels']) example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu( ).tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) if "loss" in output_model_name: logger.info( 'Predicting question choice on test set using model with lowest loss on validation set.' ) output_prediction_file = os.path.join(args.predict_dir, 'loss_predictions.json') else: logger.info( 'Predicting question choice on test set using model with best accuracy on validation set,' ) output_prediction_file = os.path.join(args.predict_dir, 'predictions.json') data_reader.write_predictions(test_examples, test_features, all_results, output_prediction_file) logger.info( f"Accuracy on Test set: {test_acc.get_metric(reset=True)}") # Loading trained model. if args.metric == 'accuracy': logger.info("Load model with best accuracy on validation set.") output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") elif args.metric == 'loss': logger.info("Load model with lowest loss on validation set.") output_model_file = os.path.join(args.output_dir, "pytorch_loss_model.bin") else: raise RuntimeError( f"Wrong metric type for {args.metric}, which must be in ['accuracy', 'loss']." ) model_state_dict = torch.load(output_model_file, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) model.to(device) # Labeling sentence id. 
if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0): f = open('debug_log.txt', 'w') def softmax(x): """Compute softmax values for each sets of scores in x.""" e_x = np.exp(x - np.max(x)) return e_x / e_x.sum() def topk(sentence_sim): """ :param sentence_sim: numpy :return: """ max_length = min(args.num_evidence, len(sentence_sim)) sorted_scores = np.array(sorted(sentence_sim, reverse=True)) scores = [] for idx in range(max_length): scores.append(np.log(softmax(sorted_scores[idx:])[0])) scores = [np.mean(scores[:(j + 1)]) for j in range(max_length)] top_k = int(np.argmax(scores) + 1) sorted_scores = sorted(enumerate(sentence_sim), key=lambda x: x[1], reverse=True) evidence_ids = [x[0] for x in sorted_scores[:top_k]] sentence = { 'sentences': evidence_ids, 'value': float(np.exp(scores[top_k - 1])) } return sentence def batch_topk(sentence_sim, sentence_mask): batch_size = sentence_sim.size(0) num_choices = sentence_sim.size(1) sentence_sim = sentence_sim.numpy() + 1e-15 sentence_mask = sentence_mask.numpy() sentence_ids = [] for b in range(batch_size): choice_sentence_ids = [ topk(_sim[:int(sum(_mask))]) for _sim, _mask in zip(sentence_sim[b], sentence_mask[b]) ] assert len(choice_sentence_ids) == num_choices sentence_ids.append(choice_sentence_ids) return sentence_ids test_examples = train_examples test_features = train_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running labeling *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start labeling.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device(batch, device) inputs = data_reader.generate_inputs(batch, test_features, model_state=ModelState.Test) with torch.no_grad(): output_dict = model(**inputs) batch_choice_logits, batch_sentence_logits = output_dict[ "choice_logits"], output_dict["sentence_logits"] batch_sentence_mask = output_dict["sentence_mask"] example_indices = batch[-1] # batch_beam_results = batch_choice_beam_search(batch_sentence_logits, batch_sentence_mask) batch_topk_results = batch_topk(batch_sentence_logits, batch_sentence_mask) for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu() evidence_list = batch_topk_results[i] test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( RawOutput(unique_id=unique_id, model_output={ "choice_logits": choice_logits, "evidence_list": evidence_list })) output_prediction_file = os.path.join(args.predict_dir, 'sentence_id_file.json') data_reader.predict_sentence_ids( test_examples, test_features, all_results, output_prediction_file, weight_threshold=args.weight_threshold, only_correct=args.only_correct, label_threshold=args.label_threshold)
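# Illustrative sketch (AverageMeter is used in the training loop above but not
# defined in this file): a minimal implementation that matches the observed usage
# `update(value, n)`, `.avg` and `.reset()`. The repository's real helper may
# differ; this is only an assumption to make the loop self-contained.
class AverageMeter:
    """Tracks a running average of a scalar (e.g. the training loss)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / max(1, self.count)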
def BertSquad(file="", mode='predict', bert_model="bert-base-uncased", output='./output'): parser = {} parser["bert_model"] = bert_model parser["output_dir"] = output parser["train_file"] = file parser["predict_file"] = file parser["max_seq_length"] = 384 parser["doc_stride"] = 128 parser["max_query_length"] = 64 parser["do_train"] = mode == 'train' parser["do_predict"] = mode == 'predict' parser["train_batch_size"] = 32 parser["predict_batch_size"] = 8 parser["learning_rate"] = 5e-5 parser["num_train_epochs"] = 3.0 parser["warmup_proportion"] = 0.1 parser["n_best_size"] = 20 parser["max_answer_length"] = 30 parser["verbose_logging"] = False parser["no_cuda"] = False parser['seed'] = 42 parser['gradient_accumulation_steps'] = 1 parser["do_lower_case"] = ('uncased' in bert_model) parser["local_rank"] = -1 parser['fp16'] = False parser['overwrite_output_dir'] = False parser['loss_scale'] = 0 parser['version_2_with_negative'] = False parser['null_score_diff_threshold'] = 0.0 parser['server_ip'] = '' parser['server_port'] = '' args = AttrDict.AttrDict(parser) print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir ) and args.do_train and not args.overwrite_output_dir: raise ValueError( "Output directory () already exists and is not empty.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = BertForQuestionAnswering.from_pretrained(args.bert_model) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # if args.local_rank != -1: # num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers 
import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: # Load a trained model and vocabulary that you have fine-tuned model = BertForQuestionAnswering.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 
eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold)
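# Example usage of the wrapper above (hypothetical file paths): run SQuAD-style
# prediction with the stock uncased checkpoint; pass mode='train' to fine-tune on
# the same file instead. Left commented out so importing this module has no side
# effects.
# BertSquad(file="data/dev-v1.1.json", mode='predict',
#           bert_model="bert-base-uncased", output="./output")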
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir.") parser.add_argument("--pretrain_dir", default=None, type=str, required=True, help="The pretrained para on SST-phrase.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--para", default="sentibert", type=str, help="The choice of pre-trained parameters (BERT ot SentiBERT).") parser.add_argument("--domain", default="joy", type=str, help="The domain for EmoInt.") parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=30, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') args.device = device logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) if task_name == "emoint": CONFIG_NAME_SAVE = args.domain + ".json" WEIGHTS_NAME_SAVE = args.domain + ".bin" else: CONFIG_NAME_SAVE = task_name + ".json" WEIGHTS_NAME_SAVE = task_name + ".bin" if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) if task_name == "sstphrase" or task_name == "sst-3": if args.para == "sentibert": model = BertForPhraseClassification.from_pretrained( args.pretrain_dir, num_labels=num_labels) else: model = BertForPhraseClassification.from_pretrained( args.bert_model, num_labels=num_labels) else: if args.para == "sentibert": model = BertForSequenceClassification.from_pretrained( args.pretrain_dir, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) 
global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() cached_train_features_file = os.path.join( args.data_dir, 'train_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name))) # Prepare data loader if False: logger.info(" Reading train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "rb") as writer: train_features = pickle.load(writer) else: if task_name == "emoint": train_examples = processor.get_train_examples( args.data_dir, args.domain, args.para) else: train_examples = processor.get_train_examples( args.data_dir, args.para) ## if you want to save for cache, please use this part ## # cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( # list(filter(None, args.bert_model.split('/'))).pop(), # str(args.max_seq_length), # str(task_name))) # try: # with open(cached_train_features_file, "rb") as reader: # train_features = pickle.load(reader) if task_name == "sstphrase" or task_name == "sst-3": train_features = convert_examples_to_features_phrase( train_examples, args.max_seq_length, tokenizer, output_mode, "train", args.data_dir) else: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode, args.para) ## if you want to use the cached file, please use this part ## # if args.local_rank == -1 or torch.distributed.get_rank() == 0: # logger.info(" Saving train features into cached file %s", cached_train_features_file) # with open(cached_train_features_file, "wb") as writer: # pickle.dump(train_features, writer)''' all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if train_features[0].span is None: all_span = None all_span_3 = None else: all_span = torch.tensor([f.span for f in train_features], dtype=torch.long) all_span_3 = torch.tensor([f.span_3 for f in train_features], dtype=torch.long) if task_name == "sstphrase" or task_name == "sst-3": all_phrase_mask = torch.tensor( [f.phrase_mask for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) if task_name == "sstphrase" or task_name == "sst-3": train_data = TensorDataset(all_input_ids, all_input_mask, all_phrase_mask, all_segment_ids, all_label_ids, all_span, all_span_3) else: if (task_name == "sst-2" or task_name == "twitter" or task_name == "emocontext" or task_name == "emoint") and args.para == "sentibert": train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span, all_span_3) else: train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 
'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(device) for t in batch) if task_name == "sstphrase" or task_name == "sst-3": input_ids, input_mask, phrase_mask, segment_ids, label_ids, span, span_3 = batch logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, phrase_mask=phrase_mask, graph_label=label_ids, span=span, span_3=span_3) else: if (task_name == "sst-2" or task_name == "twitter" or task_name == "emocontext" or task_name == "emoint") and args.para == "sentibert": input_ids, input_mask, segment_ids, label_ids, span, span_3 = batch logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, span=span, span_3=span_3) else: input_ids, input_mask, segment_ids, label_ids = batch logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) if output_mode == "classification": loss_fct = CrossEntropyLoss(ignore_index=-1) loss_tmp = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) if task_name == "sstphrase" or task_name == "sst-3" or task_name == "twitter" or task_name == "emocontext" or task_name == "emoint": print('loss', loss_tmp) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() ### Example: tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) finetuned_model_name = [WEIGHTS_NAME_SAVE, CONFIG_NAME_SAVE] if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME_SAVE) output_config_file = os.path.join(args.output_dir, CONFIG_NAME_SAVE) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned if task_name == "sstphrase" or task_name == "sst-3": model = BertForPhraseClassification.from_pretrained( args.output_dir, finetuned_model_name, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.output_dir, finetuned_model_name, num_labels=num_labels) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: if task_name == "sstphrase" or task_name == "sst-3": model = BertForPhraseClassification.from_pretrained( args.output_dir, finetuned_model_name, num_labels=num_labels) else: model = BertForSequenceClassification.from_pretrained( args.output_dir, finetuned_model_name, num_labels=num_labels) model.to(device) ### Evaluation ### cached_eval_features_file = os.path.join( args.data_dir, 'dev_{0}_{1}_{2}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name))) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): if False: logger.info(" Reading eval features into cached file %s", cached_eval_features_file) with open(cached_train_features_file, "rb") as writer: eval_features = pickle.load(writer) else: if task_name == "emoint": eval_examples = processor.get_dev_examples( args.data_dir, args.domain, args.para) else: eval_examples = processor.get_dev_examples( args.data_dir, args.para) # cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( # list(filter(None, args.bert_model.split('/'))).pop(), # str(args.max_seq_length), # str(task_name))) # try: # with open(cached_eval_features_file, "rb") as reader: # eval_features = pickle.load(reader) if task_name == "sstphrase" or task_name == "sst-3": eval_features = 
convert_examples_to_features_phrase( eval_examples, args.max_seq_length, tokenizer, output_mode, "dev", args.data_dir) else: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, args.para) # eval_features = convert_examples_to_features( # eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, graph) # if args.local_rank == -1 or torch.distributed.get_rank() == 0: # logger.info(" Saving eval features into cached file %s", cached_eval_features_file) # with open(cached_eval_features_file, "wb") as writer: # pickle.dump(eval_features, writer)''' logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) if eval_features[0].span is None: all_span = None all_span_3 = None else: all_span = torch.tensor([f.span for f in eval_features], dtype=torch.long) all_span_3 = torch.tensor([f.span_3 for f in eval_features], dtype=torch.long) if task_name == "sstphrase" or task_name == "sst-3": all_phrase_mask = torch.tensor( [f.phrase_mask for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) if task_name == "sstphrase" or task_name == "sst-3": eval_data = TensorDataset(all_input_ids, all_input_mask, all_phrase_mask, all_segment_ids, all_label_ids, all_span, all_span_3) else: if (task_name == "sst-2" or task_name == "twitter" or task_name == "emocontext" or task_name == "emoint") and args.para == "sentibert": eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_span, all_span_3) else: eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler( eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None if task_name == "sstphrase" or task_name == "sst-3": for input_ids, input_mask, phrase_mask, segment_ids, label_ids, span, span_3 in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) phrase_mask = phrase_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) span = span.to(device) span_3 = span_3.to(device) with torch.no_grad(): logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, phrase_mask=phrase_mask, graph_label=label_ids, span=span, span_3=span_3) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss(ignore_index=-1) tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: 
preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) else: if (task_name == "sst-2" or task_name == "twitter" or task_name == "emocontext" or task_name == "emoint") and args.para == "sentibert": for input_ids, input_mask, segment_ids, label_ids, span, span_3 in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) span = span.to(device) span_3 = span_3.to(device) with torch.no_grad(): logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, span=span, span_3=span_3) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss(ignore_index=-1) tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0) else: for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits, loss = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss(ignore_index=-1) tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append( out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps pred = preds[0] if task_name == "sstphrase" or task_name == "sst-3": pred_ans = [] for i in pred: pred_ans.append(i) if output_mode == "classification": if task_name == "sstphrase" or task_name == "sst-3": preds = np.argmax(pred_ans, axis=-1) else: preds = np.argmax(pred, axis=-1) elif output_mode == "regression": preds = np.squeeze(pred) if task_name == "emoint": ## to prevent segmentation fault error ## print(preds.tolist()) print(out_label_ids.tolist()) result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # output_eval_file = os.path.join(args.output_dir, "eval_results.txt") # with open(output_eval_file, "w") as writer: # logger.info("***** Eval results *****") # for key in
sorted(result.keys()): # logger.info(" %s = %s", key, str(result[key])) # writer.write("%s = %s\n" % (key, str(result[key]))) # hack for MNLI-MM if task_name == "mnli": task_name = "mnli-mm" processor = processors[task_name]() if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir + '-MM')) if not os.path.exists(args.output_dir + '-MM'): os.makedirs(args.output_dir + '-MM') eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode, args.para) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None) loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) out_label_ids = label_ids.detach().cpu().numpy() else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss / global_step if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
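# Example invocation of the classification entry point above (hypothetical script
# name and paths; the flags are the ones defined by this parser):
#   python run_classifier.py \
#       --task_name sstphrase --data_dir data/sstphrase \
#       --bert_model bert-base-uncased --do_lower_case \
#       --para sentibert --pretrain_dir results/sentibert_pretrained \
#       --output_dir results/sstphrase \
#       --do_train --do_eval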
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--vocab_file", default='bert-base-uncased-vocab.txt', type=str, required=True) parser.add_argument("--model_file", default='bert-base-uncased.tar.gz', type=str, required=True) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written." ) parser.add_argument( "--predict_dir", default=None, type=str, required=True, help="The output directory where the predictions will be written.") # Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=2.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--view_id', type=int, default=1, help="view id of multi-view co-training(two-view)") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( "--do_lower_case", default=True, action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") # Base setting parser.add_argument('--pretrain', type=str, default=None) parser.add_argument('--max_ctx', type=int, default=2) parser.add_argument('--task_name', type=str, default='coqa_yesno') parser.add_argument('--bert_name', type=str, default='baseline') parser.add_argument('--reader_name', type=str, default='coqa') # model parameters parser.add_argument('--evidence_lambda', type=float, default=0.8) parser.add_argument('--tf_layers', type=int, default=1) parser.add_argument('--tf_inter_size', type=int, default=3072) # Parameters for running labeling model parser.add_argument('--do_label', default=False, action='store_true') parser.add_argument('--sentence_id_files', nargs='*') parser.add_argument('--weight_threshold', type=float, default=0.0) parser.add_argument('--only_correct', default=False, action='store_true') parser.add_argument('--label_threshold', type=float, default=0.0) args = parser.parse_args() logger = setting_logger(args.output_dir) logger.info('================== Program start. ========================') model_params = prepare_model_params(args) read_params = prepare_read_params(args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." ) if args.do_train: if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) if args.do_predict: os.makedirs(args.predict_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.vocab_file) data_reader = initialize_reader(args.reader_name) num_train_steps = None if args.do_train or args.do_label: train_examples = data_reader.read(input_file=args.train_file, **read_params) cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( args.bert_model, str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length), str(args.max_ctx), str(args.task_name)) try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except FileNotFoundError: train_features = data_reader.convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) print(train_features[-1].unique_id) num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if args.pretrain is not None: logger.info('Load pretrained model from {}'.format(args.pretrain)) model_state_dict = torch.load(args.pretrain, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) else: model = initialize_model(args.bert_name, args.model_file, **model_params) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=t_total) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=t_total) # Prepare data eval_examples = data_reader.read(input_file=args.predict_file, **read_params) eval_features = data_reader.convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) eval_tensors = data_reader.data_to_tensors(eval_features) eval_data = TensorDataset(*eval_tensors) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) if args.do_train: if args.do_label: logger.info('Training in State Wise.') sentence_id_file_list = args.sentence_id_files if sentence_id_file_list is not None: for file in sentence_id_file_list: train_features = data_reader.generate_features_sentence_ids( train_features, file) else: train_features = data_reader.mask_all_sentence_ids( train_features) logger.info('No sentence id supervision is found.') else: logger.info('Training in traditional way.') logger.info("Start training") train_loss = AverageMeter() best_acc = 0.0 summary_writer = SummaryWriter(log_dir=args.output_dir) global_step = 0 eval_loss = AverageMeter() train_tensors = data_reader.data_to_tensors(train_features) train_data = TensorDataset(*train_tensors) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for epoch in trange(int(args.num_train_epochs), desc="Epoch"): # Train model.train() for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, train_features, do_label=args.do_label, model_state=ModelState.Train) loss = model(**inputs)['loss'] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used and handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step if args.fp16: summary_writer.add_scalar('lr', lr_this_step, global_step) else: summary_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) optimizer.step() optimizer.zero_grad() global_step += 1 train_loss.update(loss.item(), args.train_batch_size) summary_writer.add_scalar('train_loss', train_loss.avg, global_step) summary_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) # Evaluation model.eval() all_results = [] logger.info("Start evaluating") for eval_step, batch in enumerate( tqdm(eval_dataloader, desc="Evaluating")): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs( batch, eval_features, do_label=args.do_label, model_state=ModelState.Evaluate) with torch.no_grad(): output_dict = model(**inputs) loss, batch_choice_logits = output_dict[ 'loss'], output_dict['yesno_logits'] eval_loss.update(loss.item(), args.predict_batch_size) example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu( ).tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) summary_writer.add_scalar('eval_loss', eval_loss.avg, epoch) eval_loss.reset() data_reader.write_predictions(eval_examples, eval_features, all_results, None, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') current_acc = yes_metric['accuracy'] summary_writer.add_scalar('eval_yes_f1', yes_metric['f1'], epoch) summary_writer.add_scalar('eval_yes_recall', yes_metric['recall'], epoch) summary_writer.add_scalar('eval_yes_precision', yes_metric['precision'], epoch) summary_writer.add_scalar('eval_no_f1', no_metric['f1'], epoch) summary_writer.add_scalar('eval_no_recall', no_metric['recall'], epoch) summary_writer.add_scalar('eval_no_precision', no_metric['precision'], epoch) summary_writer.add_scalar('eval_yesno_acc', current_acc, epoch) torch.cuda.empty_cache() if current_acc > best_acc: best_acc = current_acc model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) logger.info('Epoch: %d, Accuracy: %f (Best Accuracy: %f)' % (epoch, current_acc, best_acc)) data_reader.yesno_cate.reset() summary_writer.close() # Loading trained model. 
output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file, map_location='cuda:0') model = initialize_model(args.bert_name, args.model_file, state_dict=model_state_dict, **model_params) model.to(device) # Write Yes/No predictions if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = eval_examples test_features = eval_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start predicting yes/no on Dev set.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device( batch, device) # multi-gpu does scattering it-self inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, model_state=ModelState.Test) with torch.no_grad(): batch_choice_logits = model(**inputs)['yesno_logits'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( RawResultChoice(unique_id=unique_id, choice_logits=choice_logits)) output_prediction_file = os.path.join(args.predict_dir, 'predictions.json') data_reader.write_predictions(eval_examples, eval_features, all_results, output_prediction_file, null_score_diff_threshold=0.0) yes_metric = data_reader.yesno_cate.f1_measure('yes', 'no') no_metric = data_reader.yesno_cate.f1_measure('no', 'yes') logger.info('Yes Metrics: %s' % json.dumps(yes_metric, indent=2)) logger.info('No Metrics: %s' % json.dumps(no_metric, indent=2)) # Labeling sentence id. 
if args.do_label and (args.local_rank == -1 or torch.distributed.get_rank() == 0): test_examples = train_examples test_features = train_features test_tensors = data_reader.data_to_tensors(test_features) test_data = TensorDataset(*test_tensors) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.predict_batch_size) logger.info("***** Running labeling *****") logger.info(" Num orig examples = %d", len(test_examples)) logger.info(" Num split examples = %d", len(test_features)) logger.info(" Batch size = %d", args.predict_batch_size) model.eval() all_results = [] logger.info("Start labeling.") for batch in tqdm(test_dataloader, desc="Testing"): if n_gpu == 1: batch = batch_to_device(batch, device) inputs = data_reader.generate_inputs(batch, test_features, do_label=args.do_label, model_state=ModelState.Test) with torch.no_grad(): output_dict = model(**inputs) batch_choice_logits = output_dict['yesno_logits'] batch_max_weight_indexes = output_dict['max_weight_index'] batch_max_weight = output_dict['max_weight'] example_indices = batch[-1] for i, example_index in enumerate(example_indices): choice_logits = batch_choice_logits[i].detach().cpu().tolist() max_weight_index = batch_max_weight_indexes[i].detach().cpu( ).tolist() max_weight = batch_max_weight[i].detach().cpu().tolist() test_feature = test_features[example_index.item()] unique_id = int(test_feature.unique_id) all_results.append( WeightResultChoice(unique_id=unique_id, choice_logits=choice_logits, max_weight_index=max_weight_index, max_weight=max_weight)) output_prediction_file = os.path.join(args.predict_dir, 'sentence_id_file.json') data_reader.predict_sentence_ids( test_examples, test_features, all_results, output_prediction_file, weight_threshold=args.weight_threshold, only_correct=args.only_correct, label_threshold=args.label_threshold)
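# A minimal, standalone sketch of the weight-decay grouping used by the training
# code above (and by the other training loops in this file): decay of 0.01 on
# ordinary weights, no decay on biases and LayerNorm parameters. The helper name
# is illustrative; the real code builds the groups inline from
# model.named_parameters().
def _demo_grouped_parameters(named_parameters, weight_decay=0.01):
    no_decay = ('bias', 'LayerNorm.bias', 'LayerNorm.weight')
    decay_params, no_decay_params = [], []
    for name, param in named_parameters:
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    return [
        {'params': decay_params, 'weight_decay': weight_decay},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]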
def main(): if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) processor = NLUDataProcessor(args.data_dir, args.max_seq_length, tokenizer) label_list = processor.get_labels() num_labels = len(label_list) num_intents = len(processor.intents) train_examples = None num_train_optimization_steps = None if args.do_train: restrict = {} if args.limit_data: for i in range(len(args.limit_data) // 2): intent = args.limit_data[i * 2] size = args.limit_data[i * 2 + 1] assert intent in processor.intents restrict[intent] = tuple(int(x) for x in size.split('*')) train_examples = processor.get_train_examples(restrict) num_train_optimization_steps = \ int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * \ args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = \ num_train_optimization_steps // torch.distributed.get_world_size() if args.eval_on_test: eval_examples = processor.get_test_examples() else: eval_examples = processor.get_dev_examples() # Prepare model cache_dir = args.cache_dir or os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForNLU.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=num_labels, num_intents=num_intents, from_tf=args.from_tf, layers=args.layers, prune=args.prune, dropout=args.dropout) if args.fp16: model.half() model.to(device) if args.local_rank != -1: from apex.parallel import DistributedDataParallel as DDP model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # distiller compression_scheduler = None if args.distiller is not None: import distiller distiller_pylogger = distiller.data_loggers.PythonLogger(logger) compression_scheduler = distiller.config.file_config( model, None, args.distiller) # Tensorboard swriter = SummaryWriter(args.output_dir) logger.info("Writing summary to %s", args.output_dir) # Prepare optimizer if args.do_train: if args.train_layers_from is not None: param_optimizer = [] for n, p in 
model.named_parameters(): if "classifier" in n or "pooler" in n: param_optimizer.append((n, p)) elif any( int(s) >= args.train_layers_from for s in re.findall(r'layer\.(\d+)\.', n)): param_optimizer.append((n, p)) else: print("Not considered trainable:", n) p.requires_grad_(False) else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule=None if args.const_lr else 'warmup_linear') global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_tokenids = torch.tensor( [f.input_tokenids for f in train_examples], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_examples], dtype=torch.long) all_input_segmentids = torch.tensor( [f.input_segmentids for f in train_examples], dtype=torch.long) all_input_labelids = torch.tensor( [f.input_labelids for f in train_examples], dtype=torch.long) train_data = TensorDataset(all_input_tokenids, all_input_mask, all_input_segmentids, all_input_labelids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) batches_per_epoch = int(len(train_examples) / args.train_batch_size) model.train() for epoch_id in trange(int(args.num_train_epochs), desc="Epoch"): if compression_scheduler: compression_scheduler.on_epoch_begin(epoch_id) nb_tr_examples = 0 global_step_tr_loss = 0.0 tqdm_bar = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(tqdm_bar): if compression_scheduler and step % args.gradient_accumulation_steps == 0: compression_scheduler.on_minibatch_begin( epoch_id, minibatch_id=step / args.gradient_accumulation_steps, minibatches_per_epoch=batches_per_epoch) batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _, _ = model(input_ids, segment_ids, input_mask, labels=label_ids) if compression_scheduler: # Before running the backward phase, we allow the scheduler to modify the loss # (e.g. add regularization loss) loss = compression_scheduler.before_backward_pass( epoch_id, minibatch_id=step / args.gradient_accumulation_steps, minibatches_per_epoch=batches_per_epoch, loss=loss, return_loss_components=False) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() global_step_tr_loss += loss.item() nb_tr_examples += input_ids.size(0) if (step + 1) % args.gradient_accumulation_steps == 0: tqdm_bar.set_postfix(train_loss=global_step_tr_loss) swriter.add_scalar('train_loss', global_step_tr_loss, global_step=global_step) if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = \ args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step if args.fp16: for _, p in param_optimizer: if p.grad is None: p.grad = torch.zeros(p.size(), dtype=p.dtype, device=p.device) optimizer.step() global_step += 1 global_step_tr_loss = 0.0 if compression_scheduler: compression_scheduler.on_minibatch_end( epoch_id, minibatch_id=step / args.gradient_accumulation_steps, minibatches_per_epoch=batches_per_epoch) optimizer.zero_grad() if not args.fp16 and optimizer.get_lr(): # get_lr returns a list, however all the elements of the list should be the # same swriter.add_scalar('learning_rate', np.random.choice( optimizer.get_lr()), global_step=global_step) if global_step % args.eval_steps == 0: perform_evaluation(eval_examples=eval_examples, model=model, processor=processor, swriter=swriter, device=device, global_step=global_step) model.train() if global_step % args.save_checkpoints_steps == 0: save_model(model=model, tokenizer=tokenizer, global_step=global_step) if args.prune and global_step % args.eval_steps == 1: prune_model(model=model, swriter=swriter, global_step=global_step, count=args.prune_count) if compression_scheduler: sparsity_table, total_sparsity = \ distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True) logger.info("\nParameters:\n" + str(sparsity_table)) logger.info('Total sparsity: {:0.2f}\n'.format(total_sparsity)) swriter.add_scalar('sparsity', total_sparsity, global_step=global_step) compression_scheduler.on_epoch_end(epoch_id) save_model(model=model, tokenizer=tokenizer, global_step=global_step, tag='final') perform_evaluation(eval_examples=eval_examples, model=model, processor=processor, swriter=swriter, device=device, global_step=global_step) swriter.close()
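# A stripped-down, hedged sketch (not the original code) of the update pattern in the loop
# above: gradient accumulation combined with a manually applied warmup-linear learning rate,
# as done in the fp16 branch (in fp32, BertAdam applies the schedule internally). The SGD
# optimizer and the warmup_linear_lr shape below are illustrative assumptions.
import torch


def warmup_linear_lr(base_lr, step, total_steps, warmup=0.1):
    # Linear ramp up to base_lr over the first `warmup` fraction of training,
    # then linear decay towards zero (approximate shape of the warmup_linear schedule).
    progress = step / max(1, total_steps)
    if progress < warmup:
        return base_lr * progress / warmup
    return base_lr * max(0.0, (1.0 - progress) / (1.0 - warmup))


def accumulate_and_step(model, loader, base_lr=5e-5, accum_steps=2, total_steps=100):
    optimizer = torch.optim.SGD(model.parameters(), lr=base_lr)  # stand-in optimizer
    global_step = 0
    for step, (x, y) in enumerate(loader):
        loss = torch.nn.functional.mse_loss(model(x), y)
        (loss / accum_steps).backward()               # scale so accumulated grads average out
        if (step + 1) % accum_steps == 0:
            lr = warmup_linear_lr(base_lr, global_step, total_steps)
            for group in optimizer.param_groups:      # manual LR update, as in the fp16 branch
                group["lr"] = lr
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1
    return global_step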
def train(args, model, tokenizer, ngram_dict, processor, label_list): global_step = 0 if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() train_dataset = load_examples(args, tokenizer, ngram_dict, processor, label_list, mode="train") if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) for epoch_num in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(args.device) for t in batch) input_ids, input_mask, segment_ids, label_ids, ngram_ids, ngram_positions, \ ngram_lengths, ngram_seg_ids, ngram_masks = batch loss = model(input_ids, ngram_ids, ngram_positions, labels=label_ids) if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) save_zen_model(output_dir, model, tokenizer, ngram_dict, args)
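# A minimal, hedged sketch of the checkpointing branch above: the usual "unwrap, then save"
# pattern for models wrapped in (Distributed)DataParallel. save_zen_model additionally stores
# the tokenizer, the ngram dictionary and the args, which are omitted here.
import os

import torch


def save_checkpoint(model, output_dir, global_step):
    ckpt_dir = os.path.join(output_dir, "checkpoint-{}".format(global_step))
    os.makedirs(ckpt_dir, exist_ok=True)
    # Unwrap DataParallel / DistributedDataParallel so the saved state-dict keys
    # do not carry a "module." prefix.
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(), os.path.join(ckpt_dir, "pytorch_model.bin"))
    return ckpt_dir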
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "yelp": YelpProcessor, } num_labels_task = { "yelp": 2, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps, exp_average_loss = 0, 0, None tqdm_bar = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
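# Note on the line above: under torch.nn.DataParallel the forward pass returns one loss value
# per GPU replica, so .mean() reduces them to a single scalar before backward; on a single GPU
# the loss is already a scalar and this branch is skipped.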
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() if n_gpu > 1: tmp_loss = loss.mean().item() else: tmp_loss = loss.item() exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, optimizer.get_lr()[0]) if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = 
tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
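# The evaluation loop above accumulates a plain argmax accuracy over numpy logits. The
# accuracy() helper is defined outside this excerpt; a typical definition, shown here only as
# an assumption for context:
import numpy as np


def accuracy(out, labels):
    # Number of examples in the batch whose argmax prediction matches the label.
    return int(np.sum(np.argmax(out, axis=1) == labels))

# Note the two different denominators used above: eval_loss is averaged over batches
# (nb_eval_steps) while eval_accuracy is averaged over examples (nb_eval_examples).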
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if fp16:
    # modify learning rate with special warm up BERT uses
    # if args.fp16 is False, BertAdam is used that handles this automatically
    lr_this_step = lr * warmup_linear.get_lr(global_step, warmup_proportion)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_this_step
optimizer.step()
optimizer.zero_grad()
global_step += 1
if (fp16):
    tb_writer.add_scalar('lr', lr_this_step, global_step)
else:
    tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
tb_writer.add_scalar('loss', loss.item(), global_step)
logger.info("epoch:{:d},step:{:d}/{:d},loss:{:.3f}".format(
    _, step, num_train_optimization_steps // num_train_epochs,
    loss.cpu().detach().item()))
if (global_step % every_steps_save == 0):
    torch.save(model.state_dict(), output_model_file)
if do_eval:
    data = json.load(open(os.path.join(data_dir, "dev.json"), 'r'))
    data = data[10000:]
    pre_all = 0
    actual_all = 0
    correct = 0
    for i, example in enumerate(tqdm(data)):
        text = example["text"]
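# The truncated dev-set loop above initialises predicted / actual / correct counters, the usual
# setup for micro precision, recall and F1. How the counters are filled depends on code not
# shown here; a hedged sketch of the final computation:
def micro_prf(correct, pre_all, actual_all):
    precision = correct / pre_all if pre_all else 0.0
    recall = correct / actual_all if actual_all else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1

# e.g. micro_prf(correct=80, pre_all=100, actual_all=120) -> (0.8, 0.666..., 0.727...)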
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--train_file", default="data/conceptual_caption/training", type=str, # required=True, help="The input train corpus.", ) parser.add_argument( "--validation_file", default="data/conceptual_caption/validation", type=str, # required=True, help="The input train corpus.", ) parser.add_argument( "--from_pretrained", default="", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, # required=True, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, # required=True, help="The config file which specified the model details.", ) ## Other parameters parser.add_argument( "--max_seq_length", default=36, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.", ) parser.add_argument("--predict_feature", action="store_true", help="visual target.") parser.add_argument( "--train_batch_size", default=512, type=int, help="Total batch size for training.", ) parser.add_argument( "--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.", ) parser.add_argument( "--num_train_epochs", default=10.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--start_epoch", default=0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--img_weight", default=1, type=float, help="weight for image loss") parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action="store_true", help="Whether to load train samples into memory or use disk", ) parser.add_argument( "--do_lower_case", type=bool, default=True, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=3, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument("--baseline", action="store_true", help="Wheter to use the baseline model (single bert).") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.") parser.add_argument("--use_chuncks", default=0, type=float, help="whether use chunck for parallel training.") parser.add_argument("--distributed", action="store_true", help="whether use chunck for parallel training.") parser.add_argument("--without_coattention", action="store_true", help="whether pair loss.") args = parser.parse_args() if args.baseline: from pytorch_pretrained_bert.modeling import BertConfig from vilbert.basebert import BertForMultiModalPreTraining else: from vilbert.vilbert import BertForMultiModalPreTraining, BertConfig print(args) if args.save_name is not '': timeStamp = args.save_name else: timeStamp = strftime("%d-%b-%y-%X-%a", gmtime()) timeStamp += "_{:0>6d}".format(random.randint(0, 10e6)) savePath = os.path.join(args.output_dir, timeStamp) if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if args.freeze > config.t_biattention_id[0]: config.fixed_t_layer = config.t_biattention_id[0] if args.without_coattention: config.with_coattention = False # save all the hidden parameters. with open(os.path.join(savePath, 'command.txt'), 'w') as f: print(args, file=f) # Python 3.x print('\n', file=f) print(config, file=f) bert_weight_name = json.load( open("config/" + args.from_pretrained + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) num_train_optimization_steps = None viz = TBlogger("logs", timeStamp) train_dataset = ConceptCapLoaderTrain( args.train_file, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=args.num_workers, distributed=args.distributed, ) validation_dataset = ConceptCapLoaderVal( args.validation_file, tokenizer, seq_len=args.max_seq_length, batch_size=args.train_batch_size, predict_feature=args.predict_feature, num_workers=2, distributed=args.distributed, ) num_train_optimization_steps = ( int(train_dataset.num_dataset / args.train_batch_size / args.gradient_accumulation_steps) * (args.num_train_epochs - args.start_epoch)) # if args.local_rank != -1: # num_train_optimization_steps = ( # num_train_optimization_steps // torch.distributed.get_world_size() # ) default_gpu = False if dist.is_available() and args.distributed: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True # pdb.set_trace() if args.predict_feature: config.v_target_size = 2048 config.predict_feature = True else: config.v_target_size = 1601 config.predict_feature = False if args.from_pretrained: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = BertForMultiModalPreTraining(config) model.cuda() if args.fp16: model.half() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if 'embeddings' in name: bert_weight_name_filtered.append(name) elif 'encoder' in name: layer_num = name.split('.')[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) if not args.from_pretrained: param_optimizer = list(model.named_parameters()) optimizer_grouped_parameters = [ { "params": [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], "weight_decay": 0.01, }, { "params": [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] else: optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if key[12:] in bert_weight_name: lr = args.learning_rate * 0.1 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) # set different parameters for vision branch and lanugage branch. if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam( optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0, ) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: if args.from_pretrained: optimizer = BertAdam( optimizer_grouped_parameters, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) else: optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, ) logger.info("***** Running training *****") logger.info(" Num examples = %d", train_dataset.num_dataset) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) startIterID = 0 global_step = 0 masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 start_t = timer() # t1 = timer() for epochId in range(int(args.start_epoch), int(args.num_train_epochs)): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # iter_dataloader = iter(train_dataloader) for step, batch in enumerate(train_dataset): iterId = startIterID + step + (epochId * len(train_dataset)) # batch = iter_dataloader.next() batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) if args.without_coattention: next_sentence_loss = next_sentence_loss * 0 masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if math.isnan(loss.item()): pdb.set_trace() tr_loss += loss.item() rank = 0 if dist.is_available() and args.distributed: rank = dist.get_rank() else: rank = 0 viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train") viz.linePlot(iterId, masked_loss_t.item(), "masked_loss_t_" + str(rank), "train") viz.linePlot(iterId, masked_loss_v.item(), "masked_loss_v_" + str(rank), "train") viz.linePlot(iterId, next_sentence_loss.item(), "next_sentence_loss_" + str(rank), "train") # viz.linePlot(iterId, optimizer.get_lr()[0], 'learning_rate', 'train') loss_tmp += loss.item() masked_loss_v_tmp += masked_loss_v.item() masked_loss_t_tmp += masked_loss_t.item() next_sentence_loss_tmp += next_sentence_loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if step % 20 == 0 and step != 0: masked_loss_t_tmp = masked_loss_t_tmp / 20.0 masked_loss_v_tmp = masked_loss_v_tmp / 20.0 next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0 loss_tmp = loss_tmp / 20.0 end_t = timer() timeStamp = strftime("%a %d %b %y %X", gmtime()) Ep = epochId + nb_tr_steps / float(len(train_dataset)) printFormat = "[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]" printInfo = [ timeStamp, Ep, nb_tr_steps, end_t - start_t, loss_tmp, masked_loss_v_tmp, masked_loss_t_tmp, next_sentence_loss_tmp, optimizer.get_lr()[0], ] start_t = end_t print(printFormat % tuple(printInfo)) masked_loss_v_tmp = 0 masked_loss_t_tmp = 0 next_sentence_loss_tmp = 0 loss_tmp = 0 # Do the evaluation torch.set_grad_enabled(False) start_t = timer() numBatches = len(validation_dataset) eval_masked_loss_t = 0 eval_masked_loss_v = 0 eval_next_sentence_loss = 0 eval_total_loss = 0 model.eval() for step, batch in enumerate(validation_dataset): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next, image_feat, image_loc, image_target, image_label, image_mask, image_ids = ( batch) masked_loss_t, masked_loss_v, next_sentence_loss = model( input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask, lm_label_ids, image_label, image_target, is_next, ) masked_loss_v = masked_loss_v * args.img_weight loss = masked_loss_t + masked_loss_v + next_sentence_loss if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
masked_loss_t = masked_loss_t.mean() masked_loss_v = masked_loss_v.mean() next_sentence_loss = next_sentence_loss.mean() eval_masked_loss_t += masked_loss_t.item() eval_masked_loss_v += masked_loss_v.item() eval_next_sentence_loss += next_sentence_loss.item() eval_total_loss += loss.item() end_t = timer() delta_t = " Time: %5.2fs" % (end_t - start_t) start_t = end_t progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t sys.stdout.write(progressString % ('val', step + 1, numBatches)) sys.stdout.flush() eval_masked_loss_t = eval_masked_loss_t / float(numBatches) eval_masked_loss_v = eval_masked_loss_v / float(numBatches) eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches) eval_total_loss = eval_total_loss / float(numBatches) printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]" printInfo = [ eval_total_loss, eval_masked_loss_v, eval_masked_loss_t, eval_next_sentence_loss ] print(printFormat % tuple(printInfo)) torch.set_grad_enabled(True) viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank), "val") viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank), "val") viz.linePlot(epochId, eval_next_sentence_loss, "next_sentence_loss_" + str(rank), "val") if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin") torch.save(model_to_save.state_dict(), output_model_file)
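# The validation block above toggles autograd globally with torch.set_grad_enabled(False/True);
# an equivalent and slightly safer idiom is a torch.no_grad() scope, which restores the previous
# state even if the loop raises. A hedged sketch of the same epoch-end evaluation, assuming the
# model returns a scalar (or per-replica) loss:
import torch


@torch.no_grad()
def evaluate_epoch(model, val_loader, device):
    model.eval()
    total, batches = 0.0, 0
    for batch in val_loader:
        batch = tuple(t.to(device, non_blocking=True) for t in batch)
        loss = model(*batch)
        total += loss.mean().item()   # .mean() covers the DataParallel case
        batches += 1
    model.train()
    return total / max(1, batches)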
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', default='gpt2', type=str, choices=['gpt2', 'gpt2-medium', 'openai-gpt'], help='model name') parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.", choices=[ 'rocstories', 'sqa.q-subqs', 'squad.q', 'squad.q-q', 'squad.sf-q', 'hotpot.q-subqs', 'hotpot.q-subqs.comparison', 'hotpot.subqs-subas-q-a', 'hotpot.q-sfs-a' ]) parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=32) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--patience', type=int, default=1) parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--no_input_lm_train', action='store_true', help="Use LM loss on input while training?") parser.add_argument('--no_input_lm_eval', action='store_true', help="Use LM loss on input while evaluating?") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--debug', action='store_true', help="Whether to use debug mode") # NB: local_rank args = parser.parse_args() print(args) output_dir = 'checkpoint/tn={}.mn={}.tbs={}.lr={}.nte={}.nilt={}.nile={}'.format( args.task_name, args.model_name, args.train_batch_size, args.learning_rate, args.num_train_epochs, args.no_input_lm_train, args.no_input_lm_eval) print('Saving to {}'.format(output_dir)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}, 16-bits training: {}".format( device, n_gpu, args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps eval_batch_size = 2 * args.train_batch_size if not os.path.exists(output_dir): os.makedirs(output_dir) elif args.overwrite_output_dir: print('Overwriting existing output directory', output_dir) shutil.rmtree(output_dir) os.makedirs(output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_' ] if args.task_name in mc_task_names else None tokenizer_class = get_tokenizer_class(args.model_name) tokenizer = tokenizer_class.from_pretrained(args.model_name, special_tokens=special_tokens) model_class = get_model_class(args.model_name, args.task_name) model = model_class.from_pretrained( args.model_name, num_special_tokens=len(special_tokens) if special_tokens else 0) # model, tokenizer, _ = load_model('checkpoint/tn=squad-questions-cond-lm.mn=gpt2-medium.tbs=8.lr=6.25e-05') if args.task_name in {'hotpot.q-sfs-a'}: a_sep_tokens = tokenizer.encode( (' ' if 'gpt2' in args.model_name else '') + a_sep) assert len( a_sep_tokens) == 1, 'A Separator "{}" is multi-token {}'.format( a_sep, a_sep_tokens) end_of_input_token = a_sep_tokens[0] else: q_sep_tokens = tokenizer.encode( (' ' if 'gpt2' in args.model_name else '') + q_sep) assert len( q_sep_tokens) == 1, 'Q Separator "{}" is multi-token {}'.format( q_sep, q_sep_tokens) end_of_input_token = q_sep_tokens[0] if args.fp16: model.half() model.to(device) special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens) \ if special_tokens else [] encoded_datasets_save_file = '{}/{}.encoded_datasets.train-dev.json'.format( DATA_DIR, args.task_name) if os.path.exists(encoded_datasets_save_file): logger.info("Loading encoded datasets...") with open(encoded_datasets_save_file, 'r') as f: encoded_datasets = json.load(f) else: def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding datasets...") train_dataset = load_dataset('train', args.task_name, args.debug, args.seed) eval_dataset = load_dataset('dev', args.task_name, args.debug, args.seed) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) with 
open(encoded_datasets_save_file, 'w') as f: json.dump(encoded_datasets, f) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 if args.task_name == 'rocstories': input_length = max( len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) else: input_length = max( len(seq) for dataset in encoded_datasets for seq in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model print('input_length =', input_length) # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, args.task_name, args.no_input_lm_train, args.no_input_lm_eval, end_of_input_token, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) # Train loop tb_writer = SummaryWriter(output_dir) global_step, nb_tr_example_visits, best_eval_loss = 0, 0, float('inf') patience_left = args.patience start_time = time.time() for epoch_no in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss, tr_batch_loss, nb_tr_steps = 0, 0, 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) if args.task_name in mc_task_names: input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] else: input_ids, lm_labels = batch loss = model(input_ids, lm_labels=lm_labels) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() tr_batch_loss += loss.item() nb_tr_example_visits += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 tb_writer.add_scalar('lr', optimizer.get_lr()[0], nb_tr_example_visits) tb_writer.add_scalar('loss', tr_batch_loss, nb_tr_example_visits) tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( tr_batch_loss, optimizer.get_lr()[0]) tr_batch_loss = 0 # Validation model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) if args.task_name in mc_task_names: input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels) eval_batch_loss = args.lm_coef * lm_loss + mc_loss mc_logits = model(input_ids, mc_token_ids)[1] mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += eval_batch_loss.mean().item() eval_accuracy += tmp_eval_accuracy else: input_ids, lm_labels = batch with torch.no_grad(): lm_loss = model(input_ids, lm_labels=lm_labels) eval_loss += lm_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps tb_writer.add_scalar('eval_loss', eval_loss, nb_tr_example_visits) result = { 'eval_loss': eval_loss, 'train_loss': tr_loss / (nb_tr_steps / float(args.gradient_accumulation_steps)) } if args.task_name in mc_task_names: result['eval_accuracy'] = eval_accuracy / nb_eval_examples output_eval_file = os.path.join(output_dir, "eval_results_{}.txt".format(epoch_no)) with 
open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) # Model saving and early stopping print('Epoch {} complete!'.format(epoch_no)) if eval_loss < best_eval_loss: print('Best loss so far! {} -> {}'.format(best_eval_loss, eval_loss)) best_eval_loss = eval_loss save_model(model, tokenizer, args, output_dir, 'model_epoch_{}.bin'.format(epoch_no), True) patience_left = args.patience else: print('Loss up from best epoch: {} -> {}'.format( best_eval_loss, eval_loss)) save_model(model, tokenizer, args, output_dir, 'model_epoch_{}.bin'.format(epoch_no), False) patience_left -= 1 if patience_left <= 0: print('Ran out of patience. Stopping training.') break print('Completed training in {}s!'.format(time.time() - start_time))
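# The epoch loop above implements early stopping on validation loss with a fixed patience
# budget. A hedged sketch of just that control flow, with checkpoint writing left out:
def early_stopping(val_losses, patience=1):
    best, patience_left, stop_epoch = float("inf"), patience, None
    for epoch, loss in enumerate(val_losses):
        if loss < best:
            best, patience_left = loss, patience   # improvement: reset the budget
        else:
            patience_left -= 1                     # no improvement: spend one unit of patience
            if patience_left <= 0:
                stop_epoch = epoch
                break
    return best, stop_epoch

# early_stopping([3.0, 2.5, 2.6, 2.7], patience=1) -> (2.5, 2): training stops after the first
# non-improving epoch, keeping the checkpoint saved at epoch 1.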
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="meta", type=str, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=384, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=2, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--random_sampling', action='store_true', help="random sampling instead of balanced sampling") parser.add_argument('--active_sampling', action='store_true', help="uses active sampling instead of balanced sampling") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() experiment.log_parameters(vars(args)) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') args.device = device logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() full_label_list = label_list[0] + label_list[1] + label_list[2] num_binary_labels = len(label_list[0]) num_span_labels = len(label_list[1]) num_multi_labels = len(label_list[2]) if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab tokenizer = BertTokenizer("savedmodel/vocab.txt", never_split = stopper) model = BertForMetaClassification.from_pretrained(args.bert_model, num_binary_labels=num_binary_labels, num_span_labels=num_span_labels, num_multi_labels=num_multi_labels) if args.local_rank == 0: torch.distributed.barrier() if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 def save_model(model, outputdir, threshs, score): model_to_save = model.module if hasattr(model, 
'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(outputdir, WEIGHTS_NAME) output_config_file = os.path.join(outputdir, CONFIG_NAME) TRESH_NAME = "thresholds.txt" output_thresh_file = os.path.join(outputdir, TRESH_NAME) print(f"Saving model with score of {score}") torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(outputdir) with open(output_thresh_file, "w") as text_file: text_file.write(str(score)) for thresh in threshs: text_file.write("\n") text_file.write(str(thresh)) def sigmoid(x): sigm = 1. / (1. + np.exp(-x)) return sigm class UnderSampler(Sampler): def __init__(self, label_mins, label_type_list, label_number_list, order_index_list = None): self.label_mins = label_mins self.label_type_list = label_type_list self.label_number_list = label_number_list self.order_index_list = order_index_list label_mins = self.label_mins label_type_list = self.label_type_list label_number_list = self.label_number_list index_list = [] counter_dict = defaultdict(int) if not order_index_list: randomlist = list(range(len(label_type_list))) random.shuffle(randomlist) else: randomlist = order_index_list for i in randomlist: current_label = label_type_list[i] current_label_number = label_number_list[i] current_label_min = label_mins[current_label] if current_label_min > counter_dict[str(current_label)+ "_" + str(current_label_number)]: counter_dict[str(current_label)+"_" + str(current_label_number)] += 1 index_list.append(i) random.shuffle(index_list) self.index_list_len = len(index_list) def __iter__(self): label_mins = self.label_mins label_type_list = self.label_type_list label_number_list = self.label_number_list order_index_list = self.order_index_list index_list = [] counter_dict = defaultdict(int) if not order_index_list: randomlist = list(range(len(label_type_list))) random.shuffle(randomlist) else: randomlist = order_index_list for i in randomlist: current_label = label_type_list[i] current_label_number = label_number_list[i] current_label_min = label_mins[current_label] if current_label_min > counter_dict[str(current_label)+ "_" + str(current_label_number)]: counter_dict[str(current_label)+"_" + str(current_label_number)] += 1 index_list.append(i) random.shuffle(index_list) return iter(index_list) def __len__(self): return self.index_list_len ### Evaluation if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = eval_examples logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f["input_ids"] for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f["input_mask"] for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f["segment_ids"] for f in eval_features], dtype=torch.long) input_len = all_input_ids.size(1) def index2onehot(features, keyname): returnlist = [] for f in features: cur_list = [] for position_i in range(input_len): if position_i in f[keyname]: cur_list.append(1) else: cur_list.append(0) returnlist.append(cur_list) return returnlist def labelindex2binary(label, newline_mask, input_len, ignore= -1): zeros = [ignore]* input_len for maskid, mask in enumerate(newline_mask): zeros[mask] = label[maskid] return zeros newline_mask = 
torch.tensor(index2onehot(eval_features, "[newline]"), dtype=torch.long) #newline_mask = torch.tensor([f["Newline"] for f in train_features], dtype=torch.long) list_binary_labels = [] for lb in label_list[0]: list_binary_labels.append(torch.tensor([labelindex2binary(f[lb], f["[newline]"], input_len=input_len, ignore=-1) for f in eval_features], dtype=torch.long)) list_span_labels = [] for lb in label_list[1]: list_span_labels.append(torch.tensor([f[lb] for f in eval_features], dtype=torch.long)) list_multi_labels = [] for lb in label_list[2]: list_multi_labels.append(torch.tensor([labelindex2binary(f[lb[0]], f["[newline]"], input_len=input_len, ignore=-1) for f in eval_features], dtype=torch.long)) pos_weights = [] for lb in label_list[0]: pos_cases = 0 neg_cases = 0 for example in eval_features: cur_arr = np.array(example[lb]) cur_arr = cur_arr[cur_arr != -1] size = cur_arr.size pos = cur_arr.sum() pos_cases += pos neg_cases = neg_cases + size - pos if pos_cases > 0: ratio = neg_cases / pos_cases else: ratio = 1.0 pos_weights.append(ratio) experiment.log_metric(f"positive test labels for class: {lb}",pos_cases) experiment.log_metric(f"negative test labels for class: {lb}",neg_cases) pos_weights = torch.tensor(pos_weights) #pos_weights = [pos_weights] * len(eval_features) pos_weights = pos_weights.expand(all_input_ids.size(0), -1) # prepare label information for undersampler label_mins = [defaultdict(int)] * len(full_label_list) label_type_list = ["x"] * len(eval_features) label_number_list = ["x"] * len(eval_features) for lbid, lb in enumerate(full_label_list): if type(lb) == list: lb = lb[0] for exid, example in enumerate(eval_features): cur_arr = example[lb] arr_number = sum(cur_arr) + len(cur_arr) if arr_number != 0: label_number_list[exid] = arr_number label_type_list[exid] = lbid label_mins[lbid][arr_number] += 1 assert ("x" not in label_type_list) assert ("x" not in label_number_list) for lid, lm in enumerate(label_mins): label_mins[lid] = min(lm.values()) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, newline_mask, pos_weights, *list_binary_labels, *list_span_labels, *list_multi_labels) # Run prediction for full data if args.local_rank == -1: # if args.random_sampling: eval_sampler = SequentialSampler(eval_data) # else: #eval_sampler = UnderSampler( label_mins, label_type_list, label_number_list) else: eval_sampler = DistributedSampler(eval_data) # Note that this sampler samples randomly eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) def f1_calculate(precision, recall): num =(2 * precision * recall).astype(float) den = (precision + recall).astype(float) try: f1 = np.divide(num, den, out=np.zeros_like(num), where=den>0.0001) except: import pdb; pdb.set_trace() return f1 def evaluate(number_of_epochs=0, show_examples=True, best_sum_of_scores=0.0): model.eval() eval_loss = 0 bce_loss = 0 cross_loss = 0 token_loss = 0 nb_eval_steps = 0 preds = [] out_label_ids = None result = 0 result = defaultdict(float) len_bce = len(eval_dataloader) evaldict = defaultdict(partial(np.ndarray, 0)) thresholds = np.around(np.arange(-10,10, 0.1), decimals=1).tolist() thresholds= list(dict.fromkeys(thresholds)) bnum = 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): bnum += 1 batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, newline_mask, pos_weights, *label_id_list = batch with torch.no_grad(): logits,loss, loss_list = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, 
newline_mask= newline_mask, labels=label_id_list, pos_weights=pos_weights) eval_loss += loss.mean().item() bce_loss += loss_list[0].mean().item() cross_loss += loss_list[2].mean().item() token_loss += loss_list[1].mean().item() bce_logits = logits[0] token_logits = logits[1] evaldict["binary_mask"] = np.append(evaldict["binary_mask"], newline_mask.detach().cpu().numpy()) for l_id, label in enumerate(label_list[0]): cur_labels = label_id_list[l_id] bin_label_len = len(cur_labels) cur_logits = bce_logits[:, :, l_id] cur_logits_n = cur_logits.detach().cpu().numpy() cur_labels_n = cur_labels.detach().cpu().numpy() evaldict[label +"logits"] = np.append(evaldict[label +"logits"], cur_logits_n) evaldict[label +"labels"]= np.append(evaldict[label +"labels"], cur_labels_n) if (l_id == 0 or l_id == 1) and bnum < 5: mask = newline_mask[0].detach().cpu().numpy() text = " ".join(tokenizer.convert_ids_to_tokens(input_ids.cpu().numpy().tolist()[0])) print("\n\n1. TEXT:\n") print(text) print("\n\n2. LOGITS: \n") print(sigmoid(cur_logits[0].cpu().numpy())[mask == 1]) print("\n\n3. LABELS: \n") print(cur_labels[0].cpu().numpy()[mask == 1]) print("\n\n\n") # for thresh in thresholds: # threshed_logs = cur_logits > thresh # threshed_logs = ((cur_logits == 0).float() * -100).float() + (cur_logits != 0).float() * threshed_logs.float() # cur_labels_cpu = ((cur_logits == 0).float() * -100.0).float() + (cur_logits != 0).float() * cur_labels.float() # cur_labels_cpu = cur_labels_cpu.detach().numpy() # threshed_logs = threshed_logs.detach().numpy() # ignoring= (cur_logits == 0).sum().detach().numpy() # threshed_logs = threshed_logs[threshed_logs != -100] # cur_labels_cpu = cur_labels_cpu[cur_labels_cpu != -100] # # acc = ((cur_labels_cpu == threshed_logs).sum() - ignoring) / (cur_labels_cpu.size - ignoring) # acc = (cur_labels_cpu == threshed_logs).sum() / cur_labels_cpu.size # f1= f1_score(cur_labels_cpu, threshed_logs) # #import pdb; pdb.set_trace() # #acc = compute_metrics("meta", threshed_logs, cur_labels_cpu, ignoring) # # if label == "new_topic" and thresh == -1.0: # #import pdb; pdb.set_trace() # result[str(thresh)+ "_" + label + "_f1"] += f1 / len_bce # result[str(thresh)+ "_" + label + "_acc"] += acc / len_bce for l_id, label in enumerate(label_list[1]): cur_labels = label_id_list[l_id +len(label_list[0])] cur_logits = token_logits[:, :, l_id] cur_logits_n = cur_logits.detach().cpu().numpy() cur_labels_n = cur_labels.detach().cpu().numpy() evaldict[label +"logits"] = np.append(evaldict[label +"logits"], cur_logits_n) evaldict[label +"labels"]= np.append(evaldict[label +"labels"], cur_labels_n) if (l_id == 0) and bnum < 5: text = " ".join(tokenizer.convert_ids_to_tokens(input_ids.cpu().numpy().tolist()[0])) print("\n\n1. TEXT:\n") print(text) print("\n\n2. LOGITS: \n") print(sigmoid(cur_logits[0].cpu().numpy())) print("\n\n3. 
LABELS: \n") print(cur_labels[0].cpu().numpy()) print("\n\n\n") nb_eval_steps += 1 # if len(preds) == 0: # preds.append(logits.detach().cpu().numpy()) # out_label_ids = label_ids.detach().cpu().numpy() # else: # preds[0] = np.append( # preds[0], logits.detach().cpu().numpy(), axis=0) # out_label_ids = np.append( # out_label_ids, label_ids.detach().cpu().numpy(), axis=0) # eval_loss = eval_loss / nb_eval_steps # preds = preds[0] # if output_mode == "classification": # preds = np.argmax(preds, axis=1) # elif output_mode == "regression": # preds = np.squeeze(preds) # result = compute_metrics(task_name, preds, out_label_ids) # bestf1 = 0 # bestf1name = "" # for key in sorted(result.keys()): # if "f1" in key and "new_topic" in key: # if result[key] > bestf1: # bestf1 = result[key] # bestf1name = float(key.replace("_f1", "").replace("new_topic", "").replace("_", "")) # result = 0 # result = defaultdict(float) # result["zbestf1_"] = bestf1 # result["zbestf1_threshold"] = bestf1name for l_id, label in enumerate(label_list[0]): binary_mask = evaldict["binary_mask"] cur_labels = evaldict[label +"labels"] cur_preds = evaldict[label+"logits"] cur_labels = cur_labels[binary_mask == 1] cur_preds = cur_preds[binary_mask == 1] cur_ignore = cur_labels != -1 cur_labels = cur_labels[cur_ignore] cur_preds = cur_preds[cur_ignore] cur_preds = sigmoid(cur_preds) try: if len(cur_labels) == 0: precision = np.array([0.0]) recall = np.array([0.0]) thresh = np.array([0.0]) else: precision, recall, thresh = precision_recall_curve(cur_labels, cur_preds) except: import pdb; pdb.set_trace() all_f1 = f1_calculate(precision, recall) maxindex = np.argmax(all_f1) result[label+"_best_thresh"] = thresh[maxindex] best_tresh = thresh[maxindex] if len(cur_labels) > 0: threshed_val = cur_preds > best_tresh conf = confusion_matrix(cur_labels, threshed_val) print(f"Confusion Matrix for {label}\n") print(conf) result[label+"_best_f1"] = all_f1[maxindex] result[label+"atbf1_best_precision"] = precision[maxindex] result[label+"atbf1_best_recall"] = recall[maxindex] if len(cur_labels) == 0: result[label +"_pr_auc_score"] = 0.0 else: result[label +"_pr_auc_score"] = auc(recall, precision) # token metrics for l_id, label in enumerate(label_list[1]): cur_labels = evaldict[label +"labels"] cur_preds = evaldict[label+"logits"] cur_ignore = cur_labels != -1 cur_labels = cur_labels[cur_ignore] cur_preds = cur_preds[cur_ignore] cur_preds = sigmoid(cur_preds) try: if len(cur_labels) == 0: precision = np.array([0.0]) recall = np.array([0.0]) thresh = np.array([0.0]) else: precision, recall, thresh = precision_recall_curve(cur_labels, cur_preds) except: import pdb; pdb.set_trace() all_f1 = f1_calculate(precision, recall) maxindex = np.argmax(all_f1) result[label+"_best_thresh"] = thresh[maxindex] best_tresh = thresh[maxindex] if len(cur_labels) > 0: threshed_val = cur_preds > best_tresh conf = confusion_matrix(cur_labels, threshed_val) print(f"Confusion Matrix for {label}\n") print(conf) result[label+"_best_f1"] = all_f1[maxindex] result[label+"_f1_best_precision"] = precision[maxindex] result[label+"_f1_best_recall"] = recall[maxindex] if len(cur_labels) == 0: result[label +"_pr_auc_score"] = 0.0 else: result[label +"_pr_auc_score"] = auc(recall, precision) if global_step == 0: loss = tr_loss/1 else: loss = tr_loss/global_step if args.do_train else None #result = {} result['eval_loss'] = eval_loss result["bce_loss"] = bce_loss result["cross_loss"] = cross_loss result["token_loss"] = token_loss result['global_step'] = global_step result['loss'] = 
loss for key in sorted(result.keys()): experiment.log_metric(key,result[key], number_of_epochs) # output_eval_file = os.path.join(args.output_dir, "eval_results.txt") # with open(output_eval_file, "w") as writer: # logger.info("***** Eval results *****") # for key in sorted(result.keys()): # logger.info(" %s = %s", key, str(result[key])) # writer.write("%s = %s\n" % (key, str(result[key]))) important_keys = ["self_con","secondary_relevance", "topic_words"] sum_of_scores = 0.0 for ikd, ik in enumerate(important_keys): sum_of_scores += result[ik + "_pr_auc_score"] if ikd == 0: sum_of_scores += result[ik + "_pr_auc_score"] if sum_of_scores > best_sum_of_scores: threshs = [ result[ts+"_best_thresh"] for ts in important_keys] save_model(model, args.output_dir, threshs, sum_of_scores/4) best_sum_of_scores = sum_of_scores return best_sum_of_scores if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_features = processor.get_train_examples(args.data_dir) train_examples = train_features all_input_ids = torch.tensor([f["input_ids"] for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f["input_mask"] for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f["segment_ids"] for f in train_features], dtype=torch.long) input_len = all_input_ids.size(1) def index2onehot(features, keyname): returnlist = [] for f in features: cur_list = [] for position_i in range(input_len): if position_i in f[keyname]: cur_list.append(1) else: cur_list.append(0) returnlist.append(cur_list) return returnlist def labelindex2binary(label, newline_mask, input_len, ignore=-1): zeros = [ignore]* input_len for maskid, mask in enumerate(newline_mask): zeros[mask] = label[maskid] return zeros newline_mask = torch.tensor(index2onehot(train_features, "[newline]"), dtype=torch.long) #newline_mask = torch.tensor([f["Newline"] for f in train_features], dtype=torch.long) list_binary_labels = [] for lb in label_list[0]: list_binary_labels.append(torch.tensor([labelindex2binary(f[lb], f["[newline]"], input_len=input_len) for f in train_features], dtype=torch.long)) pos_weights = [] for lbid, lb in enumerate(label_list[0]): pos_cases = 0 neg_cases = 0 for example in train_features: cur_arr = np.array(example[lb]) cur_arr = cur_arr[cur_arr != -1] size = cur_arr.size pos = cur_arr.sum() pos_cases += pos neg_cases = neg_cases + size - pos if pos_cases > 0: ratio = neg_cases / pos_cases else: ratio = 1.0 pos_weights.append(ratio) experiment.log_metric(f"positive training labels for class: {lb}",pos_cases) experiment.log_metric(f"negative training labels for class: {lb}",neg_cases) pos_weights = torch.tensor(pos_weights) #pos_weights = [pos_weights] * len(train_features) #pos_weights = None pos_weights = pos_weights.expand(all_input_ids.size(0), -1) list_span_labels = [] for lb in label_list[1]: list_span_labels.append(torch.tensor([f[lb] for f in train_features], dtype=torch.long)) list_multi_labels = [] for lb in label_list[2]: list_multi_labels.append(torch.tensor([labelindex2binary(f[lb[0]], f["[newline]"], input_len=input_len, ignore=-1) for f in train_features], dtype=torch.long)) # prepare label information for undersampler label_mins = [defaultdict(int)] * len(full_label_list) label_type_list = ["x"] * len(train_features) label_number_list = ["x"] * len(train_features) for lbid, lb in enumerate(full_label_list): if type(lb) == list: lb = lb[0] for exid, example in enumerate(train_features): cur_arr = example[lb] arr_number = sum(cur_arr) + 
len(cur_arr) if arr_number != 0: label_number_list[exid] = arr_number label_type_list[exid] = lbid label_mins[lbid][arr_number] += 1 assert ("x" not in label_type_list) assert ("x" not in label_number_list) for lid, lm in enumerate(label_mins): label_mins[lid] = min(lm.values()) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, newline_mask, pos_weights, *list_binary_labels, *list_span_labels, *list_multi_labels) if args.local_rank == -1: # if args.weighted_sampling: # class_weightings = [] # for f in train_features: # for lblist in label_list: # for lb in lblist: # if type(lb) == list: # lb = lb[0] # f["activelist"] #train_sampler = RandomWeightedSampler(train_data) #else: if args.random_sampling: train_sampler = RandomSampler(train_data) else: train_sampler = UnderSampler( label_mins, label_type_list, label_number_list) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs def sample_active(label_mins, label_type_list, label_number_list ,train_data,model): """ Goes through each train example and evaluates them. The indices of the ranking are then used to create a new Undersampler and then a new dataloader is returned """ resultlist = [] sample_dataloader = train_dataloader = DataLoader(train_data, sampler=SequentialSampler(train_data), batch_size=1) for sampleid, batch in enumerate(tqdm(sample_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, newline_mask, pos_weights, *label_id_list = batch with torch.no_grad(): logits, loss, loss_list = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, newline_mask= newline_mask, labels=label_id_list, pos_weights=pos_weights) loss = loss.detach().cpu().numpy() resultlist.append(loss) sample_dataloader = 0 resultlist = np.array(resultlist) sorted_resultlist = np.argsort(resultlist).tolist() sorted_resultlist.reverse() new_sampler = UnderSampler( label_mins, label_type_list, label_number_list, sorted_resultlist) return_dataloader = DataLoader(train_data, sampler=new_sampler, batch_size=args.train_batch_size) return return_dataloader # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) logger.info("***** Running training *****") logger.info(" Num examples = %d", 
len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) number_of_epochs = -1 best_sum_of_scores = 0.0 model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): tr_loss = 0 number_of_epochs += 1 nb_tr_examples, nb_tr_steps = 0, 0 if number_of_epochs % 1 == 0: best_sum_of_scores =evaluate(number_of_epochs=number_of_epochs ,best_sum_of_scores = best_sum_of_scores) if number_of_epochs > 0 and args.active_sampling: train_dataloader = sample_active(label_mins, label_type_list, label_number_list ,train_data, model) model.train() for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, newline_mask, pos_weights, *label_id_list = batch # define a new function to compute loss values for both output_modes logits, loss, loss_list = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, newline_mask= newline_mask, labels=label_id_list, pos_weights=pos_weights) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) experiment.log_metric("lr",optimizer.get_lr()[0], global_step) experiment.log_metric("train_loss",loss.item(), global_step) ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() ### Example: if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMetaClassification.from_pretrained(args.output_dir, num_binary_labels=num_binary_labels, num_span_labels=num_span_labels, num_multi_labels=num_multi_labels) tokenizer = BertTokenizer("savedmodel/vocab.txt", never_split = stopper) # Good practice: save your training arguments together with the trained model output_args_file = os.path.join(args.output_dir, 'training_args.bin') torch.save(args, output_args_file) else: model = BertForMetaClassification.from_pretrained(args.bert_model, num_binary_labels=num_binary_labels, num_span_labels=num_span_labels, num_multi_labels=num_multi_labels) model.to(device)
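# The evaluate() routine above picks a per-label decision threshold by sweeping the
# precision-recall curve, maximizing F1, and reporting PR-AUC. The sketch below is a
# minimal, self-contained illustration of that thresholding step only; it uses
# synthetic labels/probabilities and hypothetical helper names, and is not the
# project's own code.
import numpy as np
from sklearn.metrics import auc, precision_recall_curve


def f1_from_pr(precision, recall):
    # Safe F1: zero wherever precision + recall is (numerically) zero.
    num = 2 * precision * recall
    den = precision + recall
    return np.divide(num, den, out=np.zeros_like(num), where=den > 1e-4)


def best_threshold(labels, probs):
    precision, recall, thresh = precision_recall_curve(labels, probs)
    f1 = f1_from_pr(precision, recall)
    best = int(np.argmax(f1))
    # precision_recall_curve returns one fewer threshold than PR points,
    # so clamp the index when the best F1 sits at the final point.
    t = thresh[min(best, len(thresh) - 1)]
    return t, f1[best], auc(recall, precision)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    labels = rng.integers(0, 2, size=200)
    probs = np.clip(labels * 0.6 + rng.normal(0.2, 0.25, size=200), 0.0, 1.0)
    t, best_f1, pr_auc = best_threshold(labels, probs)
    print(f"best_thresh={t:.3f} best_f1={best_f1:.3f} pr_auc={pr_auc:.3f}")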
def train(args, model, processor, tokenizer, device, n_gpu):
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    data, num_examples = features(args, processor, "train", tokenizer)
    data = TensorDataset(*data)
    if args.local_rank == -1:
        sampler = RandomSampler(data)
    else:
        sampler = DistributedSampler(data)
    data_loader = DataLoader(data, sampler=sampler,
                             batch_size=args.train_batch_size)
    # Total optimizer steps over all epochs; used as t_total for the warmup schedule
    # (one optimizer step per gradient_accumulation_steps batches).
    num_train_optimization_steps = (
        len(data_loader) // args.gradient_accumulation_steps
        * args.num_train_epochs)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex to use "
                              "distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
        warmup_linear = WarmupLinearSchedule(
            warmup=args.warmup_proportion,
            t_total=num_train_optimization_steps)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", num_examples)
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)

    model.train()
    loss_fct = MarginRankingLoss(margin=args.margin)
    ckpt_num = 0
    eval_results_history = []
    best = 0.
    best_props = {}
    eval_result = None
    no_improvement = 0
    t = time.time()
    try:
        for num_epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            if no_improvement > args.tolerance:
                logger.info(
                    "No improvement in last %d evaluations, early stopping",
                    no_improvement)
                logger.info(
                    "epoch: {} | nb_tr_steps: {} | global_step: {} | tr_loss: {}"
                    .format(num_epoch, nb_tr_steps, global_step, tr_loss))
                # stop training once the tolerance is exceeded
                break
            for step, batch in enumerate(tqdm(data_loader, desc="Iteration")):
                print(nb_tr_steps)
                batch = tuple(t.to(device) for t in batch)
                input_ids, segment_ids, mask_ids = batch
                # <question, +ve doc> pairs
                input_ids_qp, segment_ids_qp, input_mask_qp = \
                    input_ids[:, 0, :], segment_ids[:, 0, :], mask_ids[:, 0, :]
                # <question, -ve doc> pairs
                input_ids_qn, segment_ids_qn, input_mask_qn = \
                    input_ids[:, 1, :], segment_ids[:, 1, :], mask_ids[:, 1, :]
                pos_scores = model(input_ids_qp, segment_ids_qp, input_mask_qp)
                neg_scores = model(input_ids_qn, segment_ids_qn, input_mask_qn)
                # y all 1s to indicate positive should be higher
                y = torch.ones(len(pos_scores)).float().to(device)
                loss = loss_fct(pos_scores, neg_scores, y)
                if nb_tr_steps % 10 == 0 and nb_tr_steps != 0:
                    logger.info("+ve scores : %r" % pos_scores)
                    logger.info("-ve scores : %r" % neg_scores)
                    logger.info("Train step loss : %0.5f" % loss.item())
                    if global_step > 0:
                        logger.info("Train total loss : %0.5f" %
                                    (tr_loss / global_step))
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses;
                        # if args.fp16 is False, BertAdam handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
                if nb_tr_steps % config.eval_every_step == 0 and nb_tr_steps != 0:
                    eval_result = eval(args, model, processor, tokenizer,
                                       device, tr_loss, global_step)
                    if eval_result["f1"] >= best:
                        save(
                            model, "%s_%0.3f_%0.3f_%0.3f" %
                            (args.model_name, eval_result["precision"],
                             eval_result["recall"], eval_result["f1"]),
                            args, tokenizer, ckpt_num)
                        best = eval_result["f1"]
                        best_props["num_epoch"] = num_epoch
                        best_props["nb_tr_steps"] = nb_tr_steps
                        best_props["tr_loss"] = tr_loss / global_step
                        best_props["ckpt_num"] = ckpt_num
                        best_props["global_step"] = global_step
                        best_props["eval_result"] = eval_result
                        with open(os.path.join(config.output_dir, "best.json"),
                                  "w") as wf:
                            json.dump(best_props, wf, indent=2)
                        # make predictions with best model
                        for i in range(1, 6):
                            predict(args, model, processor, tokenizer, device, i)
                        no_improvement = 0
                    else:
                        no_improvement += 1
                    ckpt_num += 1
                    eval_results_history.append((ckpt_num, eval_result))
    except KeyboardInterrupt:
        logger.info("Training interrupted!")
        if eval_result is not None:
            save(
                model, "%s_%0.3f_%0.3f_%0.3f_interrupted" %
                (args.model_name, eval_result["precision"],
                 eval_result["recall"], eval_result["f1"]),
                args, tokenizer, ckpt_num)

    t = time.time() - t
    logger.info("Training took %0.3f seconds" % t)
    loss = tr_loss / global_step if global_step > 0 else 0.0
    logger.info("Final training loss %0.5f" % loss)
    logger.info("Best F1-score on eval set : %0.3f" % best)
    logger.info("***** Eval best props *****")
    for key in sorted(best_props.keys()):
        if key != "eval_result":
            logger.info(" %s = %s", key, str(best_props[key]))
        else:
            for eval_key in sorted(best_props[key].keys()):
                logger.info(" %s = %s", eval_key,
                            str(best_props[key][eval_key]))
    with open(os.path.join(config.output_dir, "eval_results_history.pkl"),
              "wb") as wf:
        pickle.dump(eval_results_history, wf)
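# The train() function above optimizes a pairwise ranking objective: the model scores
# a <question, positive doc> pair and a <question, negative doc> pair, and
# MarginRankingLoss pushes the positive score above the negative one by a margin.
# The snippet below is a minimal sketch of one such loss step with a stand-in linear
# scorer and an arbitrary margin; it is illustrative only and does not reproduce the
# BERT ranking model used here.
import torch
from torch.nn import MarginRankingLoss

scorer = torch.nn.Linear(8, 1)            # placeholder for model(input_ids, segment_ids, mask)
loss_fct = MarginRankingLoss(margin=0.5)  # stands in for args.margin

pos_feats = torch.randn(4, 8)  # features of <question, +ve doc> pairs
neg_feats = torch.randn(4, 8)  # features of <question, -ve doc> pairs

pos_scores = scorer(pos_feats).squeeze(-1)
neg_scores = scorer(neg_feats).squeeze(-1)

# y = 1 means the first argument (the positive pair) should score higher.
y = torch.ones_like(pos_scores)
loss = loss_fct(pos_scores, neg_scores, y)
loss.backward()
print(loss.item())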
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model model = BertForMultipleChoice.from_pretrained( args.bert_model, cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), num_choices=2) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.do_train: if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() # Prepare data loader train_examples = read_copa_examples(os.path.join( args.data_dir, 'train.jsonl'), is_training=True) train_features = convert_examples_to_features(train_examples, tokenizer, args.max_seq_length, True) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) num_train_optimization_steps = len( train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.local_rank in [-1, 0]: tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) tb_writer.add_scalar('loss', loss.item(), global_step) if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultipleChoice.from_pretrained(args.output_dir, num_choices=2) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, num_choices=2) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = read_copa_examples(os.path.join( args.data_dir, 'val.jsonl'), is_training=True) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), 
dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() print(logits, label_ids) label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / global_step } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
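# The COPA evaluation loop above calls an accuracy(logits, label_ids) helper that is
# defined elsewhere in the repository. The function below is a hedged guess at its
# behaviour for multiple-choice logits (argmax over the choice dimension, count exact
# matches); it is an assumption, not the repository's implementation.
import numpy as np


def accuracy(out, labels):
    # out: (batch, num_choices) logits; labels: (batch,) gold choice indices.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


if __name__ == "__main__":
    logits = np.array([[0.2, 1.3], [2.0, -0.5]])  # two examples, two choices each
    labels = np.array([1, 0])
    print(accuracy(logits, labels))  # -> 2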