def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_data_dir", default="", type=str,  # required=True
                        help="The input train corpus.")
    parser.add_argument("--val_data_dir", default="", type=str,  # required=True
                        help="The input val corpus.")
    parser.add_argument("--from_pretrained", default="", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir", default="save", type=str,  # required=True
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--config_file", default="config/bert_base_6layer_interbert.json", type=str,  # required=True
                        help="The config file which specifies the model details.")

    # Other parameters
    parser.add_argument("--max_seq_length", default=36, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                        "Sequences longer than this will be truncated, and sequences shorter "
                        "than this will be padded.")
    parser.add_argument("--predict_feature", action="store_true", help="visual target.")
    parser.add_argument("--train_batch_size", default=512, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=1e-4, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--start_epoch", default=0, type=float,
                        help="Epoch to start (or resume) training from.")
    parser.add_argument("--continue_training", action="store_true",
                        help="Add this flag to continue a stopped pretraining procedure.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--img_weight", default=1, type=float, help="weight for image loss")
    parser.add_argument("--itm_weight", default=1, type=float, help="weight for itm loss")
    parser.add_argument("--text_weight", default=1, type=float, help="weight for text loss")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--do_lower_case", type=bool, default=True,
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--fp16", action="store_true",
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument("--loss_scale", type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--num_workers", type=int, default=3,
                        help="Number of workers in the dataloader.")
    parser.add_argument("--save_name", default="", type=str, help="save name for training.")
    parser.add_argument("--freeze", default=-1, type=int,
                        help="Layer of the textual stream up to which the weights are kept fixed.")
    parser.add_argument("--distributed", action="store_true",
                        help="whether to use chunks for parallel training.")
    parser.add_argument("--without_coattention", action="store_true",
                        help="whether to disable co-attention (also zeroes the pair/ITM loss).")
    parser.add_argument("--span_mask", action="store_true", help="whether to use span masking.")
    parser.add_argument("--cond_mask", action="store_true",
                        help="whether to use the conditional masking method.")
    parser.add_argument("--dynamic_masking", action="store_true",
                        help="whether to use dynamic masking")

    args = parser.parse_args()
    print(args)

    if args.save_name != '':
        timeStamp = args.save_name
    else:
        timeStamp = strftime("%d-%b-%y-%X-%a", gmtime())
        timeStamp += "_{:0>6d}".format(random.randint(0, int(10e6)))

    savePath = os.path.join(args.output_dir, timeStamp)
    if not os.path.exists(savePath):
        os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if args.freeze > config.t_biattention_id[0]:
        config.fixed_t_layer = config.t_biattention_id[0]
    if args.without_coattention:
        config.with_coattention = False

    bert_weight_name = json.load(
        open("config/" + "bert-base-uncased_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    num_train_optimization_steps = None
    viz = TBlogger("logs", timeStamp)

    train_dataset = ConceptCapLoaderTrain(
        args.train_data_dir,
        tokenizer,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        predict_feature=args.predict_feature,
        num_workers=args.num_workers,
        distributed=args.distributed,
        span_mask=args.span_mask,
        cond_mask=args.cond_mask)

    validation_dataset = ConceptCapLoaderVal(
        args.val_data_dir,
        tokenizer,
        seq_len=args.max_seq_length,
        batch_size=args.train_batch_size,
        predict_feature=args.predict_feature,
        num_workers=2,
        distributed=args.distributed,
        span_mask=args.span_mask,
        cond_mask=args.cond_mask)

    if args.continue_training:
        assert args.start_epoch > 0  # must have pretrained at least one epoch
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.num_train_epochs)
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) * args.start_epoch)
        if args.cond_mask:
            finished_steps *= 2
    else:
        num_train_optimization_steps = (
            int(train_dataset.num_dataset / args.train_batch_size /
                args.gradient_accumulation_steps) *
            (args.num_train_epochs - args.start_epoch))
        if args.cond_mask:
            num_train_optimization_steps *= 2
        finished_steps = 0

    default_gpu = False
    if dist.is_available() and args.distributed:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, 'command.txt'), 'w') as f:
            print(args, file=f)  # Python 3.x
            print('\n', file=f)
            print(config, file=f)

    if args.predict_feature:
        config.v_target_size = 2048
        config.predict_feature = True
    else:
        config.v_target_size = 1601
        config.predict_feature = False

    if args.from_pretrained:
        if args.continue_training:
            ckpt_load_path = os.path.join(
                args.from_pretrained,
                "pytorch_model_{}.bin".format(int(args.start_epoch) - 1))
            model = InterBertForMultiModalPreTraining.from_pretrained(
                ckpt_load_path, config)
        else:
            model = InterBertForMultiModalPreTraining.from_pretrained(
                args.from_pretrained, config)
    else:
        model = InterBertForMultiModalPreTraining(config)

    model.cuda()

    if args.fp16:
        model.half()
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    if args.freeze != -1:
        # freeze the embeddings and the lower encoder layers of the textual stream
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if 'embeddings' in name:
                bert_weight_name_filtered.append(name)
            elif 'encoder' in name:
                layer_num = name.split('.')[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    if not args.from_pretrained:
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
    else:
        # pretrained BERT weights get a 10x smaller learning rate;
        # no_decay parameters get zero weight decay (matching the grouping above).
        optimizer_grouped_parameters = []
        for key, value in dict(model.named_parameters()).items():
            if value.requires_grad:
                if key[12:] in bert_weight_name:
                    lr = args.learning_rate * 0.1
                else:
                    lr = args.learning_rate
                if any(nd in key for nd in no_decay):
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.0}]
                else:
                    optimizer_grouped_parameters += [
                        {"params": [value], "lr": lr, "weight_decay": 0.01}]
        if default_gpu:
            print(len(list(model.named_parameters())),
                  len(optimizer_grouped_parameters))

    # set different parameters for the vision branch and the language branch.
    if args.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use distributed and fp16 training.")

        optimizer = FusedAdam(
            optimizer_grouped_parameters,
            lr=args.learning_rate,
            bias_correction=False,
            max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        if args.from_pretrained:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        if args.continue_training:
            opt_state_dict_path = os.path.join(
                args.from_pretrained,
                "optimizer_state_{}.bin".format(int(args.start_epoch) - 1))
            optimizer.load_state_dict(
                torch.load(opt_state_dict_path, map_location='cpu'))

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", train_dataset.num_dataset)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_optimization_steps - finished_steps)

    startIterID = 0
    global_step = finished_steps
    masked_loss_v_tmp = 0
    masked_loss_t_tmp = 0
    next_sentence_loss_tmp = 0
    loss_tmp = 0
    start_t = timer()

    for epochId in range(int(args.start_epoch), int(args.num_train_epochs)):
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(train_dataset):
            iterId = startIterID + step + (epochId * len(train_dataset))
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, multimodal_mask, lm_label_ids, image_label,
                image_target, is_next)

            if args.without_coattention:
                next_sentence_loss = next_sentence_loss * 0

            masked_loss_v = masked_loss_v * args.img_weight
            next_sentence_loss = next_sentence_loss * args.itm_weight
            loss = masked_loss_t * args.text_weight + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            if math.isnan(loss.item()):
                pdb.set_trace()

            tr_loss += loss.item()

            if dist.is_available() and args.distributed:
                rank = dist.get_rank()
            else:
                rank = 0

            viz.linePlot(iterId, loss.item(), "loss_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_t.item(),
                         "masked_loss_t_" + str(rank), "train")
            viz.linePlot(iterId, masked_loss_v.item(),
                         "masked_loss_v_" + str(rank), "train")
            viz.linePlot(iterId, next_sentence_loss.item(),
                         "next_sentence_loss_" + str(rank), "train")

            loss_tmp += loss.item()
            masked_loss_v_tmp += masked_loss_v.item()
            masked_loss_t_tmp += masked_loss_t.item()
            next_sentence_loss_tmp += next_sentence_loss.item()

            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with the special warm up BERT uses;
                    # if args.fp16 is False, BertAdam handles this automatically
                    lr_this_step = args.learning_rate * warmup_linear(
                        global_step / num_train_optimization_steps,
                        args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group["lr"] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if step % 20 == 0 and step != 0:
                masked_loss_t_tmp = masked_loss_t_tmp / 20.0
                masked_loss_v_tmp = masked_loss_v_tmp / 20.0
                next_sentence_loss_tmp = next_sentence_loss_tmp / 20.0
                loss_tmp = loss_tmp / 20.0

                end_t = timer()
                timeStamp = strftime("%a %d %b %y %X", gmtime())

                Ep = epochId + nb_tr_steps / float(len(train_dataset))
                printFormat = ("[%s][Ep: %.2f][Iter: %d][Time: %5.2fs][Loss: %.5g]"
                               "[Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g][LR: %.8g]")
                printInfo = [
                    timeStamp, Ep, nb_tr_steps, end_t - start_t, loss_tmp,
                    masked_loss_v_tmp, masked_loss_t_tmp,
                    next_sentence_loss_tmp,
                    optimizer.get_lr()[0],
                ]
                start_t = end_t
                print(printFormat % tuple(printInfo))

                masked_loss_v_tmp = 0
                masked_loss_t_tmp = 0
                next_sentence_loss_tmp = 0
                loss_tmp = 0

        # Do the evaluation
        torch.set_grad_enabled(False)
        start_t = timer()
        numBatches = len(validation_dataset)
        eval_masked_loss_t = 0
        eval_masked_loss_v = 0
        eval_next_sentence_loss = 0
        eval_total_loss = 0

        model.eval()
        for step, batch in enumerate(validation_dataset):
            batch = tuple(t.cuda(device=device, non_blocking=True) for t in batch)

            (input_ids, input_mask, segment_ids, lm_label_ids, is_next,
             image_feat, image_loc, image_target, image_label, image_mask,
             multimodal_mask, image_ids) = batch

            masked_loss_t, masked_loss_v, next_sentence_loss = model(
                input_ids, image_feat, image_loc, segment_ids, input_mask,
                image_mask, multimodal_mask, lm_label_ids, image_label,
                image_target, is_next)

            masked_loss_v = masked_loss_v * args.img_weight
            loss = masked_loss_t + masked_loss_v + next_sentence_loss

            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
                masked_loss_t = masked_loss_t.mean()
                masked_loss_v = masked_loss_v.mean()
                next_sentence_loss = next_sentence_loss.mean()

            eval_masked_loss_t += masked_loss_t.item()
            eval_masked_loss_v += masked_loss_v.item()
            eval_next_sentence_loss += next_sentence_loss.item()
            eval_total_loss += loss.item()

            end_t = timer()
            delta_t = " Time: %5.2fs" % (end_t - start_t)
            start_t = end_t
            progressString = "\r Evaluating split '%s' [%d/%d]\t" + delta_t
            sys.stdout.write(progressString % ('val', step + 1, numBatches))
            sys.stdout.flush()

        eval_masked_loss_t = eval_masked_loss_t / float(numBatches)
        eval_masked_loss_v = eval_masked_loss_v / float(numBatches)
        eval_next_sentence_loss = eval_next_sentence_loss / float(numBatches)
        eval_total_loss = eval_total_loss / float(numBatches)

        printFormat = "Evaluation: [Loss: %.5g][Loss_v: %.5g][Loss_t: %.5g][Loss_n: %.5g]"
        printInfo = [
            eval_total_loss, eval_masked_loss_v, eval_masked_loss_t,
            eval_next_sentence_loss
        ]
        print(printFormat % tuple(printInfo))

        torch.set_grad_enabled(True)

        viz.linePlot(epochId, eval_total_loss, "loss_" + str(rank), "val")
        viz.linePlot(epochId, eval_masked_loss_t, "masked_loss_t_" + str(rank), "val")
        viz.linePlot(epochId, eval_masked_loss_v, "masked_loss_v_" + str(rank), "val")
        viz.linePlot(epochId, eval_next_sentence_loss,
                     "next_sentence_loss_" + str(rank), "val")

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine-tuned model ** ** * ")
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Only save the model itself

            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            torch.save(model_to_save.state_dict(), output_model_file)

            output_opt_state_dict_file = os.path.join(
                savePath, "optimizer_state_" + str(epochId) + ".bin")
            torch.save(optimizer.state_dict(), output_opt_state_dict_file)

        if args.dynamic_masking and epochId + 1 < int(args.num_train_epochs):
            # rebuild the training loader so fresh masks are drawn for the next epoch
            del train_dataset
            gc.collect()
            train_dataset = ConceptCapLoaderTrain(
                args.train_data_dir,
                tokenizer,
                seq_len=args.max_seq_length,
                batch_size=args.train_batch_size,
                predict_feature=args.predict_feature,
                num_workers=args.num_workers,
                distributed=args.distributed,
                span_mask=args.span_mask)
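
# The fp16 branches above (and in the training loops below) adjust the learning
# rate manually via warmup_linear(); the non-fp16 paths rely on BertAdam/AdamW
# handling warmup internally. A minimal sketch of that schedule, assuming the
# classic pytorch_pretrained_bert-style definition (linear warmup to the peak
# rate, then linear decay towards zero); the project may import its own version.
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed, i.e. global_step / t_total.
    if x < warmup:
        return x / warmup
    return 1.0 - x
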
def main(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        logger.addHandler(logging.FileHandler(
            os.path.join(args.output_dir, "train.log"), 'w'))
    else:
        logger.addHandler(logging.FileHandler(
            os.path.join(args.output_dir, "eval.log"), 'w'))
    logger.info(args)
    logger.info("device: {}, n_gpu: {}, 16-bits training: {}".format(
        device, n_gpu, args.fp16))

    processor = DataProcessor()
    label_list = processor.get_labels(args.data_dir, args.negative_label)
    label2id = {label: i for i, label in enumerate(label_list)}
    id2label = {i: label for i, label in enumerate(label_list)}
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.do_lower_case)
    special_tokens = {}

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir, tokenizer)
        eval_features = convert_examples_to_features(
            eval_examples, label2id, args.max_seq_length, tokenizer,
            special_tokens, args.feature_mode)
        logger.info("***** Dev *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                  all_label_ids, all_span_ids)
        eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
        eval_label_ids = all_label_ids

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir, tokenizer)
        train_features = convert_examples_to_features(
            train_examples, label2id, args.max_seq_length, tokenizer,
            special_tokens, args.feature_mode)

        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        all_span_ids = torch.tensor([f.span_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_label_ids, all_span_ids)
        train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size)
        train_batches = [batch for batch in train_dataloader]

        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        logger.info("***** Training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        best_result = None
        eval_step = max(1, len(train_batches) // args.eval_per_epoch)

        # if no learning rate is given, sweep over a small grid
        lrs = [args.learning_rate] if args.learning_rate else \
            [1e-6, 2e-6, 3e-6, 5e-6, 1e-5, 2e-5, 3e-5, 5e-5]
        for lr in lrs:
            model = BertForMTB.from_pretrained(
                args.model,
                model_name=args.model,
                num_labels=num_labels,
                examples=train_examples,
                mode=args.repre_mode)
            if args.fp16:
                model.half()
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer
                            if not any(nd in n for nd in no_decay)],
                 'weight_decay': 0.01},
                {'params': [p for n, p in param_optimizer
                            if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0}
            ]

            if args.fp16:
                try:
                    from apex.contrib.optimizers import FP16_Optimizer
                    from apex.contrib.optimizers import FusedAdam
                except ImportError:
                    raise ImportError("Please install apex from "
                                      "https://www.github.com/nvidia/apex "
                                      "to use distributed and fp16 training.")
                optimizer = FusedAdam(optimizer_grouped_parameters,
                                      lr=lr,
                                      bias_correction=False,
                                      max_grad_norm=1.0)
                if args.loss_scale == 0:
                    optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
                else:
                    optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
            else:
                optimizer = BertAdam(optimizer_grouped_parameters,
                                     lr=lr,
                                     warmup=args.warmup_proportion,
                                     t_total=num_train_optimization_steps)

            start_time = time.time()
            global_step = 0
            tr_loss = 0
            nb_tr_examples = 0
            nb_tr_steps = 0
            for epoch in range(int(args.num_train_epochs)):
                model.train()
                logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
                if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                    random.shuffle(train_batches)
                for step, batch in enumerate(train_batches):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids, span_ids = batch
                    model_output = model(input_ids, input_mask, segment_ids, span_ids,
                                         label_ids, output_attentions=True,
                                         output_hidden_states=True)
                    # print(model_output)  # debug
                    loss = model_output.loss
                    if n_gpu > 1:
                        loss = loss.mean()
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    if args.fp16:
                        optimizer.backward(loss)
                    else:
                        loss.backward()

                    tr_loss += loss.item()
                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            lr_this_step = lr * \
                                warmup_linear(global_step / num_train_optimization_steps,
                                              args.warmup_proportion)
                            for param_group in optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                        if (step + 1) % eval_step == 0:
                            logger.info('Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'.format(
                                epoch, step + 1, len(train_batches),
                                time.time() - start_time, tr_loss / nb_tr_steps))
                            save_model = False
                            if args.do_eval:
                                preds, result = evaluate(model, device, eval_dataloader,
                                                         eval_label_ids, num_labels)
                                model.train()
                                result['global_step'] = global_step
                                result['epoch'] = epoch
                                result['learning_rate'] = lr
                                result['batch_size'] = args.train_batch_size
                                logger.info("First 20 predictions:")
                                for pred, label in zip(preds[:20], eval_label_ids.numpy()[:20]):
                                    sign = u'\u2713' if pred == label else u'\u2718'
                                    logger.info("pred = %s, label = %s %s" %
                                                (id2label[pred], id2label[label], sign))
                                if (best_result is None) or \
                                        (result[args.eval_metric] > best_result[args.eval_metric]):
                                    best_result = result
                                    save_model = True
                                    logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
                                                (args.eval_metric, str(lr), epoch,
                                                 result[args.eval_metric] * 100.0))
                            else:
                                save_model = True

                            if save_model:
                                model_to_save = model.module if hasattr(model, 'module') else model
                                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                                torch.save(model_to_save.state_dict(), output_model_file)
                                model_to_save.config.to_json_file(output_config_file)
                                tokenizer.save_vocabulary(args.output_dir)
                                if best_result:
                                    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
                                    with open(output_eval_file, "w") as writer:
                                        for key in sorted(result.keys()):
                                            writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_eval:
        if args.eval_test:
            eval_examples = processor.get_test_examples(args.data_dir, tokenizer)
            eval_features = convert_examples_to_features(
                eval_examples, label2id, args.max_seq_length, tokenizer,
                special_tokens, args.feature_mode)
            logger.info("***** Test *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
            all_span_ids = torch.tensor([f.span_id for f in eval_features], dtype=torch.long)
            # include the span ids so the test loader matches the dev loader consumed by evaluate()
            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                      all_label_ids, all_span_ids)
            eval_dataloader = DataLoader(eval_data, batch_size=args.eval_batch_size)
            eval_label_ids = all_label_ids
        model = BertForMTB.from_pretrained(
            args.model,
            model_name=args.model,
            num_labels=num_labels,
            examples=eval_examples,
            mode=args.repre_mode)  # alternatively: from_pretrained(args.output_dir, num_labels=num_labels)
        if args.fp16:
            model.half()
        model.to(device)
        preds, result = evaluate(model, device, eval_dataloader, eval_label_ids, num_labels)
        with open(os.path.join(args.output_dir, "predictions.txt"), "w") as f:
            for ex, pred in zip(eval_examples, preds):
                f.write("%s\t%s\n" % (ex.guid, id2label[pred]))
        with open(os.path.join(args.output_dir, "test_results.txt"), "w") as f:
            for key in sorted(result.keys()):
                f.write("%s = %s\n" % (key, str(result[key])))
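
# The loops above call an evaluate() helper that is not shown in this file.
# A minimal sketch of what it could look like, assuming batches of
# (input_ids, input_mask, segment_ids, label_ids, span_ids), a model whose
# forward pass accepts those inputs without labels and returns an output with
# a .logits field, and plain accuracy as the reported metric; the real helper
# may compute task-specific precision/recall/F1 and take extra arguments.
def evaluate(model, device, eval_dataloader, eval_label_ids, num_labels):
    model.eval()
    all_logits = []
    with torch.no_grad():
        for input_ids, input_mask, segment_ids, label_ids, span_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            span_ids = span_ids.to(device)
            output = model(input_ids, input_mask, segment_ids, span_ids)
            all_logits.append(output.logits.detach().cpu())
    logits = torch.cat(all_logits, dim=0)
    preds = logits.argmax(dim=-1).numpy()
    accuracy = float((preds == eval_label_ids.numpy()).mean())
    # the caller indexes result[args.eval_metric]; only "accuracy" is provided here
    return preds, {"accuracy": accuracy}
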
def main(opts):
    if opts.local_rank == -1:
        assert torch.cuda.is_available()
        device = torch.device("cuda")
        n_gpu = 1
    else:
        torch.cuda.set_device(opts.local_rank)
        device = torch.device("cuda", opts.local_rank)
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        n_gpu = torch.distributed.get_world_size()
    logger.info("device: {} n_gpu: {}, distributed training: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, bool(opts.local_rank != -1), opts.fp16))
    opts.n_gpu = n_gpu

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    is_master = opts.local_rank == -1 or torch.distributed.get_rank() == 0
    # if is_master:
    #     save_training_meta(opts)

    random.seed(opts.seed)
    np.random.seed(opts.seed)
    torch.manual_seed(opts.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(opts.seed)

    tokenizer = BertTokenizer.from_pretrained(
        opts.bert_model, do_lower_case='uncased' in opts.bert_model)

    print("Loading Train Dataset", opts.train_file)
    vocab_dump = torch.load(opts.vocab_file)
    print('vocab dump', vocab_dump)
    # vocab = vocab_dump['tgt'].fields[0][1].vocab.stoi
    vocab = vocab_dump['src'].fields[0][1].vocab.stoi
    train_dataset = BertDataset(opts.train_file, tokenizer, vocab,
                                seq_len=opts.max_seq_length,
                                max_len=opts.max_sent_length)
    print('train dataset', train_dataset[0])
    print('len train dataset', len(train_dataset))

    # Prepare model
    model = BertForSeq2seq.from_pretrained(opts.bert_model)
    embedding = convert_embedding(
        tokenizer, vocab, model.bert.embeddings.word_embeddings.weight)
    # swap the output embedding layer so it covers the new vocabulary instead
    # of the original BERT vocabulary
    model.update_output_layer(embedding)
    if opts.fp16:
        model.half()
    model.to(device)
    if opts.local_rank != -1:
        # need to make sure models are the same in the beginning
        params = [p.data for p in model.parameters()]
        broadcast_tensors(params)
    for name, module in model.named_modules():
        # we might want to tune dropout for smaller datasets
        if isinstance(module, torch.nn.Dropout):
            module.p = opts.dropout

    # Prepare optimizer
    param_optimizer = [(n, p) for n, p in model.named_parameters()
                       if 'pooler' not in n]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if opts.fp16:
        try:
            from apex.contrib.optimizers import FP16_Optimizer
            from apex.contrib.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from "
                              "https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=opts.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if opts.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=opts.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=opts.learning_rate)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Batch size = %d", opts.train_batch_size)
    logger.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    logger.info("  Num steps = %d", opts.num_train_steps)

    if opts.local_rank == -1:
        train_sampler = TokenBucketSampler(
            train_dataset.lens, bucket_size=8192,
            batch_size=opts.train_batch_size, droplast=True)
    else:
        train_sampler = DistributedTokenBucketSampler(
            n_gpu, opts.local_rank, train_dataset.lens,
            bucket_size=8192, batch_size=opts.train_batch_size,
            droplast=True)
    train_dataloader = DataLoader(train_dataset,
                                  batch_sampler=train_sampler,
                                  num_workers=4,
                                  collate_fn=BertDataset.pad_collate)

    if is_master:
        TB_LOGGER.create(join(opts.output_dir, 'log'))
    running_loss = RunningMeter('loss')
    model.train()
    if is_master:
        pbar = tqdm(total=opts.num_train_steps)
    else:
        logger.disabled = True
        pbar = None

    n_examples = 0
    n_epoch = 0
    start = time()
    while True:
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) if t is not None else t for t in batch)
            input_ids, input_mask, segment_ids, lm_label_ids = batch
            n_examples += input_ids.size(0)
            mask = lm_label_ids != -1
            loss = model(input_ids, segment_ids, input_mask,
                         lm_label_ids, mask, True)
            if opts.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1
                if opts.fp16:
                    # modify learning rate with the special warm up BERT uses;
                    # if opts.fp16 is False, AdamW is used, which handles
                    # this automatically
                    lr_this_step = opts.learning_rate * warmup_linear(
                        global_step / opts.num_train_steps,
                        opts.warmup_proportion)
                    if lr_this_step < 0:
                        # safeguard against a possible miscalculation of the
                        # number of train steps
                        lr_this_step = 1e-8
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # NOTE: running loss not gathered across GPUs for speed
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                if opts.local_rank != -1:
                    # gather gradients from every process
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))
                optimizer.step()
                optimizer.zero_grad()
                if pbar is not None:
                    pbar.update(1)
                if global_step % 5 == 0:
                    torch.cuda.empty_cache()
                if global_step % 100 == 0:
                    if opts.local_rank != -1:
                        total = sum(all_gather_list(n_examples))
                    else:
                        total = n_examples
                    if is_master:
                        ex_per_sec = int(total / (time() - start))
                        logger.info(f'{total} examples trained at '
                                    f'{ex_per_sec} ex/s')
                        TB_LOGGER.add_scalar('ex_per_s', ex_per_sec,
                                             global_step)
                if global_step % opts.valid_steps == 0:
                    logger.info(f"start validation at Step {global_step}")
                    with torch.no_grad():
                        val_log = validate(model, opts.valid_src,
                                           opts.valid_tgt, tokenizer, vocab,
                                           device, opts.local_rank)
                    logger.info(f"Val Acc: {val_log['val_acc']}; "
                                f"Val Loss: {val_log['val_loss']}")
                    TB_LOGGER.log_scaler_dict(val_log)
                    if is_master:
                        output_model_file = join(
                            opts.output_dir, 'ckpt',
                            f"model_step_{global_step}.pt")
                        # save cpu checkpoint
                        state_dict = {k: v.cpu()
                                      if isinstance(v, torch.Tensor) else v
                                      for k, v in model.state_dict().items()}
                        torch.save(state_dict, output_model_file)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        if is_master:
            logger.info(f"finished {n_epoch} epochs")

    if opts.num_train_steps % opts.valid_steps != 0:
        with torch.no_grad():
            val_log = validate(model, opts.valid_src, opts.valid_tgt,
                               tokenizer, vocab, device, opts.local_rank)
        TB_LOGGER.log_scaler_dict(val_log)
        if is_master:
            output_model_file = join(opts.output_dir, 'ckpt',
                                     f"model_step_{global_step}.pt")
            # save cpu checkpoint (save the CPU state_dict built above,
            # not model.state_dict() again)
            state_dict = {k: v.cpu() if isinstance(v, torch.Tensor) else v
                          for k, v in model.state_dict().items()}
            torch.save(state_dict, output_model_file)
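
# The loop above tracks the loss with a RunningMeter helper: constructed with a
# name, updated by calling it with new values, and read via .val. A minimal
# sketch under those assumptions, using an exponential moving average; the
# project's own implementation may smooth or reset differently.
class RunningMeter(object):
    def __init__(self, name, smooth=0.99):
        self.name = name
        self.smooth = smooth
        self.val = None

    def __call__(self, value):
        # blend the new observation into the running estimate
        if self.val is None:
            self.val = value
        else:
            self.val = self.smooth * self.val + (1 - self.smooth) * value
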