def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): # Initialise PyTorch model config = BertConfig.from_json_file(bert_config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = BertForPreTraining(config) # Load weights from tf checkpoint load_tf_weights_in_bert(model, config, tf_checkpoint_path) # Save pytorch-model print("Save PyTorch model to {}".format(pytorch_dump_path)) torch.save(model.state_dict(), pytorch_dump_path)
def __init__(self, args= None, device='cuda', bert_model_path='bert-base-uncased', batch_size=10, learning_rate = 5e-5, weight_decay=0, additional_features=None): if args is not None: self.args = vars(args) assert device in ['cuda', 'cpu'] if not args: self.args = {} self.args['bert_model_path'] = bert_model_path self.args['device'] = device self.args['learning_rate'] = learning_rate self.args['weight_decay'] = weight_decay self.args['batch_size'] = batch_size self.log = logging.getLogger() self.bert_tokenizer = BertTokenizer.from_pretrained(self.args['bert_model_path']) if os.path.exists(self.args['bert_model_path']): if os.path.exists(os.path.join(self.args['bert_model_path'], CONFIG_NAME)): config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], CONFIG_NAME)) elif os.path.exists(os.path.join(self.args['bert_model_path'], 'bert_config.json')): config = BertConfig.from_json_file(os.path.join(self.args['bert_model_path'], 'bert_config.json')) else: raise ValueError("Cannot find a configuration for the BERT model you are attempting to load.") self.loss_function = torch.nn.MSELoss() config.pretrained_config_archive_map['additional_features'] = additional_features self.regressor_net = BertSimilarityRegressor.from_pretrained(self.args['bert_model_path'], config=config) self.optimizer = torch.optim.Adam( self.regressor_net.parameters(), weight_decay=self.args['weight_decay'], lr=self.args['learning_rate'] ) self.log.info('Initialized BertSentencePairSimilarity model from %s' % self.args['bert_model_path'])
def bert_similarity(bert_model_path): bert_tokenizer = BertTokenizer.from_pretrained('D:\GitHub\Image_matching') config = BertConfig.from_json_file('D:\GitHub\Image_matching\config.json') # if os.path.exists(bert_model_path): # if os.path.exists(os.path.join(bert_model_path, CONFIG_NAME)): # config = BertConfig.from_json_file(os.path.join(bert_model_path, CONFIG_NAME)) # elif os.path.exists('D:\GitHub\Image_matching\config.json'): # config = BertConfig.from_json_file('config.json') # else: # raise ValueError("Cannot find a configuration for the BERT model you are attempting to load.") # print(bert_tokenizer) regressor_net = BertSimilarityRegressor.from_pretrained( 'D:\GitHub\Image_matching/', config=config) return bert_tokenizer, regressor_net
def __init__(self, embedding_matrix, opt): super(LCF_GLOVE, self).__init__() self.config = BertConfig.from_json_file("utils/bert_config.json") self.opt = opt self.embed = nn.Embedding.from_pretrained(torch.tensor(embedding_matrix, dtype=torch.float)) self.mha_global = SelfAttention(self.config, opt) self.mha_local = SelfAttention(self.config, opt) self.ffn_global = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout) self.ffn_local = PositionwiseFeedForward(self.opt.embed_dim, dropout=self.opt.dropout) self.mha_local_SA = SelfAttention(self.config, opt) self.mha_global_SA = SelfAttention(self.config, opt) self.pool = BertPooler(self.config) self.dropout = nn.Dropout(opt.dropout) self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim) self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)
def __init__(self, embedding_matrix, opt): super(LCA_GLOVE, self).__init__() # Only few of the parameters are necessary in the config.json, such as hidden_size, num_attention_heads self.config = BertConfig.from_json_file("utils/bert_config.json") self.opt = opt self.embed = nn.Embedding.from_pretrained( torch.tensor(embedding_matrix, dtype=torch.float)) self.lc_embed = nn.Embedding(2, opt.embed_dim) self.global_encoder1 = SelfAttention(self.config, opt) self.local_encoder1 = SelfAttention(self.config, opt) self.local_encoder2 = SelfAttention(self.config, opt) self.mha = SelfAttention(self.config, opt) self.pool = BertPooler(self.config) self.dropout = nn.Dropout(opt.dropout) self.linear = nn.Linear(opt.embed_dim * 2, opt.embed_dim) self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim) self.classifier = nn.Linear(opt.embed_dim, 2)
def __init__(self, model_path): super(OnmtBertEncoder, self).__init__() config = BertConfig.from_json_file( os.path.join(model_path, "config.json")) pretrained_dict = os.path.join(model_path, "pytorch_model.bin") if os.path.exists(pretrained_dict): model = BertModel.from_pretrained( pretrained_model_name_or_path=pretrained_dict, config=config) print("init BERT model with {} weights".format( len(model.state_dict()))) else: model = BertModel(config) model.embeddings.word_embeddings = expandEmbeddingByN( model.embeddings.word_embeddings, 4) model.embeddings.word_embeddings = expandEmbeddingByN( model.embeddings.word_embeddings, 2, last=True) self.encoder = model #print(model) print("***" * 20)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help="The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--train_iter_multiplier", default=1.0, type=float, help="multiplier for the multi-task training.", ) parser.add_argument( "--train_iter_gap", default=4, type=int, help="forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available" ) parser.add_argument( "--do_lower_case", default=True, type=bool, help="Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument( "--seed", type=int, default=0, help="random seed for initialization" ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument( "--save_name", default="", type=str, help="save name for training." ) parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument( "--optim", default="AdamW", type=str, help="what to use for the optimization." ) parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.", ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not.", ) parser.add_argument( "--evaluation_interval", default=1, type=int, help="evaluate very n epoch." ) parser.add_argument( "--lr_scheduler", default="mannul", type=str, help="whether use learning rate scheduler.", ) parser.add_argument( "--baseline", action="store_true", help="whether use single stream baseline." ) parser.add_argument( "--resume_file", default="", type=str, help="Resume from checkpoint" ) parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) args = parser.parse_args() with open("task_config.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from src.models.basebert import BaseBertForVLTasks else: from src.models.vilbert import BertConfig from src.models.vilbert import VILBertForVLTasks name = task_cfg["name"] task_lr = task_cfg["lr"] base_lr = task_lr loss_scale = task_lr / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ( args.config_file.split("/")[1].split(".")[0] + prefix ) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r") ) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" ) n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # load dataset task_batch_size, task_num_iters, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( args, task_cfg ) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_ave_iter = {} task_stop_controller = {} task_ave_iter = int( task_cfg["num_epoch"] * task_num_iters * args.train_iter_multiplier / args.num_train_epochs ) task_stop_controller = utils.TaskStopOnPlateau( mode="max", patience=1, continue_threshold=0.005, cooldown=1, threshold=0.001, ) median_num_iter = task_ave_iter num_train_optimization_steps = ( median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps ) num_labels = task_datasets_train.num_labels if args.dynamic_attention: config.dynamic_attention = True model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.0} ] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.01} ] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) # choose optimizer if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr) # choose scheduler warmpu_steps = args.warmup_proportion * num_train_optimization_steps if args.lr_scheduler == "warmup_linear": warmup_scheduler = WarmupLinearSchedule( optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps ) else: warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmpu_steps) lr_reduce_list = np.array([5, 7]) if args.lr_scheduler == "automatic": lr_scheduler = ReduceLROnPlateau( optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001 ) elif args.lr_scheduler == "cosine": lr_scheduler = CosineAnnealingLR( optimizer, T_max=median_num_iter * args.num_train_epochs ) elif args.lr_scheduler == "cosine_warm": lr_scheduler = CosineAnnealingWarmRestarts( optimizer, T_0=median_num_iter * args.num_train_epochs ) elif args.lr_scheduler == "mannul": def lr_lambda_fun(epoch): return pow(0.2, np.sum(lr_reduce_list <= epoch)) lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace("module.", "", 1)] = checkpoint[ "model_state_dict" ][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) print("`==============`MODEL=============") print(next(model.parameters()).is_cuda)#False for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) print(" Num steps: %d" % num_train_optimization_steps) task_iter_train = None task_count = 0 for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch", ncols=100): model.train() for step in range(median_num_iter): iterId = startIterID + step + (epochId * median_num_iter) first_task = True is_forward = False if (not task_stop_controller.in_stop) or ( iterId % args.train_iter_gap == 0 ): is_forward = True if is_forward: loss, score = ForwardModelsTrain( args, task_cfg, device, task_count, task_iter_train, task_dataloader_train, model, task_losses, ) loss = loss * loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion, ) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step if first_task and ( global_step < warmpu_steps or args.lr_scheduler == "warmup_linear" ): warmup_scheduler.step() optimizer.step() model.zero_grad() if first_task: global_step += 1 first_task = False if default_gpu: tbLogger.step_train( epochId, iterId, float(loss), float(score), optimizer.param_groups[0]["lr"], "train", ) if "cosine" in args.lr_scheduler and global_step > warmpu_steps: lr_scheduler.step() if ( step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu ): tbLogger.showLossTrain() # decided whether to evaluate on SNLI tasks. if (iterId != 0 and iterId % task_num_iters == 0) or ( epochId == args.num_train_epochs - 1 and step == median_num_iter - 1 ): evaluate( args, task_dataloader_val, task_stop_controller, task_cfg, device, model, task_losses, epochId, default_gpu, tbLogger, ) if args.lr_scheduler == "automatic": lr_scheduler.step(sum(val_scores.values())) logger.info("best average score is %3f" % lr_scheduler.best) elif args.lr_scheduler == "mannul": lr_scheduler.step() if epochId in lr_reduce_list: # reset the task_stop_controller once the lr drop task_stop_controller._reset() if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self output_model_file = os.path.join( savePath, "pytorch_model_" + str(epochId) + ".bin" ) output_checkpoint = os.path.join(savePath, "pytorch_ckpt_latest.tar") torch.save(model_to_save.state_dict(), output_model_file) torch.save( { "model_state_dict": model_to_save.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "warmup_scheduler_state_dict": warmup_scheduler.state_dict(), # 'lr_scheduler_state_dict': lr_scheduler.state_dict(), "global_step": global_step, "epoch_id": epochId, "task_stop_controller": task_stop_controller, "tb_logger": tbLogger, }, output_checkpoint, ) tbLogger.txt_close()
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=0, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument("--batch_size", default=1, type=int, help="what is the batch size?") parser.add_argument("--tasks", default="", type=str, help="1-2-3... training task separate by -") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=False, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # if args.task_specific_tokens: # config.task_specific_tokens = True # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) if args.task_specific_tokens: config.task_specific_tokens = True if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-")) tbLogger = utils.tbLogger( timeStamp, savePath, task_names, task_ids, task_num_iters, 1, save_logger=False, txt_name="eval.txt", ) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: config.task_specific_tokens = True # set visualization to true config.visualization = True # uncomment this to use sum fusion # config.fusion_method = "sum" if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) model.to(device) # if args.local_rank != -1: # try: # from apex.parallel import DistributedDataParallel as DDP # except ImportError: # raise ImportError( # "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." # ) # model = DDP(model, delay_allreduce=True) # # elif n_gpu > 1: # model = nn.DataParallel(model) print("***** Running evaluation *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) print(task_ids) model.eval() # when run evaluate, we run each task sequentially. for task_id in task_ids: results = [] others = [] for i, batch in enumerate(task_dataloader_val[task_id]): loss, score, batch_size, results, others = EvaluatingModel( args, task_cfg, device, task_id, batch, model, task_dataloader_val, task_losses, results, others, i) tbLogger.step_val(0, float(loss), float(score), task_id, batch_size, "val") sys.stdout.write("%d/%d\r" % (i, len(task_dataloader_val[task_id]))) sys.stdout.flush() # save the result or evaluate the result. ave_score = tbLogger.showLossVal(task_id) if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) print(json_path) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) if args.task_specific_tokens: config.task_specific_tokens = True if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl")
def main(): # os.environ['C UDA_VISIBLE_DEVICES'] = "0,1" batch_size = 64 parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--train_iter_multiplier", default=1.0, type=float, help="multiplier for the multi-task training.", ) parser.add_argument( "--train_iter_gap", default=4, type=int, help= "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--optim", default="AdamW", type=str, help="what to use for the optimization.") parser.add_argument("--tasks", default="0", type=str, help="discourse : TASK0") parser.add_argument( "--freeze", default=-1, type=int, help="till which layer of textual stream of vilbert need to fixed.", ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not.", ) parser.add_argument("--evaluation_interval", default=1, type=int, help="evaluate very n epoch.") parser.add_argument( "--lr_scheduler", default="mannul", type=str, help="whether use learning rate scheduler.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--resume_file", default="", type=str, help="Resume from checkpoint") parser.add_argument( "--dynamic_attention", action="store_true", help="whether use dynamic attention.", ) parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--visual_target", default=0, type=int, help="which target to use for visual branch. \ 0: soft label, \ 1: regress the feature, \ 2: NCE loss.", ) parser.add_argument( "--task_specific_tokens", action="store_true", default=False, help="whether to use task specific tokens for the multi-task learning.", ) # todo args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if args.baseline: from pytorch_transformers.modeling_bert import BertConfig from vilbert.basebert import BaseBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] task_lr = [] task_id = 1 for i, task_id in enumerate(args.tasks.split("-")): task_id = str(1) task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) task_lr.append(task_cfg[task]["lr"]) base_lr = min(task_lr) loss_scale = {} for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id loss_scale[task] = task_lr[i] / base_lr if args.save_name: prefix = "-" + args.save_name else: prefix = "" timeStamp = ("-".join("discourse") + "_" + args.config_file.split("/")[1].split(".")[0] + prefix) savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. with open(os.path.join(savePath, "command.txt"), "w") as f: print(args, file=f) # Python 3.x print("\n", file=f) print(config, file=f) # task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets( # args, task_cfg, args.tasks.split("-"),'train' # ) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) labels = [ "Visible", 'Subjective', 'Action', 'Story', 'Meta', 'Irrelevant', 'Other' ] train_dataset = DiscourseRelationDataset( labels, task_cfg[task]["dataroot"], tokenizer, args.bert_model, task_cfg[task]["max_seq_length"], encoding="utf-8", visual_target=0, batch_size=batch_size, shuffle=False, num_workers=4, cache=5000, drop_last=False, cuda=False, objective=0, visualization=False, ) train_sampler = RandomSampler(train_dataset) train_loader = DataLoader( train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=0, pin_memory=True, ) # for i in train_loader: # print("hello") # todo task_ids , task_num_tiers task_ids = ['TASK0'] task_num_iters = [100] task_batch_size = task_cfg['TASK0']["batch_size"] print("task_batch_size") print(task_batch_size) logdir = os.path.join(savePath, "logs") tbLogger = utils.tbLogger( logdir, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps, ) if args.visual_target == 0: config.v_target_size = 1601 config.visual_target = args.visual_target else: config.v_target_size = 2048 config.visual_target = args.visual_target if args.task_specific_tokens: print("*********** config.task_specific_tokens = True ************") config.task_specific_tokens = True if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_ave_iter = {} task_stop_controller = {} # for task_id, num_iter in task_num_iters.items(): # task_ave_iter[task_id] = int( # task_cfg[task]["num_epoch"] # * num_iter # * args.train_iter_multiplier # / args.num_train_epochs # ) # task_stop_controller[task_id] = utils.MultiTaskStopOnPlateau( # mode="max", # patience=1, # continue_threshold=0.005, # cooldown=1, # threshold=0.001, # ) # task_ave_iter_list = sorted(task_ave_iter.values()) # median_num_iter = task_ave_iter_list[-1] # num_train_optimization_steps = ( # median_num_iter * args.num_train_epochs // args.gradient_accumulation_steps # ) # num_labels = max([dataset.num_labels for dataset in task_datasets_train.values()]) # num_train_optimization_steps = int( # train_dataset.num_dataset # / task_batch_size # / args.gradient_accumulation_steps # ) * (args.num_train_epochs - args.start_epoch) # num_train_optimization_steps = int( # train_dataset.num_dataset # / task_batch_size # / args.gradient_accumulation_steps # ) * (args.num_train_epochs - args.start_epoch) num_train_optimization_steps = 10 num_labels = len(labels) if args.dynamic_attention: config.dynamic_attention = True if "roberta" in args.bert_model: config.model = "roberta" if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) model.double() model = model.to(device) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if "embeddings" in name: bert_weight_name_filtered.append(name) elif "encoder" in name: layer_num = name.split(".")[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if "vil_" in key: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = base_lr else: lr = 1e-4 else: lr = base_lr if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.0 }] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [{ "params": [value], "lr": lr, "weight_decay": 0.01 }] if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) if args.optim == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=base_lr, correct_bias=False, weight_decay=1e-4) elif args.optim == "RAdam": optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr, weight_decay=1e-4) # warmpu_steps = args.warmup_proportion * num_train_optimization_steps # if args.lr_scheduler == "warmup_linear": # warmup_scheduler = WarmupLinearSchedule( # optimizer, warmup_steps=warmpu_steps, t_total=num_train_optimization_steps # ) # else: # warmup_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmpu_steps) # # lr_reduce_list = np.array([5, 7]) # if args.lr_scheduler == "automatic": # lr_scheduler = ReduceLROnPlateau( # optimizer, mode="max", factor=0.2, patience=1, cooldown=1, threshold=0.001 # ) # elif args.lr_scheduler == "cosine": # lr_scheduler = CosineAnnealingLR( # optimizer, T_max=median_num_iter * args.num_train_epochs # ) # elif args.lr_scheduler == "cosine_warm": # lr_scheduler = CosineAnnealingWarmRestarts( # # optimizer, T_0=median_num_iter * args.num_train_epochs # ) # elif args.lr_scheduler == "mannul": # # def lr_lambda_fun(epoch): # return pow(0.2, np.sum(lr_reduce_list <= epoch)) # # lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) startIterID = 0 global_step = 0 start_epoch = 0 if args.resume_file != "" and os.path.exists(args.resume_file): checkpoint = torch.load(args.resume_file, map_location="cpu") new_dict = {} for attr in checkpoint["model_state_dict"]: if attr.startswith("module."): new_dict[attr.replace( "module.", "", 1)] = checkpoint["model_state_dict"][attr] else: new_dict[attr] = checkpoint["model_state_dict"][attr] model.load_state_dict(new_dict) # warmup_scheduler.load_state_dict(checkpoint["warmup_scheduler_state_dict"]) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict']) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) global_step = checkpoint["global_step"] start_epoch = int(checkpoint["epoch_id"]) + 1 task_stop_controller = checkpoint["task_stop_controller"] tbLogger = checkpoint["tb_logger"] del checkpoint model.to(device) for state in optimizer.state.values(): for k, v in state.items(): if torch.is_tensor(v): state[k] = v.cuda() if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", batch_size) print(" Num steps: %d" % num_train_optimization_steps) task_iter_train = {name: None for name in task_ids} task_count = {name: 0 for name in task_ids} # for epochId in tqdm(range(start_epoch, args.num_train_epochs), desc="Epoch"): # model.train() # torch.autograd.set_detect_anomaly(True) # # for step in range(median_num_iter): # for step in range(1) # # iterId = startIterID + step + (epochId * median_num_iter) # first_task = True # for task_id in task_ids: # is_forward = False # # if (not task_stop_controller[task_id].in_stop) or ( # # iterId % args.train_iter_gap == 0 # # ): # args['start_epoch'] = 0 # args.num_train_epochs criterion = nn.BCEWithLogitsLoss() target_path = os.path.join(task_cfg[task]["dataroot"], "all_targets_json.json") all_targets = json.load(open(target_path, "r")) model = model.to(device) print(next(model.parameters()).is_cuda) for epochId in range(int(start_epoch), int(args.num_train_epochs)): model.train() is_forward = True if is_forward: # print("beforeLoop") # loss, score = ForwardModelsTrain( # args, # task_cfg, # device, # task_id, # task_count, # task_iter_train, # train_dataset, # model, # task_losses, # ) for step, batch in enumerate(train_loader): batch = tuple( t.to(device=device, non_blocking=True) if type(t) == torch.Tensor else t for t in batch) input_ids, input_mask, segment_ids, image_feat, image_loc, image_mask, image_id = ( batch) true_targets = [] for id in image_id: true_targets.append( np.fromiter(all_targets[id].values(), dtype=np.double)) true_targets = torch.from_numpy(np.array(true_targets)) true_targets = true_targets.to(device) model.double() model = model.to(device) discourse_prediction, vil_prediction, vil_prediction_gqa, vil_logit, vil_binary_prediction, vil_tri_prediction, vision_prediction, vision_logit, linguisic_prediction, linguisic_logit, _ \ = model( True, input_ids, image_feat, image_loc, segment_ids, input_mask, image_mask ) loss = criterion(discourse_prediction, true_targets.type(torch.double)) loss.backward() optimizer.step() model.zero_grad() print("train train train done") # print("*********** ITERATION {} ***********".format(epochId)) print("*********** TRAIN PERFORMANCE ***********") print(loss) print( compute_score(discourse_prediction.to('cpu'), true_targets.type(torch.float).to('cpu'), 0.5)) print("*********** TEST PERFORMANCE ***********") evaluate(model, device, task_cfg, tokenizer, args, labels)
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument("--config_file", type=str, required=True, help="Bert config file.") parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model") parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--sparse_optim", action="store_true") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model if os.path.isfile(args.config_file): config = BertConfig.from_json_file(args.config_file) config.mem_sparse = args.sparse_optim model = BertForPreTraining(config) elif os.path.isdir(args.config_file): model = BertForPreTraining.from_pretrained(args.config_file) else: raise Exception( "config_file must be either a path to the config_file or a dir where the config file is and a model checkpoint" ) model.to(device) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] use_memory = hasattr(config, 'memory_layer_place') if args.sparse_optim and use_memory: memory_params = ['memory.values.weight'] else: memory_params = [] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and not any(nd in n for nd in memory_params) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) and not any(nd in n for nd in memory_params) ], 'weight_decay': 0.0 }] if use_memory: memory_optimizer_parameters = [{ 'params': [ p for n, p in param_optimizer if any(nd in n for nd in memory_params) ], 'weight_decay': 0.0 }] optimizers = [] schedulers = [] optimizers.append( AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)) schedulers.append( WarmupLinearSchedule(optimizers[0], warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)) if memory_params: optimizers.append( SparseAdam(memory_optimizer_parameters, lr=args.learning_rate, eps=args.adam_epsilon)) schedulers.append( WarmupLinearSchedule(optimizers[1], warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizers = amp.initialize(model, optimizers, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) logging.info(f" Model size is {num_parameters}") logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizers) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: for scheduler in schedulers: scheduler.step() # Update learning rate schedule for optimizer in optimizers: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1: logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default='save/multitask_model/multi_task_model.bin', type=str, help="path to the fine-tuned model", ) parser.add_argument( "--config_file", default="config/bert_base_6layer_6conect.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--output_dir", default="results", type=str, help= "The output directory where the model checkpoints will be written.", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--do_lower_case", default=True, type=bool, help= "Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader.", ) parser.add_argument("--save_name", default="", type=str, help="save name for training.") parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training.", ) parser.add_argument("--tasks", default="8", type=str, help="1-2-3... training task separate by -") parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training.", ) parser.add_argument("--baseline", action="store_true", help="whether use single stream baseline.") parser.add_argument("--zero_shot", action="store_true", help="whether use single stream baseline.") parser.add_argument("--split", default="", type=str, help="which split to use.") parser.add_argument("--batch_size", default=1, type=int, help="which split to use.") parser.add_argument( "--clean_train_sets", default=True, type=bool, help="whether clean train sets for multitask data.", ) parser.add_argument( "--task_specific_tokens", action="store_true", help="whether to use task specific tokens for the multi-task learning.", ) args = parser.parse_args() with open("vilbert_tasks.yml", "r") as f: task_cfg = edict(yaml.safe_load(f)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if args.baseline: from pytorch_transformers.modeling_bert import BertConfig else: from vilbert.vilbert import BertConfig task_names = [] for i, task_id in enumerate(args.tasks.split("-")): task = "TASK" + task_id name = task_cfg[task]["name"] task_names.append(name) # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] if "/" in args.from_pretrained: timeStamp = args.from_pretrained.split("/")[1] else: timeStamp = args.from_pretrained savePath = os.path.join(args.output_dir, timeStamp) config = BertConfig.from_json_file(args.config_file) bert_weight_name = json.load( open("config/" + args.bert_model + "_weight_name.json", "r")) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu and not os.path.exists(savePath): os.makedirs(savePath) task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val = LoadDatasetEval( args, task_cfg, args.tasks.split("-")) num_labels = max( [dataset.num_labels for dataset in task_datasets_val.values()]) if args.task_specific_tokens: config.task_specific_tokens = True config.fast_mode = True if args.zero_shot: model = BertForMultiModalPreTraining.from_pretrained( args.from_pretrained, config) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config=config, num_labels=num_labels, default_gpu=default_gpu, ) task_losses = LoadLosses(args, task_cfg, args.tasks.split("-")) torch.cuda.empty_cache() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model, deay_allreduce=True) elif n_gpu > 1: model = nn.DataParallel(model) no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) model.eval() # when run evaluate, we run each task sequentially. for task_id in task_ids: results = [] others = [] noimg = len(task_datasets_val[task_id]._image_entries) index = int(noimg / 2) nocap = len(task_datasets_val[task_id]._caption_entries) score_matrix = np.zeros((nocap, noimg)) # 5000,1000 target_matrix = np.zeros((nocap, noimg)) rank_matrix = np.ones((nocap)) * noimg count = 0 for i, batch in enumerate(task_dataloader_val[task_id]): batch = tuple( t.cuda(device=device, non_blocking=True) for t in batch) features, spatials, image_mask, question, input_mask, segment_ids, target, caption_idx, image_idx = ( batch) task_tokens = (question.new().resize_(question.size(0), 1).fill_(int(task_id[4:]))) batch_size = features.size(0) features = features.squeeze(0) spatials = spatials.squeeze(0) image_mask = image_mask.squeeze(0) with torch.no_grad(): if args.zero_shot: vil_logit = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) score_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = (torch.softmax( vil_logit, dim=1)[:, 0].view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * 500:(image_idx + 1) * 500] = ( target.view(-1).float().cpu().numpy()) else: vil_logit = model( question, features, spatials, segment_ids, input_mask, image_mask, task_ids=task_tokens, ) if image_idx == 0: score_matrix[caption_idx, image_idx * index:(image_idx + 1) * index] = ( vil_logit.view(-1).cpu().numpy()) target_matrix[ caption_idx, image_idx * index:(image_idx + 1) * index] = (target.view(-1).float().cpu().numpy()) else: score_matrix[caption_idx, image_idx * index:] = ( vil_logit.view(-1).cpu().numpy()) target_matrix[caption_idx, image_idx * index:] = ( target.view(-1).float().cpu().numpy()) if image_idx.item() == 1: _, preds = torch.max( torch.from_numpy( score_matrix[caption_idx]).unsqueeze(0), 1) rank = np.where( (np.argsort(-score_matrix[caption_idx]) == np.where( target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0] rank_matrix[caption_idx] = rank rank_matrix_tmp = rank_matrix[:caption_idx + 1] r1 = 100.0 * np.sum( rank_matrix_tmp < 1) / len(rank_matrix_tmp) r5 = 100.0 * np.sum( rank_matrix_tmp < 5) / len(rank_matrix_tmp) r10 = 100.0 * np.sum( rank_matrix_tmp < 10) / len(rank_matrix_tmp) medr = np.floor(np.median(rank_matrix_tmp) + 1) meanr = np.mean(rank_matrix_tmp) + 1 print( "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (count, r1, r5, r10, medr, meanr)) results.append( np.argsort(-score_matrix[caption_idx]).tolist()[:20]) count += 1 r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix) r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix) r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix) medr = np.floor(np.median(rank_matrix) + 1) meanr = np.mean(rank_matrix) + 1 print("************************************************") print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" % (r1, r5, r10, medr, meanr)) print("************************************************") if args.split: json_path = os.path.join(savePath, args.split) else: json_path = os.path.join(savePath, task_cfg[task_id]["val_split"]) json.dump(results, open(json_path + "_result.json", "w")) json.dump(others, open(json_path + "_others.json", "w"))
def __init__(self, args=None, labels=None, device='cuda', bert_model_path='bert-base-uncased', architecture="DocumentBertLSTM", batch_size=10, bert_batch_size=7, learning_rate=5e-5, weight_decay=0, use_tensorboard=False): if args is not None: self.args = vars(args) if not args: self.args = {} self.args['bert_model_path'] = bert_model_path self.args['device'] = device self.args['learning_rate'] = learning_rate self.args['weight_decay'] = weight_decay self.args['batch_size'] = batch_size self.args['labels'] = labels self.args['bert_batch_size'] = bert_batch_size self.args['architecture'] = architecture self.args['use_tensorboard'] = use_tensorboard if 'fold' not in self.args: self.args['fold'] = 0 assert self.args[ 'labels'] is not None, "Must specify all labels in prediction" self.log = logging.getLogger() self.bert_tokenizer = BertTokenizer.from_pretrained( self.args['bert_model_path']) #account for some random tensorflow naming scheme if os.path.exists(self.args['bert_model_path']): if os.path.exists( os.path.join(self.args['bert_model_path'], CONFIG_NAME)): config = BertConfig.from_json_file( os.path.join(self.args['bert_model_path'], CONFIG_NAME)) elif os.path.exists( os.path.join(self.args['bert_model_path'], 'bert_config.json')): config = BertConfig.from_json_file( os.path.join(self.args['bert_model_path'], 'bert_config.json')) else: raise ValueError( "Cannot find a configuration for the BERT based model you are attempting to load." ) else: config = BertConfig.from_pretrained(self.args['bert_model_path']) config.__setattr__('num_labels', len(self.args['labels'])) config.__setattr__('bert_batch_size', self.args['bert_batch_size']) if 'use_tensorboard' in self.args and self.args['use_tensorboard']: assert 'model_directory' in self.args is not None, "Must have a logging and checkpoint directory set." from torch.utils.tensorboard import SummaryWriter self.tensorboard_writer = SummaryWriter( os.path.join( self.args['model_directory'], "..", "runs", self.args['model_directory'].split(os.path.sep)[-1] + '_' + self.args['architecture'] + '_' + str(self.args['fold']))) self.bert_doc_classification = document_bert_architectures[ self.args['architecture']].from_pretrained( self.args['bert_model_path'], config=config) self.bert_doc_classification.freeze_bert_encoder() self.bert_doc_classification.unfreeze_bert_encoder_last_layers() self.optimizer = torch.optim.Adam( self.bert_doc_classification.parameters(), weight_decay=self.args['weight_decay'], lr=self.args['learning_rate'])