def init_pytorch_model(model_name, tf_checkpoint_path):
    # Resolve the transformers config class named in the TFMODELS registry.
    config_name = TFMODELS[model_name][1]
    config_module = __import__("transformers", fromlist=[config_name])
    model_config = getattr(config_module, config_name)

    # Look for a *config.json next to the TF checkpoint; fall back to defaults if none is found.
    parent_path = tf_checkpoint_path.rpartition('/')[0]
    config_path = glob.glob(parent_path + "/*config.json")
    config = model_config() if len(config_path) == 0 else model_config.from_json_file(str(config_path[0]))

    if TFMODELS[model_name][2] == "":
        # No explicit model class registered: let transformers pick the pre-training head.
        from transformers import AutoModelForPreTraining
        init_model = AutoModelForPreTraining.from_config(config)
    else:
        # Instantiate the registered model class by name.
        model_category_name = TFMODELS[model_name][2]
        module = __import__("transformers", fromlist=[model_category_name])
        model_category = getattr(module, model_category_name)
        init_model = model_category(config)
    return config, init_model
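# Usage sketch (hypothetical, for illustration only): TFMODELS is assumed to map a model
# name to a tuple whose second entry is a transformers config class name and whose third
# entry is a transformers model class name (or "" to fall back to AutoModelForPreTraining).
# The model name and checkpoint path below are placeholders, not values from this repo.
#
#   config, init_model = init_pytorch_model("bert", "/path/to/tf_ckpt/model.ckpt")
#   print(type(config), type(init_model))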
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None

    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
    config.dense_seq_output = args.dense_seq_output

    if args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForPreTraining.from_config(config)

    ## Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    #if args.init_checkpoint is not None or found_resume_checkpoint(args):
    #    # Prepare model
    #    #model = BertForPreTraining(config)
    #    model = BertForPreTrainingSegmented(config)
    #    # for k,v in model.state_dict().items():
    #    #     print(f'model-k,len(v)={k}, {v.numel()}')
    #    #model = BertForPretraining(config)
    #    if args.init_checkpoint is None:  # finding checkpoint in output_dir
    #        assert False, "code path not tested with cuda graphs"
    #        checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
    #        model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
    #        global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
    #        args.resume_step = global_step  # used for throughput computation
    #        resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
    #        print("Setting init checkpoint to %s - which is the latest in %s" % (resume_init_checkpoint, args.output_dir))
    #        checkpoint = torch.load(resume_init_checkpoint, map_location="cpu")
    #    else:
    #        checkpoint = torch.load(args.init_checkpoint, map_location="cpu")["model"]

    # Exclude bias and LayerNorm parameters from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    optimizer = Lamb(optimizer_grouped_parameters,
                     lr=args.learning_rate,
                     betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2),
                     fused=True)

    # Derive warmup from the proportion when no explicit step count is given.
    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step

    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer,
                                                  start_warmup_steps=warmup_start,
                                                  warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps,
                                                  end_learning_rate=0.0,
                                                  degree=1.0)

    #if found_resume_checkpoint(args):
    #    assert False, "code path not tested with cuda graphs"
    #    optimizer.load_state_dict(checkpoint['optimizer'])  # restores m,v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)

    return model, optimizer, lr_scheduler, checkpoint, global_step
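# Usage sketch (hypothetical): `args` is assumed to be an argparse.Namespace providing the
# fields read above (config_name, model_name_or_path, model_type, dense_seq_output,
# weight_decay_rate, learning_rate, opt_lamb_beta_1, opt_lamb_beta_2, warmup_steps,
# warmup_proportion, start_warmup_step, max_steps, ...). A typical call site might look like:
#
#   model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device)
#   model.to(device)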