# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)  # fix: `logger` was used below without being defined
logger.setLevel(logging.INFO)

if model_type == "gpt2":
    # The Hugging Face servers had temporary problems serving the tokenizer, so a copy was
    # saved for offline use - swapping "gpt2" for "gpt2_offline" loads that saved copy,
    # which should be identical to the regular one.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model.resize_token_embeddings(len(tokenizer))
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    if train:
        train_dataset = datasets.Dataset.load_from_disk(os.path.join(data_dir, "lm_train"))
elif model_type == "bert":
    dataset_properties = json.load(open(os.path.join(data_dir, "dataset_properties.json")))
    special_tokens = dataset_properties["special_tokens"]
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    config = BertConfig()
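# A minimal sketch (not in the original) of how the GPT-2 branch above fits
# together under recent transformers versions: tokenize a couple of placeholder
# strings, then let the collator assemble a padded causal-LM batch.
example_texts = ["hello world", "a second, slightly longer example"]
features = [{"input_ids": ids} for ids in tokenizer(example_texts, truncation=True)["input_ids"]]
batch = data_collator(features)
# With mlm=False, the collator copies input_ids into labels and masks the
# padding positions with -100 so they are ignored by the cross-entropy loss.
outputs = model(**batch)
print(batch["input_ids"].shape, outputs.loss.item())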
def __init__(
    self,
    model: str = None,
    config: Union[str, GPT2Config] = None,
    vocab_file: str = None,
    merges_file: str = None,
    cache_dir: str = "aitextgen",
    tf_gpt2: str = None,
    to_gpu: bool = False,
    to_fp16: bool = False,
    verbose: bool = False,
    torchscript: bool = False,
    ts_to_trace: bool = False,
    bos_token: str = None,
    eos_token: str = None,
    unk_token: str = None,
    **kwargs,
) -> None:

    if not verbose:
        for module in [
            "transformers.file_utils",
            "transformers.configuration_utils",
            "transformers.tokenization_utils",
            "filelock",
            "transformers.modeling_gpt2",
        ]:
            logging.getLogger(module).setLevel(logging.WARN)
        logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    if torchscript:
        assert model
        logger.info(f"Loading traced GPT-2 model from provided {model}.")
        if config is None:
            config = GPT2Config()
        self.torchscript = True
        self.model = GPT2LMHeadModel(config)

        # Attach the traced model's modules to a GPT2LMHeadModel instance
        # so it inherits that class's methods.
        pt_model = torch.jit.load(model)
        self.model.transformer = pt_model.transformer
        self.model.lm_head = pt_model.lm_head

    elif tf_gpt2:
        # Download + convert the TF weights if a PyTorch model has not been created
        if not os.path.isfile(os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
            assert tf_gpt2 in [
                "124M",
                "355M",
                "774M",
                "1558M",
            ], "Invalid TensorFlow GPT-2 model size."

            logger.info(
                f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                + "from Google's servers"
            )
            download_gpt2(cache_dir, tf_gpt2)

            logger.info(f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch.")
            config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

            convert_gpt2_checkpoint_to_pytorch(
                os.path.join(cache_dir, tf_gpt2),
                config_path,
                cache_dir,
            )

            os.rename(
                os.path.join(cache_dir, "pytorch_model.bin"),
                os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
            )
            os.rename(
                os.path.join(cache_dir, "config.json"),
                os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
            )

        logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
        model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
        config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")
        self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

    elif model and os.path.exists(model):
        # A pytorch_model.bin (+ optional config/config.json) is provided
        logger.info(f"Loading GPT-2 model from provided {model}.")
        if config is None:
            config = GPT2Config()
        if ts_to_trace:
            config.torchscript = True
        self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

    elif config:
        if ts_to_trace:
            config.torchscript = True
        # Manually construct a GPT-2 model from scratch
        logger.info("Constructing GPT-2 model from provided config.")
        self.model = AutoModelWithLMHead.from_config(config=config)

    else:
        # Download and cache model from Hugging Face
        if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
            logger.info(f"Loading {model or 'gpt2'} model from /{cache_dir}.")
        else:
            logger.info(f"Downloading {model or 'gpt2'} model to /{cache_dir}.")

        self.model = GPT2LMHeadModel.from_pretrained(
            model or "gpt2", cache_dir=cache_dir, torchscript=ts_to_trace
        )

        if model and "gpt2" not in model:
            logger.info(f"Using the tokenizer for {model}.")
            self.tokenizer = GPT2Tokenizer.from_pretrained(
                model,
                cache_dir=cache_dir,
            )

    # NOTE: this check assumes the class defines `tokenizer = None` as a
    # default attribute for the branches that do not set it.
    if self.tokenizer is None:
        # Update tokenizer settings (if not set already)
        args = locals()
        custom_tokenizer = False
        for attr in [
            "vocab_file",
            "merges_file",
            "bos_token",
            "eos_token",
            "unk_token",
        ]:
            if args[attr] is not None:
                custom_tokenizer = True
                setattr(self, attr, args[attr])

        if custom_tokenizer:
            logger.info("Using a custom tokenizer.")
        else:
            logger.info("Using the default GPT-2 Tokenizer.")

        self.tokenizer = GPT2Tokenizer(
            vocab_file=self.vocab_file,
            merges_file=self.merges_file,
            bos_token=self.bos_token,
            eos_token=self.eos_token,
            unk_token=self.unk_token,
            pad_token=self.pad_token,
        )

    if to_gpu:
        if to_fp16:
            self.to_fp16()
        self.to_gpu()
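# Hedged usage sketch (not part of the original): given the signature above,
# evidently aitextgen's constructor judging by the cache_dir default, the
# three main load paths would be exercised like this:
#   ai = aitextgen(tf_gpt2="124M")                           # download + convert OpenAI TF weights
#   ai = aitextgen(model="trained_model/pytorch_model.bin")  # local PyTorch weights
#   ai = aitextgen()                                         # default: cache "gpt2" from Hugging Face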
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    mc_token_ids = None
    if self.use_mc_token_ids:
        mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = GPT2Config(
        vocab_size=self.vocab_size,
        n_embd=self.hidden_size,
        n_layer=self.num_hidden_layers,
        n_head=self.num_attention_heads,
        # intermediate_size=self.intermediate_size,
        # hidden_act=self.hidden_act,
        # hidden_dropout_prob=self.hidden_dropout_prob,
        # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        n_positions=self.max_position_embeddings,
        # type_vocab_size=self.type_vocab_size,
        # initializer_range=self.initializer_range
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        return_dict=True,
    )

    head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

    return (
        config,
        input_ids,
        input_mask,
        head_mask,
        token_type_ids,
        mc_token_ids,
        sequence_labels,
        token_labels,
        choice_labels,
    )
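# Sketch of how a test typically consumes this helper (the unpacking order
# mirrors the return tuple above; the model class is an illustrative choice):
#   config, input_ids, input_mask, head_mask, *_ = self.prepare_config_and_inputs()
#   model = GPT2LMHeadModel(config)
#   model.eval()
#   result = model(input_ids, attention_mask=input_mask, head_mask=head_mask)
#   assert result.logits.shape == (self.batch_size, self.seq_length, self.vocab_size)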
# NOTE: the opening of this constructor call was truncated in the snippet;
# the class name below is an assumption based on the variable it is bound to.
vq_vae = VQVAE(
    small_conv=vq_vae_small_conv,
    embedding_dim=vq_vae_embedding_dim,
    num_embeddings=vq_vae_num_embeddings,
    commitment_cost=vq_vae_commitment_cost,
    use_max_filters=vq_vae_use_max_filters,
    max_filters=vq_vae_max_filters,
)
vq_vae.load_state_dict(torch.load(vq_vae_model_path, map_location=device))
vq_vae.eval()
vq_vae.to(device)

# Create Model
configuration = GPT2Config(
    vocab_size=vocab_size,
    n_positions=max_seq_length,
    n_embd=embedding_size,
    n_layer=num_hidden_layers,
    n_head=num_attention_heads,
    resid_pdrop=resid_pdrop,
)
model = GPT2LMHeadModel(configuration)
model.eval()
model.load_state_dict(checkpoint["model_state_dict"])
model.to(device)
print(model)

with torch.no_grad():
    # Get the most common pixel values to feed into the generation script
    _, _, _, encodings = vq_vae(sample.to(device))
    encodings, counts = encodings.unique(return_counts=True)
    bg1, bg2 = encodings[counts.topk(k=2, largest=True).indices].cpu().numpy()
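# Hedged sketch (not in the original) of the step the comment above implies:
# seed GPT-2 with the two most common (background) codebook indices and sample
# the remaining latent tokens autoregressively via the standard generate() API.
# The prompt construction is an assumption.
prompt = torch.tensor([[bg1, bg2]], dtype=torch.long, device=device)
with torch.no_grad():
    latents = model.generate(prompt, max_length=max_seq_length, do_sample=True, top_k=50)
print(latents.shape)  # (1, max_seq_length): latent codes for the VQ-VAE decoder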
def run_model():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str, help='pretrained model path to local checkpoint')
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--nsamples", type=int, default=1)
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=float, default=0.95)  # fix: was type=int, which breaks non-integer CLI values
    parser.add_argument('--top_p', type=float, default=0.95)
    parser.add_argument('--top_k', type=int, default=100)

    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--out-dir', type=str, default='out')

    parser.add_argument('--data_type', type=str, default='t1',
                        choices=['t' + str(i) for i in range(9)], help="t: type")
    parser.add_argument('--model_type', type=str, default='cvae',
                        choices=['cvae', 'ae_vae_fusion'])
    parser.add_argument('--dataset', type=str, default='wi', choices=['wp', 'wi'],
                        help="Dataset to use for training")

    # use GPU
    parser.add_argument('--gpu', default=2, type=int)
    parser.add_argument('--no_gpu', action="store_true")

    parser.add_argument('--add_input', action="store_true")
    parser.add_argument('--add_attn', action="store_true")
    parser.add_argument('--add_softmax', action="store_true")
    parser.add_argument('--attn_proj_vary', action="store_true")
    parser.add_argument('--learn_prior', action="store_true")

    args = parser.parse_args('--model-path out/wi.1.proj_vary_cyc_cvae/model_0030000.pt '
                             '--add_input --learn_prior '.split())
    print(args)

    if args.model_type == 'cvae':
        args.learn_prior = True
    else:
        args.learn_prior = False

    # GPU
    if not torch.cuda.is_available():
        args.no_gpu = True
    gpu = not args.no_gpu
    if gpu:
        torch.cuda.set_device(args.gpu)
    device = torch.device(args.gpu if gpu else "cpu")

    # randomness
    np.random.seed(args.seed)
    prng = np.random.RandomState()
    torch.random.manual_seed(args.seed)
    if gpu:
        torch.cuda.manual_seed(args.seed)

    if args.batch_size == -1:
        args.batch_size = 1
    assert args.nsamples % args.batch_size == 0

    # logging
    save_folder = args.model_path + '.eval/'
    os.makedirs(save_folder, exist_ok=True)
    importlib.reload(logging)
    logging.basicConfig(filename=os.path.join(save_folder, 'eval.log'),
                        level=logging.INFO, format='%(asctime)s--- %(message)s')
    logging.info('\n----------------------------------------------------------------------')

    print('Loading models...')
    cache_dir = os.path.join(args.out_dir, 'model_cache')
    os.makedirs(cache_dir, exist_ok=True)

    # Load pre-trained teacher tokenizer (vocabulary)
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=cache_dir)
    tokenizer.max_len = int(1e12)
    gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=cache_dir)
    print('gpt2_params:', num_params(gpt2_model))  # gpt2: 124439808
    config = GPT2Config()

    # # add special tokens
    # special_tokens_dict = {
    #     'pad_token': '<|startoftext|>',
    #     'cls_token': '<|startofcond|>',
    #     'sep_token': '<|sepofcond|>',
    #     'mask_token': '<|endofcond|>'
    # }
    # num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    # print('We have added', num_added_toks, 'special tokens')
    # # Notice: resize_token_embeddings expects to receive the full size of the new vocab
    # gpt2_model.resize_token_embeddings(len(tokenizer))
    # assert tokenizer.pad_token == '<|startoftext|>'

    VAE = VAEModel(config,
                   add_input=args.add_input,
                   add_attn=args.add_attn,
                   add_softmax=args.add_softmax,
                   attn_proj_vary=args.attn_proj_vary,
                   learn_prior=args.learn_prior)
    init_para_frompretrained(VAE.transformer, gpt2_model.transformer, share_para=True)
    init_para_frompretrained(VAE.encoder, gpt2_model.transformer, share_para=False)
    if args.learn_prior:
        init_para_frompretrained(VAE.encoder_prior, VAE.encoder, share_para=True)
        VAE.encoder_prior.averageSelfAttention.attention_weights = VAE.encoder.averageSelfAttention.attention_weights
    VAE.lm_head.weight = gpt2_model.lm_head.weight
    if VAE.add_softmax:
        VAE.lm_head_rep = Conv1D(*gpt2_model.lm_head.weight.size())
        # VAE.lm_head_rep = LM_head_rep(*gpt2_model.lm_head.weight.size()[::-1])
    print('VAE_params:', num_params(VAE))  # 286694400

    args.load = args.model_path
    if args.load:
        print('Loading model weights...')
        state = torch.load(os.path.join(args.load), map_location='cpu')
        if 'module' in list(state.keys())[0]:
            # model_path points to a data-parallel checkpoint whose keys carry a 'module.' prefix
            state_copy = copy.copy(state)
            keys = state_copy.keys()
            for k in keys:
                state[k.replace('module.', '')] = state.pop(k)
        VAE.load_state_dict(state)
        gc.collect()
        print('Model loaded.')

    print('Setup data...')
    seq_len = VAE.config.n_ctx
    test_loader = prepare_dataset(
        args.data_dir, args.dataset, tokenizer,
        1, seq_len, 1, seq_len, args.batch_size, seq_len,
        make_train=False, make_val=False, make_test=True, data_type=args.data_type
    )[0]
    print('Done.')

    VAE.eval()  # be careful about VAE.eval() vs VAE.train()
    VAE.to(device)
    loss_fn = nn.CrossEntropyLoss(reduction='none')

    logging.info('\n----------------------------------------------------------------------')
    logging.info("Testing loop. batches: %d" % len(test_loader))

    endoftext = tokenizer.convert_tokens_to_ids("<|endoftext|>")
    startofcond = tokenizer.convert_tokens_to_ids("<|startofcond|>")
    endofcond = tokenizer.convert_tokens_to_ids("<|endofcond|>")

    n_samples = 0
    bleu4_sum = 0.0
    rouge_scores_values_sum = [0.0] * 9

    model_type = args.model_type

    # test_iter = iter(test_loader); x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask = next(test_iter)
    with tqdm(total=len(test_loader)) as pbar:
        for i_test, (x_mask, x_tokens, y_mask, y_tokens, input_tokens, target_tokens, mask) in enumerate(test_loader):

            length = args.length
            if length == -1:
                length = VAE.config.n_ctx - x_tokens.size(1) - 1
            elif length > VAE.config.n_ctx - x_tokens.size(1) - 1:
                raise ValueError("Can't get samples longer than window size: %s" % VAE.config.n_ctx)

            eff_samples = []
            n, l = target_tokens.size()
            storys = [tokenizer.decode(target_tokens[i, :]) for i in range(n)]
            storys = [s[s.find("<|endoftext|>") + len("<|endoftext|>"):] for s in storys]
            storys_str = [s[:s.find("<|endoftext|>") + len("<|endoftext|>")] if "<|endoftext|>" in s else s
                          for s in storys]

            for _ in range(args.nsamples // args.batch_size):
                # model, batch_size, temperature, top_k, top_p, eos_token, sample = VAE, args.batch_size, args.temperature, args.top_k, args.top_p, tokenizer.encoder['<|endoftext|>'], True
                out, _ = sample_sequence(
                    model=VAE,
                    tokenizer=tokenizer,
                    length=length,
                    batch_size=args.batch_size,
                    x_mask=x_mask,
                    x_tokens=x_tokens,
                    y_mask=y_mask,
                    y_tokens=y_tokens,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    device=device,
                    eos_token=tokenizer.encoder['<|endoftext|>'],
                    model_type=model_type,
                )
                out = out.tolist()

                # extract story, check metrics
                for i in range(len(out)):
                    text = out[i]
                    text = text[text.index(endoftext) + 1:]

                    if endoftext in text:
                        idx = text.index(endoftext)
                        text = text[:idx]

                    text = tokenizer.decode(text).strip()

                    # score for one long text, higher than 0.075 usually means repetition
                    # rep_score = repeat_score(text.split(), ngram=[3, 4, 5, 6, 7, 8])
                    # if rep_score > 0.075:
                    #     # print(rep_score)
                    #     continue

                    try:
                        # check BLEU
                        bleu4 = sentence_bleu(
                            [storys_str[i].split()],
                            text.split(),  # fix: the hypothesis must be a token list, not a raw string
                            smoothing_function=SmoothingFunction().method7,
                        )

                        # check ROUGE
                        rouge = Rouge()
                        rouge_scores = rouge.get_scores(text, storys_str[i])
                        rouge_scores_values = [v for k in rouge_scores[0].keys()
                                               for v in rouge_scores[0][k].values()]

                        bleu4_sum += bleu4
                        rouge_scores_values_sum = [v1 + v2 for v1, v2 in
                                                   zip(rouge_scores_values_sum, rouge_scores_values)]
                        n_samples += 1
                    except Exception:
                        bleu4 = 0.0
                        rouge_scores = [{'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                                         'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}}]

                    eff_samples.append((text, bleu4, rouge_scores))

            # write samples to file
            samples_file = open(save_folder + 'batch-' + '%04d' % i_test + '.txt', 'w', encoding='utf8')
            for i in range(len(eff_samples)):
                samples_file.write("=" * 50 + " SAMPLE " + str(i) + " " + "=" * 50)
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Outlines " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(tokenizer.decode(x_tokens[i, :][x_mask[i, :] == 1].tolist()))
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Story " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(storys_str[i])
                samples_file.write('\n' * 2)
                samples_file.write("=" * 40 + " Generated " + "=" * 40)
                samples_file.write('\n' * 2)
                samples_file.write(eff_samples[i][0])
                samples_file.write('\n' * 4)
            samples_file.flush()

            logging.info('batch %04d finished.', i_test)
            pbar.update(1)

    print('Test complete with %05d samples.' % n_samples)
    logging.info("Test complete with %05d samples.", n_samples)

    bleu4 = round(bleu4_sum / n_samples, 3)
    rouge_scores_values = [round(r / n_samples, 3) for r in rouge_scores_values_sum]
    print(' bleu-4:', bleu4)
    print(' rouge :', rouge_scores_values)
    logging.info(' bleu-4: %f', bleu4)
    logging.info(' rouge : %s', str(rouge_scores_values))
from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel, WEIGHTS_NAME, CONFIG_NAME

# parameters
model_size = "medium"

# The fine-tuned DialoGPT models published by Microsoft on Azure blob storage do not ship with
# a model config. A config with vocab and merges is available in the /configs folder of
# https://github.com/microsoft/DialoGPT.git; download that folder and the following works:
# gpt2_config = {'small': GPT2Config.from_json_file('DialoGPT/configs/117M/config.json'),
#                'medium': GPT2Config.from_json_file('DialoGPT/configs/345M/config.json'),
#                'large': GPT2Config.from_json_file('DialoGPT/configs/762M/config.json')}

# Alternatively, the model config can be set manually. These are the default configs for the
# GPT-2 small, medium and large models.
gpt2_config = {
    'small': GPT2Config(),
    'medium': GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16),
    'large': GPT2Config(n_ctx=1024, n_embd=1280, n_layer=36, n_head=20),
}

# Load the GPT-2 tokenizer. All three GPT-2 models (small, medium, large) use the same vocabulary.
# A tokenizer is constructed from two files, vocab.json and merges.txt, both of which are available
# in the configs/117M/, configs/345M/ and configs/762M/ folders:
# tokenizer = GPT2Tokenizer.from_pretrained('DialoGPT/configs/345M')

# Alternatively, the following line automatically downloads vocab.json and merges.txt and
# creates a tokenizer from them.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# create a randomly initialized transformer with the chosen config
# (the fine-tuned model weights still have to be loaded into it)
model = GPT2LMHeadModel(gpt2_config[model_size])
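# Mirroring the other snippets in this collection: the fine-tuned DialoGPT
# checkpoint lacks the tied lm_head weight, hence strict=False, and the head
# is re-tied to the input embeddings afterwards. The .pkl path is illustrative.
model.load_state_dict(torch.load("medium_ft.pkl"), strict=False)
model.lm_head.weight.data = model.transformer.wte.weight.data
model.eval()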
def __init__(
    self,
    model: str = None,
    config: Union[str, GPT2Config] = None,
    vocab_file: str = None,
    merges_file: str = None,
    tokenizer_file: str = None,
    schema_tokens: List[str] = None,
    schema_return: List[str] = None,
    cache_dir: str = "aitextgen",
    tf_gpt2: str = None,
    to_gpu: bool = False,
    to_fp16: bool = False,
    verbose: bool = False,
    gradient_checkpointing: bool = False,
    bos_token: str = None,
    eos_token: str = None,
    unk_token: str = None,
    **kwargs,
) -> None:

    if not verbose:
        for module in [
            "transformers.file_utils",
            "transformers.configuration_utils",
            "transformers.tokenization_utils",
            "filelock",
            "transformers.modeling_gpt2",
        ]:
            logging.getLogger(module).setLevel(logging.WARN)
        logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    if tf_gpt2:
        self.openai_tf_gpt2 = tf_gpt2

        # Download + convert the TF weights if a PyTorch model has not been created
        if not os.path.isfile(os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
            assert tf_gpt2 in [
                "124M",
                "355M",
                "774M",
                "1558M",
            ], "Invalid TensorFlow GPT-2 model size."

            logger.info(
                f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                + "from Google's servers"
            )
            download_gpt2(cache_dir, tf_gpt2)

            logger.info(f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch.")
            config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

            convert_gpt2_checkpoint_to_pytorch(
                os.path.join(cache_dir, tf_gpt2),
                config_path,
                cache_dir,
            )

            os.rename(
                os.path.join(cache_dir, "pytorch_model.bin"),
                os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
            )
            os.rename(
                os.path.join(cache_dir, "config.json"),
                os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
            )

        logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
        model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
        config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")
        self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

    elif model and os.path.exists(model):
        # A pytorch_model.bin (+ optional config/config.json) is provided
        logger.info(f"Loading GPT-2 model from provided {model}.")
        if config is None:
            config = GPT2Config()
        self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

    elif config:
        # Manually construct a GPT-2 model from scratch
        logger.info("Constructing GPT-2 model from provided config.")
        if isinstance(config, str):
            config = AutoConfig.from_pretrained(config)
        self.model = GPT2LMHeadModel(config=config)

    else:
        # Download and cache model from Hugging Face
        if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
            logger.info(f"Loading {model or 'gpt2'} model from /{cache_dir}.")
        else:
            logger.info(f"Downloading {model or 'gpt2'} model to /{cache_dir}.")

        self.model = GPT2LMHeadModel.from_pretrained(model or "gpt2", cache_dir=cache_dir)

        if model and "gpt2" not in model:
            logger.info(f"Using the tokenizer for {model}.")
            self.tokenizer = GPT2TokenizerFast.from_pretrained(
                model,
                cache_dir=cache_dir,
            )

    if gradient_checkpointing or tf_gpt2 in ["355M", "774M", "1558M"]:
        logger.info("Gradient checkpointing enabled for model training.")
        setattr(self.model.config, "gradient_checkpointing", True)
        setattr(self.model.config, "use_cache", False)

    if schema_tokens:
        setattr(self.model.config, "schema_tokens", schema_tokens)

    if schema_return:  # fix: originally re-checked schema_tokens here
        setattr(self.model.config, "schema_return", schema_return)

    # NOTE: this check assumes the class defines `tokenizer = None` as a
    # default attribute for the branches that do not set it.
    if self.tokenizer is None:
        # Update tokenizer settings (if not set already)
        args = locals()
        custom_tokenizer = False
        for attr in [
            "vocab_file",
            "merges_file",
            "tokenizer_file",
            "bos_token",
            "eos_token",
            "unk_token",
        ]:
            if args[attr] is not None:
                custom_tokenizer = True
                setattr(self, attr, args[attr])

        if custom_tokenizer:
            logger.info("Using a custom tokenizer.")
        else:
            logger.info("Using the default GPT-2 Tokenizer.")

        if tokenizer_file:
            # load the custom GPT-2 tokenizer from a serialized tokenizer
            self.tokenizer = GPT2TokenizerFast(
                vocab_file=None,
                merges_file=None,
                tokenizer_file=tokenizer_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
            )
        else:
            self.tokenizer = GPT2TokenizerFast(
                vocab_file=self.vocab_file,
                merges_file=self.merges_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
            )

        self.tokenizer.padding_side = "left"

    if to_gpu:
        if to_fp16:
            logger.warning(
                "Currently, FP16 text generation results in random output. "
                + "You may want to avoid using to_fp16 for the time being."
            )
            self.to_fp16()
        self.to_gpu()
#---------------------------------------------------------------------------------------#
# Model Initialization/Load
#---------------------------------------------------------------------------------------#

# Model Seed
seed = random.randrange(1, 100)
np.random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Model Configs:
gpt2_small_config = GPT2Config()
gpt2_medium_config = GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16)
gpt2_large_config = GPT2Config(n_ctx=1024, n_embd=1280, n_layer=36, n_head=20)

model_size = "medium"
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Model Loads:
model = GPT2LMHeadModel(gpt2_medium_config)  # reuse the config defined above
model.load_state_dict(torch.load("medium_ft.pkl"), strict=False)

device = torch.device("cuda")
model = model.to(device)
model.lm_head.weight.data = model.transformer.wte.weight.data
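# Minimal decode sketch (assumption: this checkpoint follows DialoGPT's
# convention of separating dialogue turns with <|endoftext|>):
context = tokenizer.encode("Does money buy happiness?" + tokenizer.eos_token,
                           return_tensors="pt").to(device)
reply_ids = model.generate(context, max_length=128, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(reply_ids[0, context.shape[-1]:], skip_special_tokens=True))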
if args.from_scratch:  # hypothetical flag: the opening of this branch was truncated in the snippet
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(args.tokenizer)  # fix: the .model file must be loaded before bos_id()/eos_id() are queried
    # here we assume that you have already added the special tokens from the dataset
    with open(args.tokenizer.replace(".model", ".vocab"), "r") as f:
        vocab_size = len(f.readlines())
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=args.maxlen,
        n_ctx=args.maxlen,
        n_embd=args.n_embd,
        n_layer=args.n_layer,
        n_head=args.n_head,
        activation_function=args.activation_function,
        resid_pdrop=args.resid_pdrop,
        embd_pdrop=args.embd_pdrop,
        attn_pdrop=args.attn_pdrop,
        layer_norm_epsilon=args.layer_norm_epsilon,
        initializer_range=args.initializer_range,
        summary_type=args.summary_type,
        summary_use_proj=args.summary_use_proj,
        summary_activation=args.summary_activation,
        summary_proj_to_labels=args.summary_proj_to_labels,
        summary_first_dropout=args.summary_first_dropout,
        bos_token_id=tokenizer.bos_id(),
        eos_token_id=tokenizer.eos_id(),
    )
    model = GPT2LMHeadModel(config)
else:
    print("🔋 Finetuning model from huggingface's transformers")
    tokenizer = GPT2Tokenizer.from_pretrained(args.model)
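# Hedged sanity check (not in the original): when building from scratch, the
# .vocab line count should match the loaded SentencePiece model, otherwise the
# GPT-2 embedding lookup can index out of range during training.
if isinstance(tokenizer, spm.SentencePieceProcessor):
    assert tokenizer.GetPieceSize() == vocab_size, "tokenizer/.vocab size mismatch"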
def main():
    # Create the argument parser.
    parser = argparse.ArgumentParser()
    parser.add_argument("--print-checkpoint-structure", action="store_true")
    parser.add_argument(
        "path_to_checkpoint",
        type=str,
        help="Path to the checkpoint file (.zip archive or direct .pt file)",
    )
    parser.add_argument(
        "--config_file",
        default="",
        type=str,
        help="An optional config json file describing the pre-trained model.",
    )
    args = parser.parse_args()

    # Extract the containing directory; the converted files are written next to the checkpoint.
    basename = os.path.dirname(args.path_to_checkpoint)

    # Load the model.
    # The .zip is very optional; keep it for backward compatibility.
    print(f"Extracting PyTorch state dictionary from {args.path_to_checkpoint}")
    if args.path_to_checkpoint.endswith(".zip"):
        with zipfile.ZipFile(args.path_to_checkpoint, "r") as checkpoint:
            with checkpoint.open("release/mp_rank_00/model_optim_rng.pt") as pytorch_dict:
                input_state_dict = torch.load(pytorch_dict, map_location="cpu")
    else:
        input_state_dict = torch.load(args.path_to_checkpoint, map_location="cpu")

    # Read the config, or default to the model released by NVIDIA.
    if args.config_file == "":
        # Spell out all parameters in case the defaults change.
        config = GPT2Config(
            vocab_size=50257,
            n_positions=1024,
            n_ctx=1024,
            n_embd=1024,
            n_layer=24,
            n_head=16,
            n_inner=4096,
            activation_function="gelu",  # used to be "gelu_new" in earlier versions
            resid_pdrop=0.1,
            embd_pdrop=0.1,
            attn_pdrop=0.1,
            layer_norm_epsilon=1e-5,
            initializer_range=0.02,
            summary_type="cls_index",
            summary_use_proj=True,
            summary_activation=None,
            summary_proj_to_labels=True,
            summary_first_dropout=0.1,
            scale_attn_weights=True,
            use_cache=True,
            bos_token_id=50256,
            eos_token_id=50256,
        )
    else:
        config = GPT2Config.from_json_file(args.config_file)

    # Convert.
    print("Converting")
    output_state_dict = convert_megatron_checkpoint(args, input_state_dict, config)

    # Print the structure of the converted state dict.
    if args.print_checkpoint_structure:
        recursive_print(None, output_state_dict)

    # Store the config to file.
    output_config_file = os.path.join(basename, "config.json")
    output_config = config.to_dict()
    output_config["architectures"] = ["GPT2LMHeadModel"]
    output_config["model_type"] = "gpt2"
    print(f'Saving config to "{output_config_file}"')
    with open(output_config_file, "w") as f:
        json.dump(output_config, f)

    # Store the state_dict to file.
    output_checkpoint_file = os.path.join(basename, "pytorch_model.bin")
    print(f'Saving checkpoint to "{output_checkpoint_file}"')
    torch.save(output_state_dict, output_checkpoint_file)
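# After conversion, the output directory holds config.json and pytorch_model.bin,
# so the checkpoint reloads through the standard API (sketch; `basename` is the
# directory computed inside main()):
#   from transformers import GPT2LMHeadModel
#   model = GPT2LMHeadModel.from_pretrained(basename)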
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
import pandas as pd
import random
import torch
import pickle
import time
import slack
import os
import re
import wget

# Stuff for nlg
gpt2_medium_config = GPT2Config(n_ctx=1024, n_embd=1024, n_layer=24, n_head=16)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel(gpt2_medium_config)
model.load_state_dict(torch.load('/datafiles/medium_ft.pkl'), strict=False)
print('Tokenizer and model ready..')

# More stuff for nlg
eos = [tokenizer.encoder["<|endoftext|>"]]
num_words = 50
device = torch.device('cpu')
model.to(device)
model.lm_head.weight.data = model.transformer.wte.weight.data

# Load indexes
bert_annoy = AnnoyIndex(768, 'angular')
bert_annoy.load('/datafiles/dim768-trees13.ann')
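# Hedged sketch (the original service loop isn't shown): decode up to num_words
# new tokens with the standard generate() API, stopping at <|endoftext|>, and
# strip the prompt from the returned sequence.
def generate_reply(prompt: str) -> str:
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_length=input_ids.shape[-1] + num_words,
        eos_token_id=eos[0],
        pad_token_id=eos[0],
        do_sample=True,
        top_k=50,
    )
    return tokenizer.decode(output[0, input_ids.shape[-1]:], skip_special_tokens=True)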