def __init__(self, model_path, generation_type, use_finetuned=True):
    self.model_path = model_path
    self.batch_size = int(args["--batch-size"])
    self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self.MAX_LEN = {
        GENERATION_TYPE_SMALL: 20,
        GENERATION_TYPE_LARGE: 500
    }[generation_type]
    logger.info(
        f"Using {generation_type} for decoding, MAX_LEN={self.MAX_LEN}")

    if use_finetuned:
        logger.info("Using a finetuned model")
        self.config = GPT2Config.from_pretrained(self.model_path)
        model = GPT2LMHeadModel.from_pretrained(self.model_path)
        with open(f"{self.model_path}/special_tokens_map.json", "r") as f:
            special_tokens = json.load(f)
        self.tokenizer.add_special_tokens(special_tokens)
    else:
        logger.info("NOT using a finetuned model")
        model = GPT2LMHeadModel(config=GPT2Config.from_pretrained(
            pretrained_model_name_or_path=self.model_path))

    self.model = model.cuda()
    self.model.eval()
def __init__(self, config, dataset):
    super(GPT2Seq, self).__init__(config, dataset)

    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = GPT2TokenizerFast.from_pretrained(
        self.pretrained_model_path, pad_token='[PAD]')
    self.configuration = GPT2Config.from_pretrained(
        self.pretrained_model_path, pad_token_id=self.padding_token_idx)

    self.model = GPT2LMHeadModel.from_pretrained(
        self.pretrained_model_path, config=self.configuration)
    self.model.resize_token_embeddings(len(self.tokenizer))

    if config['task_type'] == "summarization":
        self.task_text = "TL;DR:"
    elif config['task_type'] == "translation":
        self.task_text = "story:"
    elif config['task_type'] == "multi_dialog":
        self.task_text = "question:"
    else:
        raise NotImplementedError(
            "Only summarization, translation and multi_dialog are supported.")

    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def __init__(self, hparams):
    super().__init__()
    self.hparams = hparams
    self.d = None
    self.tokenizer = None

    # hotfixes
    if 'unfreeze' not in hparams:
        self.hparams.unfreeze = False
    if 'lang' not in hparams:
        self.hparams.lang = 'nld'
    autofix_paths(self.hparams)

    # GPT with LM head and correct embedding size
    with open(Path('data') / self.hparams.lang / 'config.json') as f:
        cfg = json.load(f)

    if self.hparams.unfreeze:
        self.n_unfreeze = 0
        if self.hparams.resume_from_checkpoint is not None:
            print('Resuming from checkpoint: unfreezing all layers')
            self.n_unfreeze = None

    config = GPT2Config.from_pretrained(self.hparams.pretrained_path, **cfg)
    if self.hparams.unfreeze and self.n_unfreeze is not None:
        config.torchscript = True
    self.m = GPT2LMHeadModel.from_pretrained(self.hparams.pretrained_path,
                                             config=config)

    # Resize vocab
    self.m.resize_token_embeddings(self.hparams.vocab_size)
def load_model(train_steps, num_warmup_steps):
    try:
        # try to load a finetuned model from the local path
        tokenizer = load_tokenizer()
        config = GPT2Config.from_pretrained(configs.model_path,
                                            return_dict=False)
        model = TFGPT2LMHeadModel.from_pretrained(configs.model_path,
                                                  return_dict=False)
        print("model loaded from local!")
    except Exception as e:
        tokenizer = BertTokenizer.from_pretrained(
            "mymusise/gpt2-medium-chinese")
        model = TFGPT2LMHeadModel.from_pretrained(
            "mymusise/gpt2-medium-chinese", return_dict=False)
        print("model loaded from remote!")

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        # metrics=[metric]
    )
    return model
def __init__(self, model_path):
    config = GPT2Config.from_pretrained(model_path)
    config.output_hidden_states = True
    config.output_attentions = True
    self.model = GPT2LMHeadModel.from_pretrained(model_path, config=config)
    self.model.eval()
    self.context = ''
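Since the config above turns on hidden states and attentions, here is a minimal sketch (not part of the original class) of exercising such a model, assuming a transformers 4.x version whose outputs expose .hidden_states and .attentions when those config flags are set:

import torch
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

model_path = "gpt2"  # assumption: any GPT-2 checkpoint path works here
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
config = GPT2Config.from_pretrained(model_path,
                                    output_hidden_states=True,
                                    output_attentions=True)
model = GPT2LMHeadModel.from_pretrained(model_path, config=config).eval()

inputs = tokenizer("The quick brown fox", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(len(outputs.hidden_states))   # n_layer + 1 tensors (embeddings + one per block)
print(outputs.attentions[0].shape)  # (batch, heads, seq_len, seq_len)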
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a pretrained model by supplying

    * the name of a remote model on s3 ("gpt2" ...)
    * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
    * OR a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
    :type pretrained_model_name_or_path: str
    """
    gpt2 = cls()
    if "farm_lm_name" in kwargs:
        gpt2.name = kwargs["farm_lm_name"]
    else:
        gpt2.name = pretrained_model_name_or_path

    # We need to differentiate between loading a model in FARM format and in Pytorch-Transformers format
    farm_lm_config = Path(
        pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        gpt2_config = GPT2Config.from_pretrained(farm_lm_config)
        farm_lm_model = Path(
            pretrained_model_name_or_path) / "language_model.bin"
        gpt2.model = GPT2Model.from_pretrained(farm_lm_model,
                                               config=gpt2_config,
                                               **kwargs)
        gpt2.language = gpt2.model.config.language
    else:
        # Pytorch-Transformers style
        gpt2.model = GPT2Model.from_pretrained(
            str(pretrained_model_name_or_path), **kwargs)
        gpt2.language = cls._get_or_infer_language_from_name(
            language, pretrained_model_name_or_path)
    return gpt2
def __init__(self, train_dataloader, val_dataloader=None):
    """
    Initialises Trainer by defining the model and GPU.

    Args:
        train_dataloader: torch.utils.data.DataLoader
            Dataloader to train the model on, obtained from the DataLoader class
        val_dataloader: Optional torch.utils.data.DataLoader
            Dataloader to validate the model on, obtained from the DataLoader class;
            not required if Trainer is only used for final training
    """
    # Create GPT2 config
    config = GPT2Config.from_pretrained("gpt2")

    # Load the language-model head model with the default config
    model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

    # Recreate tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              pad_token='<|pad|>')

    # Tell the model we have added bos, eos and pad tokens
    model.resize_token_embeddings(len(tokenizer))

    # Tell pytorch to run this model on the GPU.
    device = torch.device("cuda")
    model.cuda()

    self.model = model
    self.device = device
    self.train_dataloader = train_dataloader
    self.val_dataloader = val_dataloader
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--config_path", default="../../models/gpt2/gpt2-config.json", type=str, required=False) parser.add_argument("--model_path", default="../../models/gpt2/gpt2-pytorch_model.bin", type=str, required=False) parser.add_argument("--vocab_path", default="../../models/gpt2/gpt2-vocab.json", type=str, required=False) parser.add_argument("--merges_path", default="../../models/gpt2/gpt2-merges.txt", type=str, required=False) parser.add_argument( "--sentence", default="In this article, I am excited to take you through", type=str, required=False) args = parser.parse_args() config = GPT2Config.from_pretrained(args.config_path) model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config) tokenizer = GPT2Tokenizer(args.vocab_path, args.merges_path) # logging.basicConfig(filename="default.txt", level=logging.DEBUG, filemode='w') # gpt2_generate_greedy(model, tokenizer, sentence=sys.argv[1]) gpt2_generate_beam_search(model, tokenizer, sentence=args.sentence)
def __init__(self, config, dataset):
    super(GPT2, self).__init__(config, dataset)

    self.pretrained_model_path = config['pretrained_model_path']
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        self.pretrained_model_path,
        bos_token=dataset.sos_token,
        eos_token=dataset.eos_token,
        pad_token=dataset.padding_token)

    self.sos_token = self.tokenizer.bos_token
    self.eos_token = self.tokenizer.eos_token
    self.sos_token_idx = self.tokenizer.bos_token_id
    self.eos_token_idx = self.tokenizer.eos_token_id
    self.padding_token_idx = self.tokenizer.pad_token_id
    self.max_seq_length = config['max_seq_length']

    self.configuration = GPT2Config.from_pretrained(
        self.pretrained_model_path,
        bos_token_id=self.sos_token_idx,
        eos_token_id=self.eos_token_idx,
        pad_token_id=self.padding_token_idx)

    self.decoder = GPT2LMHeadModel.from_pretrained(
        self.pretrained_model_path, config=self.configuration)
    self.decoder.resize_token_embeddings(len(self.tokenizer))

    self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                    reduction='none')
def load_model(checkpoint_dir):
    ckpt_dir = Path(checkpoint_dir)
    config = GPT2Config.from_pretrained(ckpt_dir / "config.json")
    tokenizer = GPT2Tokenizer.from_pretrained(str(ckpt_dir))
    model = GPT2TANDAModel(config)
    model.load_state_dict(torch.load(ckpt_dir / "pytorch_model.bin"))
    model.eval()
    return model, tokenizer
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hidden_layer_num', type=int,
                        help="Index 0..24 of the hidden-state layer to extract "
                             "(gpt2-medium has 24 blocks plus the embedding layer)")
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    config = GPT2Config.from_pretrained('gpt2-medium',
                                        output_hidden_states=True)
    gpt2 = GPT2Model.from_pretrained('gpt2-medium', config=config).cuda()

    logging.getLogger("transformers.tokenization_utils").setLevel(
        logging.ERROR)

    for subsample in ["train", "test"]:
        if not os.path.isdir(subsample):
            os.mkdir(subsample)
        df = pd.read_csv('{}.csv'.format(subsample))

        if os.path.isfile(f'{subsample}_tokens_gpt2.pkl'):
            print("Loading token ids...", file=sys.stderr)
            tokens = joblib.load(f'{subsample}_tokens_gpt2.pkl')
        else:
            print("Transforming texts to token ids...", file=sys.stderr)
            tokens = [tokenizer.encode(x) for x in tqdm(df.texts)]
            joblib.dump(tokens, f'{subsample}_tokens_gpt2.pkl')

        dataset = DiscourseDataset(tokens, pad_token_id=0,
                                   max_len=config.n_positions)
        dataloader = torch.utils.data.DataLoader(dataset,
                                                 batch_size=args.batch_size)

        gpt2.eval()
        mean_results, max_results = list(), list()
        with torch.no_grad():
            for num, (token_ids, attention_ids) in enumerate(tqdm(dataloader), 1):
                _, _, hidden_states = gpt2(token_ids,
                                           attention_mask=attention_ids)
                hidden_states_cpu = [x.cpu().numpy() for x in hidden_states]
                del hidden_states
                gc.collect()

                output = hidden_states_cpu[args.hidden_layer_num]
                del hidden_states_cpu

                sentence_lens = attention_ids.sum(1).cpu().numpy()
                # Zero out padded positions before pooling over the sequence dimension.
                output_zero_padding = output.transpose([2, 0, 1]) * attention_ids.cpu().numpy()
                output_zero_padding = output_zero_padding.transpose([1, 2, 0])

                mean_result = (output_zero_padding.sum(1).T / sentence_lens).T
                max_result = np.array([matrix[:length].max(0)
                                       for matrix, length in zip(output_zero_padding,
                                                                 sentence_lens)])
                mean_results.append(mean_result)
                max_results.append(max_result)
                torch.cuda.empty_cache()

        np.save(f'{subsample}/gpt2_mean_embeddings_layer_{args.hidden_layer_num}',
                np.vstack(mean_results))
        np.save(f'{subsample}/gpt2_max_embeddings_layer_{args.hidden_layer_num}',
                np.vstack(max_results))
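The transpose arithmetic above can be hard to read; this is a small, self-contained sketch (illustrative only, toy shapes) of the same masked mean/max pooling over padded sequences:

import numpy as np

# Toy stand-ins: batch of 2 sequences, length 4, hidden size 3.
hidden = np.random.rand(2, 4, 3)             # (batch, seq, hidden)
mask = np.array([[1, 1, 1, 0],               # (batch, seq); 0 marks padding
                 [1, 1, 0, 0]])

lengths = mask.sum(1)                        # true length of each sequence
masked = hidden * mask[:, :, None]           # zero out padded positions

mean_pooled = masked.sum(1) / lengths[:, None]                     # (batch, hidden)
max_pooled = np.array([m[:l].max(0) for m, l in zip(masked, lengths)])
print(mean_pooled.shape, max_pooled.shape)   # (2, 3) (2, 3)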
def __init__(self): self.batch_size = int(args["--batch-size"]) self.config = GPT2Config.from_pretrained("gpt2-medium") self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2-medium') self.model = model.cuda() self.model.eval()
def __init__(self, max_output_length=25, max_input_length=300, device='cpu',
             tokenizer_type='gpt2', bpe_model="", starter_model=None):
    if tokenizer_type == "gpt2":
        self.tokenizer = utils_tokenizer.GPT2Tokenizer()
        config = GPT2Config.from_pretrained("gpt2")
    elif tokenizer_type == "bpecap":
        self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
        config = GPT2Config.from_dict({
            "finetuning_task": None,
            "initializer_range": 0.02,
            "layer_norm_epsilon": 1e-05,
            "n_ctx": 1024,
            "n_embd": 768,
            "n_head": 12,
            "n_layer": 12,
            "n_positions": 1024,
            "num_labels": 1,
            "resid_pdrop": 0.1,
            "use_bfloat16": False,
            "vocab_size": self.tokenizer.vocab_size
        })
    else:
        print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
        exit()

    self.model = GPT2LMHeadModel(config)
    self.model.to(device)
    self.device = device

    if starter_model is not None:
        self.reload(starter_model)

    self.max_output_length = max_output_length
    self.max_input_length = max_input_length

    self.model.train()
    self.mode = "train"
def __init__(self, model_name: str) -> None: super().__init__() config = GPT2Config.from_pretrained(model_name) self.input_dim = config.hidden_size self.output_dim = config.vocab_size # TODO(mattg): It's possible that we could use some kind of cache like we have in # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel. That way, we # would only load the GPT2 weights once. Though, it's not clear how to do that here, as we # need to load `GPT2LMHeadModel`, not just `GPT2Model`... gpt2_model = GPT2LMHeadModel.from_pretrained(model_name) self.gpt2_lm_head = gpt2_model.lm_head
def test_train_with_configs(self): MODEL_ID = "sshleifer/tiny-gpt2" config = GPT2Config.from_pretrained(MODEL_ID) benchmark_args = PyTorchBenchmarkArguments(models=[MODEL_ID], training=True, no_inference=True, sequence_lengths=[8], batch_sizes=[1]) benchmark = PyTorchBenchmark(benchmark_args, configs=[config]) results = benchmark.run() self.check_results_dict_not_empty(results.time_train_result) self.check_results_dict_not_empty(results.memory_train_result)
def load(self, checkpoint_path):
    """
    Load the model, etc
    """
    logging.info("Loading model")
    config = GPT2Config.from_pretrained(checkpoint_path)
    model = GPT2SegmentedModel.from_pretrained(
        checkpoint_path, config=config, cache_dir=self.args.cache_dir)

    if torch.cuda.is_available():
        model = model.cuda()

    self.model = StaticDataParallel(model)
def __init__(self, vilbert, gpt2_tokenizer, gpt2_embed_dim=768, config=None):
    nn.Module.__init__(self)
    self.gpt2_tokenizer = gpt2_tokenizer
    self.gpt2_embed_dim = gpt2_embed_dim
    self.embed = torch.nn.Linear(config.bi_hidden_size, self.gpt2_embed_dim)
    self.gpt2_config = GPT2Config.from_pretrained('gpt2')
    self.gpt2_model = GPT2LMHeadModel.from_pretrained(
        'gpt2', from_tf=False, config=self.gpt2_config)
    self.vilbert_model = vilbert
def __init__(self, model_path):
    self.model_path = model_path
    self.batch_size = int(args["--batch-size"])
    self.config = GPT2Config.from_pretrained(self.model_path)

    with open(f"{self.model_path}/special_tokens_map.json", "r") as f:
        special_tokens = json.load(f)
        # special_tokens["pad_token"] = Gpt2Generator.PAD_TOKEN

    self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    self.tokenizer.add_special_tokens(special_tokens)

    model = GPT2LMHeadModel.from_pretrained(self.model_path)
    self.model = model.cuda()
    self.model.eval()
def __init__(self, tokenizer, gpt2_config, segment=True):
    config = GPT2Config.from_pretrained(gpt2_config)
    super(GPT2Summ, self).__init__(config)
    self.transformer = GPT2Model.from_pretrained(gpt2_config)
    self.transformer.resize_token_embeddings(len(tokenizer))

    self.user_id = [
        tokenizer.convert_tokens_to_ids('<user1>'),
        tokenizer.convert_tokens_to_ids('<user2>')
    ]
    self.know_id = tokenizer.convert_tokens_to_ids('<knowledge>')
    self.segment = segment

    self.lm_head = nn.Linear(config.n_embd, len(tokenizer), bias=False)
    self.config.vocab_size = len(tokenizer)
    self.tie_weights()
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: must call this, otherwise a special token inside a string is
        # split into single characters instead of being kept as one token
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
def test_language_generate_greedy(max_len=64):
    config = GPT2Config.from_pretrained("./models/gpt2/config.json")
    model = GPT2LMHeadModel.from_pretrained("./models/gpt2/pytorch_model.bin",
                                            config=config)

    token_list = [3, 5, 2]
    input = torch.tensor(token_list).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        for i in range(max_len):
            output = model(input)[0]
            output_id = int(output.max(2)[1][0, -1])
            token_list.append(output_id)
            input = torch.tensor(token_list[-24:]).unsqueeze(0)
            if output_id == 0:
                break
    print(token_list)
def _initialize(self):
    """
    Load the dataset, model, etc
    """
    cache_dir = self.args.cache_dir
    model_name = self.args.model.model_name

    logging.info("Loading dataset")
    self.dataset = StoriumDataset("train", "gpt2", cache_dir=cache_dir)
    self.dataset.load(self.args.data_dir)

    # By default the config outputs "past", but that makes our chunked
    # scattering (needed when batching based on tokens, rather than
    # examples) fail since the huggingface/transformers package stacks the
    # outputs on dim 0, which is normally the batch dimension. This leads
    # to errors like:
    #
    #  RuntimeError: Gather got an input of invalid size: got [2, 5, 12,
    #  411, 64], but expected [2, 4, 12, 411, 64] (gather at
    #  /pytorch/torch/csrc/cuda/comm.cpp:226)
    #
    # During training we only care about the loss, so just disable all
    # additional outputs.
    config = GPT2Config.from_pretrained(model_name, cache_dir=cache_dir)
    config.output_hidden_states = False
    config.output_attentions = False
    config.output_past = False

    model = GPT2SegmentedModel.from_pretrained(model_name,
                                               config=config,
                                               cache_dir=cache_dir)

    tokenizer = self.dataset.get_tokenizer()
    model.resize_token_embeddings(len(tokenizer))

    max_steps = self.args.optim.max_steps
    optimizer = AdamW(model.parameters(), lr=self.args.optim.lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=max_steps,
        num_warmup_steps=self.args.optim.warmup_steps,
    )

    # Track the modules
    self.modules["model"] = model
    self.modules["optimizer"] = optimizer
    self.modules["scheduler"] = scheduler
def main(): device = torch.device("cuda") args = parser.parse_args() args.n_gpu = torch.cuda.device_count() args.device = device logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) if args.local_rank not in [-1, 0]: torch.distributed.barrier() config = GPT2Config.from_pretrained(args.model_path) tokenizer = GPT2Tokenizer.from_pretrained(args.model_path, do_lower_case=False) if args.block_size <= 0: args.block_size = tokenizer.max_len_single_sentence args.block_size = min(args.block_size, tokenizer.max_len_single_sentence) model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config) model.to(args.device) if args.local_rank == 0: torch.distributed.barrier() logger.info("Training/evaluation parameters %s", args) if args.local_rank not in [-1, 0]: torch.distributed.barrier() train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False) if args.local_rank == 0: torch.distributed.barrier() run_train(args, train_dataset, model, tokenizer) if args.local_rank == -1 or torch.distributed.get_rank() == 0: if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) model_to_save = model.module if hasattr(model, 'module') else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
def __init__(self):
    # tokenizer
    self.tokenizer = GPT2Tokenizer.from_pretrained(GPT2_TYPE)

    # special tokens
    self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    self.tokenizer.add_special_tokens(
        {'additional_special_tokens': dataset_tokens})

    # chess tokens
    self.tokenizer.add_tokens(get_chess_tokens())

    # model
    self.configuration = GPT2Config.from_pretrained(GPT2_TYPE)
    self.model = GPT2LMHeadModel.from_pretrained(
        GPT2_TYPE, config=self.configuration).cuda()
    self.model.resize_token_embeddings(len(self.tokenizer))
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
def get_model(extra_args):
    if extra_args.use_english_weights:
        model = GPT2LMHeadModel.from_pretrained(extra_args.identifier)
    else:
        model = GPT2LMHeadModel(
            GPT2Config.from_pretrained(extra_args.identifier))

    wte = model.transformer.wte
    if extra_args.wte_path is not None:
        wte.weight = nn.Parameter(torch.load(extra_args.wte_path))
    else:
        mean, std = wte.weight.mean().item(), wte.weight.std().item()
        wte.weight = nn.Parameter(
            torch.normal(mean, std, size=wte.weight.size()))

    # tie input and output embeddings
    model.lm_head.weight = model.transformer.wte.weight
    model.tie_weights()

    return model
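After tie_weights(), the input embedding matrix and the LM head share the same storage. A quick illustrative check (not from the original code) that the tying took effect, using a small randomly initialized GPT-2:

import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Illustrative check only: tiny random GPT-2, then verify the LM head
# reuses the token-embedding weights after tying.
model = GPT2LMHeadModel(GPT2Config(n_layer=2, n_head=2, n_embd=64))
model.lm_head.weight = model.transformer.wte.weight
model.tie_weights()

assert model.lm_head.weight.data_ptr() == model.transformer.wte.weight.data_ptr()
print("input and output embeddings share the same storage")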
def load_finetuned_gpt2(weight_path: str, zero_shot: bool = False):
    # Load the model
    if zero_shot:
        model = GPT2LMHeadModel.from_pretrained('gpt2')
    else:
        config = GPT2Config.from_pretrained('gpt2')
        model = GPT2LMHeadModel(config)
    print('{} defined'.format(model.__class__.__name__))

    model_weights = torch.load(
        weight_path, map_location=lambda storage, loc: storage)['state_dict']
    corrected_model_weights = {}
    for k, v in model_weights.items():
        corrected_model_weights[k.replace('model.', '')] = v
    print('Loaded model weights from {}'.format(weight_path))

    model.load_state_dict(corrected_model_weights, strict=True)
    print('{} loaded with checkpoint weights and sent to GPU!'.format(
        model.__class__.__name__))

    return model
def load_dialogpt_zeroshot(weight_path: str):
    # Create object
    config = GPT2Config.from_pretrained('gpt2')
    model = GPT2LMHeadModel(config)
    print('{} defined'.format(model.__class__.__name__))

    # Obtain weights from file
    model_weights = torch.load(weight_path,
                               map_location=lambda storage, loc: storage)

    # Load model weights
    model_weights = {
        k.replace('module.', ''): v for k, v in model_weights.items()
    }
    if 'lm_head.decoder.weight' in model_weights:
        # Compatibility with newer versions of the `transformers` package
        model_weights['lm_head.weight'] = model_weights.pop(
            'lm_head.decoder.weight')
    model.load_state_dict(model_weights, strict=True)
    print('Model loaded from {}'.format(weight_path))

    return model
epochs = 1
learning_rate = 1e-5
# warmup_steps = 1e2
epsilon = 1e-8

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

SAVE_PATH = "/mnt/nfs/work1/llcao/zonghaiyao/LM/"

# I'm not really doing anything with the config, but here it is.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2', pad_token='<|endoftext|>')  # gpt2-medium

# instantiate the model
# model = rerankGPT2LMHeadModel_stage1_all_tokens_stage2_all_tokens.from_pretrained(
#     "/mnt/nfs/work1/llcao/zonghaiyao/LM/results/stage1_all_tokens_start_after_finetuning/stage1_all_tokens_stage2_all_tokens/lr5e4_add_scrach_bs24/200000",
#     config=configuration,
#     MAX_LEN=MAX_LEN,
#     CAN_NUM=CAN_NUM,
#     num_of_rerank=num_of_rerank)
model = rerankGPT2LMHeadModel_stage1_all_tokens_stage2_all_tokens.from_pretrained(
    "gpt2",
    config=configuration,
    MAX_LEN=MAX_LEN,
    CAN_NUM=CAN_NUM,
    num_of_rerank=num_of_rerank)
def fine_tune_gpt2():
    # I'm not really doing anything with the config, but here it is.
    configuration = GPT2Config.from_pretrained('gpt2',
                                               output_hidden_states=False)

    # instantiate the model
    model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

    # this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
    # otherwise the tokenizer and model tensors won't match up
    model.resize_token_embeddings(len(tokenizer))

    # Tell pytorch to run this model on the GPU.
    device = torch.device("cuda")
    model.cuda()

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # some parameters I cooked up that work reasonably well
    epochs = 5
    learning_rate = 5e-4
    warmup_steps = 1e2
    epsilon = 1e-8

    # this produces sample output every 100 steps
    sample_every = 100

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    optimizer = AdamW(model.parameters(),
                      lr=learning_rate,
                      eps=epsilon)

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)

    def format_time(elapsed):
        return str(datetime.timedelta(seconds=int(round(elapsed))))

    total_t0 = time.time()
    training_stats = []
    model = model.to(device)

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            model.zero_grad()
            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

            # Get sample output every x batches.
            if step % sample_every == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,} of {:>5,}. Loss: {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), batch_loss, elapsed))

                model.eval()
                sample_outputs = model.generate(
                    bos_token_id=random.randint(1, 30000),
                    do_sample=True,
                    top_k=50,
                    max_length=200,
                    top_p=0.95,
                    num_return_sequences=1
                )
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(
                        i, tokenizer.decode(sample_output,
                                            skip_special_tokens=True)))
                model.train()

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids,
                                # token_type_ids=None,
                                attention_mask=b_masks,
                                labels=b_labels)
                loss = outputs[0]

            batch_loss = loss.item()
            total_eval_loss += batch_loss

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))