def prepare_config_and_inputs(self, gradient_checkpointing=False):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    config = GPTNeoConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_layers=self.num_hidden_layers,
        num_heads=self.num_attention_heads,
        max_position_embeddings=self.max_position_embeddings,
        use_cache=False,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        window_size=self.window_size,
        attention_types=self.attention_types,
        gradient_checkpointing=gradient_checkpointing,
    )

    return (config, input_ids, input_mask)
def prepare_config_and_inputs(self, gradient_checkpointing=False):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)

    mc_token_ids = None
    if self.use_mc_token_ids:
        mc_token_ids = ids_tensor([self.batch_size, self.num_choices], self.seq_length)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = GPTNeoConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_layers=self.num_hidden_layers,
        num_heads=self.num_attention_heads,
        max_position_embeddings=self.max_position_embeddings,
        use_cache=not gradient_checkpointing,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        gradient_checkpointing=gradient_checkpointing,
        window_size=self.window_size,
        attention_types=self.attention_types,
    )

    head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)

    return (
        config,
        input_ids,
        input_mask,
        head_mask,
        token_type_ids,
        mc_token_ids,
        sequence_labels,
        token_labels,
        choice_labels,
    )
def get_config(self):
    return GPTNeoConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_layers=self.num_hidden_layers,
        num_heads=self.num_attention_heads,
        max_position_embeddings=self.max_position_embeddings,
        use_cache=True,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        window_size=self.window_size,
        attention_types=self.attention_types,
    )
def get_config(self, gradient_checkpointing=False):
    return GPTNeoConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_layers=self.num_hidden_layers,
        num_heads=self.num_attention_heads,
        max_position_embeddings=self.max_position_embeddings,
        use_cache=not gradient_checkpointing,
        bos_token_id=self.bos_token_id,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
        gradient_checkpointing=gradient_checkpointing,
        window_size=self.window_size,
        attention_types=self.attention_types,
    )
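# Hedged standalone sketch (not from the original snippets): the helper above sets
# use_cache=not gradient_checkpointing because caching past key/values is not
# compatible with gradient checkpointing. An equivalent direct construction, with
# illustrative argument values only:
config_with_checkpointing = GPTNeoConfig(
    gradient_checkpointing=True,
    use_cache=False,
)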
def test_create_attention_mask(self):
    config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny")
    window_size = config.window_size
    batch_size, seq_length = 8, 1
    block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size)

    # causal_mask = layer._create_attention_mask(batch_size, seq_length, num_blocks, block_length, torch_device)
    causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
        batch_size, seq_length, config.window_size, torch_device
    )
    # check shapes
    expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length]
    self.assertListEqual(list(causal_mask.shape), expected_shape)
    # first window_size tokens in the first block are always padded
    # and should not be attended
    self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
    # each window can attend at most window_size tokens
    self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))

    # check if a user-provided attention_mask is handled correctly
    attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=torch_device)
    attention_mask[:, -3:] = 0  # don't attend last 3 tokens

    # causal_mask = layer._create_attention_mask(
    #     batch_size, seq_length, num_blocks, block_length, torch_device, attention_mask
    # )
    causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
        batch_size, seq_length, config.window_size, torch_device, attention_mask
    )
    # last 3 tokens will be in the last block and should have 0s in causal_mask
    self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0))
    # check shapes
    expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length]
    self.assertListEqual(list(causal_mask.shape), expected_shape)
    # first window_size tokens in the first block are always padded
    # and should not be attended
    self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
    # each window can attend at most window_size tokens
    self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))
def GPTNeoConfigCPU(vocab_size: int = 1000, bos_token_id: int = 0, eos_token_id: int = 0, **kwargs):
    """
    Returns a GPT Neo config more suitable for training on a regular consumer CPU.
    """
    return GPTNeoConfig(
        vocab_size=vocab_size,
        max_position_embeddings=64,
        hidden_size=256,
        window_size=32,
        intermediate_size=256,
        attention_types=[[["global", "local"], 2]],
        num_layers=4,
        num_heads=4,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        **kwargs,
    )
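# Hedged usage sketch (not from the original snippets): building a tiny GPT-Neo model
# from the CPU-sized config above. Assumes GPTNeoForCausalLM is importable from
# transformers; the vocab_size value is illustrative only.
from transformers import GPTNeoForCausalLM

cpu_config = GPTNeoConfigCPU(vocab_size=1000)
cpu_model = GPTNeoForCausalLM(cpu_config)
print(cpu_model.num_parameters())  # small enough to train on a consumer CPU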
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config_json = json.load(open(config_file, "r"))
    config = GPTNeoConfig(
        hidden_size=config_json["n_embd"],
        num_layers=config_json["n_layer"],
        num_heads=config_json["n_head"],
        attention_types=config_json["attention_types"],
        max_position_embeddings=config_json["n_positions"],
        resid_dropout=config_json["res_dropout"],
        embed_dropout=config_json["embed_dropout"],
        attention_dropout=config_json["attn_dropout"],
    )
    print(f"Building PyTorch model from configuration: {config}")
    model = GPTNeoForCausalLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
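# Hedged usage sketch (not from the original snippets): invoking the converter above.
# The file paths are placeholders, and the config JSON is assumed to contain the
# Mesh-TensorFlow key names read by convert_tf_checkpoint_to_pytorch (n_embd,
# n_layer, n_head, attention_types, n_positions, and the dropout keys).
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./gpt_neo_tf_checkpoint",
    config_file="./gpt_neo_tf_checkpoint/config.json",
    pytorch_dump_path="./gpt_neo_pytorch",
)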
def get_large_model_config(self):
    return GPTNeoConfig.from_pretrained("gpt-neo-125M")
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoConfig, AdamW
from torch.utils.data import IterableDataset, DataLoader
from lm_dataformat import *
import torch
import torch.nn.functional as F
from torch.nn.functional import normalize, cross_entropy
from torch.nn import DataParallel
from auto_tqdm import tqdm
from get_args import get_args
import deepspeed

args = get_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create model, set neo_hidden
conf = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
conf.gradient_checkpointing = True
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", config=conf)
model.training = True
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
neo_hidden = model.config.hidden_size

# Resize token embeddings. Two extra tokens
model.resize_token_embeddings(len(tokenizer) + 2)

# Set up DeepSpeed
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, model_parameters=model.parameters()
)
model_engine.to(model_engine.local_rank)

# Initialize a random projection matrix
clip_hidden = 512
projection = torch.nn.Linear(neo_hidden, clip_hidden,