def __init__(self, model_name, num_labels: int, clf_dropout=0.2):
    super().__init__()
    self.transformer = GPT2Model.from_pretrained(model_name)
    self.dropout = nn.Dropout(clf_dropout)
    self.linear = nn.Linear(self.transformer.config.n_embd * 2, num_labels)
    nn.init.normal_(self.linear.weight, std=0.02)
    nn.init.normal_(self.linear.bias, 0)
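# Note (not from the original source): the n_embd * 2 input to self.linear implies the
# forward pass concatenates two pooled views of the GPT-2 hidden states. The forward
# method is not shown in this snippet; a common pattern, assumed here for illustration
# only, is mean- plus max-pooling over the token dimension:
import torch

def pool_hidden_states(hidden):           # hidden: (batch, seq_len, n_embd)
    mean_pool = hidden.mean(dim=1)        # (batch, n_embd)
    max_pool = hidden.max(dim=1).values   # (batch, n_embd)
    return torch.cat([mean_pool, max_pool], dim=-1)  # (batch, n_embd * 2)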
def extract_gpt2_hidden_activations(text_path, save_activs_to):
    # Read in the text samples to pass through GPT-2.
    with open(text_path, "rb") as infile:
        text_inputs = pickle.load(infile)

    # Load the pre-trained tokenizer (vocabulary) and model once; assumes a GPU is available.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2')
    model.eval()
    model.to('cuda')

    # Grab the hidden activations for each text sample.
    layer_activs = []
    for text in text_inputs:
        indexed_tokens = tokenizer.encode(text)
        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
        with torch.no_grad():
            hidden, _ = model(tokens_tensor)
        layer_activs.append(hidden.cpu().numpy().squeeze())
        # Clear GPU memory in preparation for the next text sample.
        torch.cuda.empty_cache()

    # Save the activations.
    with open(save_activs_to, "wb") as outfile:
        pickle.dump(layer_activs, outfile)
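# Usage sketch (not part of the original source): pickle a list of texts, run
# extract_gpt2_hidden_activations (above), and reload the saved activations. The file
# names are hypothetical; a CUDA device is required because the function moves tensors to 'cuda'.
import pickle

texts = ["The quick brown fox jumps over the lazy dog.",
         "GPT-2 returns one hidden state per input token."]
with open("texts.pkl", "wb") as f:
    pickle.dump(texts, f)

extract_gpt2_hidden_activations("texts.pkl", "activations.pkl")

with open("activations.pkl", "rb") as f:
    activations = pickle.load(f)
print(len(activations), activations[0].shape)  # each entry is (num_tokens, 768) for base GPT-2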
def download_model(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    if not exists(PATH):
        print("# ", str(PATH), "not found, creating dir.")
        mkdir(PATH)
    print('# Downloading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded word2vec')
        else:
            print('# Already downloaded')
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            wget.download('http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip')
            zip = zipfile.ZipFile('./glove.840B.300d.zip')
            zip.extractall()
            _ = glove2word2vec('./glove.840B.300d.txt', join(PATH, name_path))
            print('# Downloaded glove')
        else:
            print('# Already downloaded')
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            wget.download('https://dict2vec.s3.amazonaws.com/dict2vec300.tar.bz2')
            tar = tarfile.open("dict2vec300.tar.bz2")
            tar.extractall()
            tar.close()
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded dict2vec')
        else:
            print('# Already downloaded')
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            wget.download('https://conceptnet.s3.amazonaws.com/downloads/2019/numberbatch/numberbatch-en-19.08.txt.gz')
            shutil.move(name_path, join(PATH, name_path))
            print('# Downloaded Conceptnet Numberbatch')
        else:
            print('# Already downloaded')
    if name == 'bert' or name == 'bert-context':
        _ = BertTokenizer.from_pretrained('bert-large-uncased')
        _ = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        print('# Downloaded bert')
    if name == 'gpt2' or name == 'gpt2-context':
        _ = GPT2Tokenizer.from_pretrained('gpt2')
        _ = GPT2LMHeadModel.from_pretrained('gpt2')
        _ = GPT2Model.from_pretrained('gpt2')
        print('# Downloaded gpt-2')
def construct_encoder(self):
    model = GPT2Model.from_pretrained(self.model_name)
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(self.model_name)
    print("Model and tokenizer are constructed!")
    return model, tokenizer
def __init__(self, config, clf_dropout=0.4, n_class=8):
    super(GPT2ClassificationHeadModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.dropout = nn.Dropout(clf_dropout)
    self.linear = nn.Linear(config.n_embd * 3, n_class)
    nn.init.normal_(self.linear.weight, std=0.02)
    nn.init.normal_(self.linear.bias, 0)
    self.apply(self.init_weights)
def create_gpt2_model(self, config, input_ids, token_type_ids, position_ids,
                      mc_labels, lm_labels, mc_token_ids):
    model = GPT2Model(config)
    model.eval()
    hidden_states, presents = model(input_ids, position_ids, token_type_ids)
    outputs = {
        "hidden_states": hidden_states,
        "presents": presents,
    }
    return outputs
def __init__(self, pretrain_path, dropout=0.1):
    super(Gpt2Model, self).__init__()
    self.bert = GPT2Model.from_pretrained(pretrain_path)
    self.dropout = nn.Dropout(dropout)
    self.aux_head = nn.Sequential(OrderedDict([
        ('dropout', nn.Dropout(dropout)),
        ('clf', nn.Linear(self.bert.config.n_embd, 6)),
    ]))
    self.main_head = nn.Sequential(OrderedDict([
        ('dropout', nn.Dropout(dropout)),
        ('clf', nn.Linear(self.bert.config.n_embd, 1)),
    ]))
def __init__(self, config, cls_id, clf_dropout=0.4, n_class=8, head_start_layer=0):
    super(GPT2ClassificationHeadModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.apply(self.init_weights)
    self.head = CustomHead(config, n_class, dropout=clf_dropout, start_layer=head_start_layer)
    self.cls_id = cls_id
def from_opt(cls, opt):
    if 'pretrained' not in opt:
        return cls(opt['n_vocab'],
                   d_word_vec=opt['d_word_vec'],
                   d_model=opt['d_model'],
                   len_max_seq=opt['len_max_seq'],
                   n_layer=opt['n_layer'],
                   d_inner=opt['d_inner'],
                   n_head=opt['n_head'],
                   slf_attn=opt['slf_attn'],
                   d_k=opt['d_k'],
                   d_v=opt['d_v'],
                   feat_vocab=opt['feat_vocab'],
                   d_feat_vec=opt['d_feat_vec'],
                   layer_attn=opt['layer_attn'],
                   slf_attn_mask=opt['mask_slf_attn'],
                   dropout=opt['dropout'],
                   attn_dropout=opt['attn_dropout'])
    elif opt['pretrained'].count('bert'):
        pretrained = BertModel.from_pretrained(opt['pretrained'])
        return cls(opt['n_vocab'], pretrained=pretrained,
                   layer_attn=opt['layer_attn'], model_name='bert')
    elif opt['pretrained'].count('gpt2'):
        pretrained = GPT2Model.from_pretrained(opt['pretrained'])
        return cls(opt['n_vocab'], pretrained=pretrained, model_name='gpt2')
    else:
        raise ValueError("Other pretrained models haven't been supported yet")
def __init__(self, cuda_device=-1):
    super(GPT2Embedder, self).__init__()
    self.cuda_device = 'cpu' if cuda_device == -1 else f'cuda:{cuda_device}'
    # Load pre-trained model tokenizer (vocabulary).
    self.enc = GPT2Tokenizer.from_pretrained('gpt2')
    # Load pre-trained model (weights).
    self.model = GPT2Model.from_pretrained('gpt2')
    self.model.to(self.cuda_device)
    self.model.eval()  # we only use the evaluation mode of the pretrained model
    self._bos_id = self.enc.encoder['<|endoftext|>']
    self._bos_past = None
def __init__(self, config):
    super(GPT2NeuralNet, self).__init__(config)
    self.gpt2 = GPT2Model(config)
    self.dropout = nn.Dropout(0.3)
    dense_size = config.n_embd * 2
    # Fully connected layers.
    self.linear1 = nn.Linear(config.n_embd * 2, dense_size)
    self.linear2 = nn.Linear(config.n_embd * 2, dense_size)
    self.linear_gate = nn.Linear(config.n_embd * 2 + dense_size, config.n_embd * 2)
    # Output layers.
    self.linear_out = nn.Linear(dense_size, 1)
    self.linear_aux_out = nn.Linear(dense_size, 5)
    self.linear_identity_out = nn.Linear(dense_size, 9)
    self.linear_np_out = nn.Linear(dense_size, 4)
    self.linear_identity_hidden = nn.Linear(config.n_embd * 2, dense_size)
    self.apply(self.init_weights)
def fetch_objects():
    bert = BertModel.from_pretrained('bert-base-uncased').embeddings.position_embeddings.weight.data
    gpt = OpenAIGPTModel.from_pretrained('openai-gpt').positions_embed.weight.data
    gpt2 = GPT2Model.from_pretrained('gpt2').wpe.weight.data
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    return ({'bert': bert, 'gpt': gpt, 'gpt2': gpt2},
            {'bert': bert_tokenizer, 'gpt': gpt_tokenizer, 'gpt2': gpt2_tokenizer})
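# Illustration only (not from the original source): inspect the positional-embedding
# tables returned by fetch_objects(). The shapes come from the pretrained configs,
# e.g. (512, 768) for bert-base-uncased and openai-gpt, and (1024, 768) for gpt2.
embeddings, tokenizers = fetch_objects()
for name, table in embeddings.items():
    print(name, tuple(table.shape))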
def transform(self, X):
    # Load the pre-trained tokenizer (vocabulary) and model (weights); assumes a GPU.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2', cache_dir='tmp/gpt2/')
    model.eval()
    model.to('cuda')

    output = []
    for idx, row in tqdm(X.iterrows(), total=len(X)):
        # Encode the input text and convert it to a PyTorch tensor on the GPU.
        indexed_tokens_1 = tokenizer.encode(row.text)
        tokens_tensor_1 = torch.tensor([indexed_tokens_1]).to('cuda')

        # Predict hidden states features for each layer.
        with torch.no_grad():
            hidden_states_1, past = model(tokens_tensor_1)

        tokens = [tokenizer.decoder[token].replace('Ġ', '') for token in indexed_tokens_1]
        output.append([tokens, hidden_states_1.cpu()[0]])

    output = pd.DataFrame(output, columns=['tokens', 'layer_-1'])

    res = []
    for idx, row in X.iterrows():
        res.append(self.get_sample_props(output.loc[idx], **row)[1:])
    res = pd.DataFrame(res, columns=[
        'tokens', 'pronoun_offset_token', 'a_offset_token', 'b_offset_token',
        'a_span', 'b_span', 'pronoun_token', 'a_tokens', 'b_tokens', 'bert', 'cls'
    ])

    cols = set(X.columns).difference(res.columns)
    return {'X': pd.concat([X[cols], res], axis=1)}
def extract_gpt2_hidden_word_representations(word, save_activs_to):
    # Load the pre-trained tokenizer (vocabulary) and model; assumes a GPU is available.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2Model.from_pretrained('gpt2')
    model.eval()
    model.to('cuda')

    # Tokenize the word (it may be split into several sub-word tokens).
    indexed_tokens = tokenizer.encode(word)
    num_tokens = len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')

    # Get the token-wise activations and sum them element-wise along the token
    # dimension to obtain a single word-vector representation.
    with torch.no_grad():
        hidden, _ = model(tokens_tensor)
        hidden_np = hidden.cpu().numpy()
        # The token dimension of the (1, num_tokens, n_embd) output is dimension 1.
        seq_dim = 1
        word_vec = np.sum(hidden_np, axis=seq_dim).squeeze()

    # Clear GPU memory.
    torch.cuda.empty_cache()

    # Save the word vector.
    with open(save_activs_to, "wb") as outfile:
        pickle.dump(word_vec, outfile)
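# Usage sketch (not part of the original source): extract a single word vector with
# extract_gpt2_hidden_word_representations (above) and reload it; the file name is
# hypothetical and a CUDA device is assumed.
import pickle

extract_gpt2_hidden_word_representations("hedgehog", "hedgehog_vec.pkl")
with open("hedgehog_vec.pkl", "rb") as f:
    word_vec = pickle.load(f)
print(word_vec.shape)  # (768,) -- sub-word token activations summed into one vector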
def load_model_fromlist(name):
    if name not in MODELS:
        raise Exception(str(name) + ' not a model in the list')
    print('# Loading model: ' + str(name))
    name_path = MODEL_PATH_DICT[name]
    if name == 'word2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path), binary=True)
    if name == 'glove':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path))
    if name == 'dict2vec':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path),
                                                               binary=False, unicode_errors="ignore")
    if name == 'conceptnet':
        if not exists(join(PATH, name_path)):
            download_model(name)
        return gensim.models.KeyedVectors.load_word2vec_format(join(PATH, name_path))
    if name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased').embeddings.word_embeddings.weight.data.numpy()
        return [model, tokenizer]
    if name == 'bert-context':
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = BertModel.from_pretrained('bert-large-uncased', output_hidden_states=True)
        return [model, tokenizer]
    if name == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2').transformer.wte.weight.data.numpy()
        return [model, tokenizer]
    if name == 'gpt2-context':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2', output_hidden_states=True)
        return [model, tokenizer]
def __init__(self, model_name, add_dense=True, trainable=False):
    super().__init__()
    self.model_name = model_name
    self.add_dense = add_dense
    self.trainable = trainable
    if self.model_name == 'GPT':
        self.encoder = OpenAIGPTModel.from_pretrained('openai-gpt')
    elif self.model_name == 'GPT-2':
        self.encoder = GPT2Model.from_pretrained('gpt2')
    else:
        raise NotImplementedError(f'{self.model_name} -- No such model')
    if not self.trainable:
        for p in self.encoder.parameters():
            p.requires_grad = False
    if self.add_dense:
        self.dense = nn.Linear(in_features=768, out_features=128)
def Get_GPT2_Representation(self, examples):
    for i, example in enumerate(examples):
        if self.gpt2_tokenizer is None:
            self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        text = example.fgt_channels[0]
        indexed_tokens = self.gpt2_tokenizer.encode(text)
        tokens_tensor = torch.tensor([indexed_tokens])

        if self.gpt2 is None:
            self.gpt2 = GPT2Model.from_pretrained('gpt2')
            self.gpt2.eval()
        with torch.no_grad():
            hidden_states, past = self.gpt2(tokens_tensor)  # (1, num_tokens, 768)

        shape = np.array(hidden_states).shape
        a, b = shape[1], shape[2]
        representation, sum = np.zeros((a, b)), 0
        for layer in hidden_states:
            for words in layer.numpy():
                representation += words
                sum += 1
        if sum > 0:
            representation = representation * 1.0 / sum

        representation = list(representation)
        while len(representation) < pb.fgt_maxlength:
            representation.append(np.zeros(b))
        example.gpt2_mat = representation[0:pb.fgt_maxlength]
        print("{:.2%}".format(i * 1.0 / len(examples)))
def test_model_from_pretrained(self):
    cache_dir = "/tmp/pytorch_pretrained_bert_test/"
    for model_name in list(PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
def _init_model_tokenizer(self):
    from pytorch_pretrained_bert import GPT2Model, GPT2Tokenizer
    self._tokenizer = GPT2Tokenizer.from_pretrained(self.model_dir)
    self._model = GPT2Model.from_pretrained(self.model_dir)
    self._model.eval()
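# Minimal sketch (not from the original source) of how the model and tokenizer
# initialised above are typically used with the pytorch_pretrained_bert API:
# encode a sentence and take the final hidden states as contextual token embeddings.
import torch
from pytorch_pretrained_bert import GPT2Model, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()

token_ids = tokenizer.encode("Who was Jim Henson?")
with torch.no_grad():
    hidden_states, past = model(torch.tensor([token_ids]))
print(hidden_states.shape)  # (1, num_tokens, 768)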
def __init__(self, n_layers, in_size, out_size, embed_size, in_size_hier,
             hidden_size, proj_size, dropout=0.5, initialEmbW=None,
             independent=False, rnn_type='lstm', classifier='baseline',
             states_att=False, state_size=-1, embedding_init=None,
             weights_init=None, elmo_init=False, elmo_num_outputs=1,
             finetune_elmo=False, bert_init=False, bert_model=None,
             finetune_bert=False, add_word_emb=True, pretrained_all=True):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of output vectors.
        embed_size (int): Dimensionality of word embedding.
        hidden_size (int): Dimensionality of hidden vectors.
        proj_size (int): Dimensionality of projection before softmax.
        dropout (float): Dropout ratio.
    """
    # TODO
    att_size = 128
    self.rnn_type = rnn_type
    self.classifier = classifier
    super(HLSTMDecoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    if rnn_type == 'lstm':
        self.lstm = nn.LSTM(embed_size + in_size_hier, hidden_size, n_layers,
                            batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.lstm = nn.GRU(embed_size + in_size_hier, hidden_size, n_layers,
                           batch_first=True, dropout=dropout)
    if weights_init is not None:
        lstm_wt = weights_init['lstm']
        for k, v in lstm_wt.items():
            self.lstm.__getattr__(k).data.copy_(torch.from_numpy(v))

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.pretrained_all = pretrained_all
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb

    if False:  # if pretrained_all and elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif False:  # elif pretrained_all and bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    self.n_layers = n_layers
    self.dropout = dropout
    self.independent = independent
    self.states_att = states_att
    if states_att:
        self.ecW = nn.Linear(state_size, att_size)
        self.ysW = nn.Linear(hidden_size, att_size)
        hidden_size += state_size

    if classifier == 'baseline':
        layers = [nn.Linear(hidden_size, proj_size),
                  nn.Linear(proj_size, out_size)]
        self.y_classifier = nn.Sequential(*layers)
    elif classifier == 'weighted_norm':
        layers = [weight_norm(nn.Linear(hidden_size, proj_size), dim=None),
                  nn.ReLU(),
                  weight_norm(nn.Linear(proj_size, out_size), dim=None)]
        self.y_classifier = nn.Sequential(*layers)
    elif classifier == 'logit':
        layers = [weight_norm(nn.Linear(hidden_size, proj_size), dim=None),
                  nn.ReLU(),
                  nn.Linear(proj_size, out_size)]
        self.classifier_txt = nn.Sequential(*layers)
        layers = [weight_norm(nn.Linear(hidden_size, 2048), dim=None),
                  nn.ReLU(),
                  nn.Linear(2048, out_size)]
        self.classifier_ft = nn.Sequential(*layers)
        if weights_init is not None:
            self.classifier_txt[0].weight.data.copy_(
                torch.from_numpy(weights_init['classifier_txt']))
            self.classifier_ft[0].weight.data.copy_(
                torch.from_numpy(weights_init['classifier_ft']))
print(tokenizer.decode(indexed_tokens_1))       # Who was Jim Henson ?
print(tokenizer.decode([8727, 373, 474, 320]))  # who was jim
print(tokenizer.decode([30963, 1559, 5633]))    # henson ?
print(tokenizer.decode([30963]))                # hen
print(tokenizer.decode([508, 8727, 373]))       # whowho was
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(text_1)  # AttributeError: 'GPT2Tokenizer' object has no attribute 'convert_tokens_to_ids'
indexed_tokens_2 = tokenizer.encode(text_2); print(indexed_tokens_2)  # [18050, 367, 19069, 373, 257, 13595, 14471, 263]

## Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1]); print(tokens_tensor_1)
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

##################################################################
## GPT2Model
## Load pre-trained model (weights)
model = GPT2Model.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/gpt2/')
model.eval()
# INFO:pytorch_pretrained_bert.modeling_gpt2:Model config {
#   "initializer_range": 0.02,
#   "layer_norm_epsilon": 1e-05,
#   "n_ctx": 1024,
#   "n_embd": 768,
#   "n_head": 12,
#   "n_layer": 12,
#   "n_positions": 1024,
#   "vocab_size": 50257
# }

## Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
        output = output.reshape(output.shape[0], -1, output.shape[1])
        output = np.swapaxes(output, 0, 1)
        list_output.append(output)

        # ====== Construct Cache ====== #
        temp_cache = {}
        for i, sent in enumerate(mini_batch):
            hask_key = hashlib.sha256(sent.encode()).hexdigest()
            temp_cache[hask_key] = output[i]
        self.cache.update(temp_cache)

        idx += mini_batch_size
        self.count += mini_batch_size

    output = np.concatenate(list_output, 0)
    te = time.time()
    print('encoding with model', len(sentences), 'processed', self.count,
          'took', '{:4.1f}'.format(te - ts))
    embedding = self.get_multi_head_embedding(output, heads, head_size)
    return embedding


if __name__ == '__main__':
    model = GPT2Model('bert-base-uncased')
    model.prepare('Length')
    model.construct_encoder()
def __init__(self, n_wlayers, n_slayers, in_size, out_size, embed_size,
             hidden_size, dropout=0.5, ignore_label=None, initialEmbW=None,
             independent=False, rnn_type='lstm', embedding_init=None,
             weights_init=None, elmo_init=False, elmo_num_outputs=1,
             finetune_elmo=False, bert_init=False, bert_model=None,
             finetune_bert=False, add_word_emb=True, pretrained_all=True,
             concat_his=False):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of hidden vectors to be output.
        embed_size (int): Dimensionality of word embedding.
        dropout (float): Dropout ratio.
    """
    super(HLSTMEncoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    if rnn_type == 'lstm':
        self.wlstm = nn.LSTM(embed_size, hidden_size, n_wlayers,
                             batch_first=True, dropout=dropout)
        self.slstm = nn.LSTM(hidden_size, out_size, n_slayers,
                             batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.wlstm = nn.GRU(embed_size, hidden_size, n_wlayers,
                            batch_first=True, dropout=dropout)
        self.slstm = nn.GRU(hidden_size, out_size, n_slayers,
                            batch_first=True, dropout=dropout)

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.pretrained_all = pretrained_all
    self.concat_his = concat_his
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb

    if pretrained_all and elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif pretrained_all and bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    self.independent = independent
    self.rnn_type = rnn_type
def __init__(self, n_layers, in_size, out_size, embed_size, dropout=0.5,
             initialEmbW=None, rnn_type='lstm', attention=None, q_size=-1,
             embedding_init=None, weights_init=None, elmo_init=False,
             elmo_num_outputs=1, finetune_elmo=False, bert_init=False,
             bert_model=None, finetune_bert=False, add_word_emb=True):
    """Initialize encoder with structure parameters

    Args:
        n_layers (int): Number of layers.
        in_size (int): Dimensionality of input vectors.
        out_size (int): Dimensionality of hidden vectors to be output.
        embed_size (int): Dimensionality of word embedding.
        dropout (float): Dropout ratio.
    """
    # TODO
    conv_out_size = 512
    super(LSTMEncoder, self).__init__()
    self.embed = nn.Embedding(in_size, embed_size)
    if embedding_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(embedding_init))
    elif weights_init is not None:
        self.embed.weight.data.copy_(torch.from_numpy(weights_init['embed']))

    self.elmo_init = elmo_init
    self.bert_init = bert_init
    self.bert_model = bert_model
    self.add_word_emb = add_word_emb

    if elmo_init:
        options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        self.elmo = Elmo(options_file, weight_file, elmo_num_outputs,
                         requires_grad=finetune_elmo)
        elmo_layer = [nn.Linear(elmo_num_outputs * 1024, out_size), nn.ReLU()]
        self.elmo_layer = nn.Sequential(*elmo_layer)
    elif bert_init:
        if 'bert' in bert_model:
            self.bert = BertModel.from_pretrained(bert_model)
        elif 'openai-gpt' in bert_model:
            self.bert = OpenAIGPTModel.from_pretrained(bert_model)
        elif 'gpt2' in bert_model:
            self.bert = GPT2Model.from_pretrained(bert_model)
        elif 'transfo-xl' in bert_model:
            self.bert = TransfoXLModel.from_pretrained(bert_model)
        self.finetune_bert = finetune_bert
        if not finetune_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        if bert_model in ['bert-base-uncased', 'openai-gpt', 'gpt2']:
            bert_in = 768
        elif bert_model in ['bert-large-uncased', 'gpt2-medium', 'transfo-xl-wt103']:
            bert_in = 1024
        bert_layer = [nn.Linear(bert_in, out_size), nn.ReLU()]
        self.bert_layer = nn.Sequential(*bert_layer)

    if rnn_type == 'lstm':
        self.lstm = nn.LSTM(embed_size, out_size, n_layers,
                            batch_first=True, dropout=dropout)
    elif rnn_type == 'gru':
        self.lstm = nn.GRU(embed_size, out_size, n_layers,
                           batch_first=True, dropout=dropout)

    self.attention = attention
    if attention == 'conv' or attention == 'conv_sum':
        conv_in_size = out_size
        self.conv1 = nn.Conv1d(in_channels=conv_in_size, out_channels=conv_out_size,
                               kernel_size=1, padding=0)
        self.conv2 = nn.Conv1d(in_channels=conv_out_size, out_channels=2,
                               kernel_size=1, padding=0)
        if weights_init is not None:
            self.conv1.weight.data.copy_(torch.from_numpy(weights_init['conv1']))
            self.conv2.weight.data.copy_(torch.from_numpy(weights_init['conv2']))
    elif attention == 'c_conv_sum':
        hidden_size = 512
        conv_hidden_size = 256
        layers = [weight_norm(nn.Linear(out_size, hidden_size), dim=None), nn.ReLU()]
        self.c_fa = nn.Sequential(*layers)
        layers = [weight_norm(nn.Linear(q_size, hidden_size), dim=None), nn.ReLU()]
        self.q_fa = nn.Sequential(*layers)
        layers = [nn.Conv2d(in_channels=hidden_size, out_channels=conv_hidden_size, kernel_size=1),
                  nn.ReLU(),
                  nn.Conv2d(in_channels=conv_hidden_size, out_channels=1, kernel_size=1)]
        self.cq_att = nn.Sequential(*layers)
        if weights_init is not None:
            self.c_fa[0].weight.data.copy_(torch.from_numpy(weights_init['c_fa']))
            self.q_fa[0].weight.data.copy_(torch.from_numpy(weights_init['q_fa']))
            self.cq_att[0].weight.data.copy_(torch.from_numpy(weights_init['cq_att_conv1']))
            self.cq_att[2].weight.data.copy_(torch.from_numpy(weights_init['cq_att_conv2']))
def main():
    """Main evaluation program."""
    print('Evaluate GPT2 model')

    # Disable CuDNN.
    torch.backends.cudnn.enabled = False

    # Timer.
    timers = Timers()

    # Arguments.
    args = get_args()

    # Pytorch distributed.
    initialize_distributed(args)

    # Random seeds for reproducibility.
    set_random_seed(args.seed)

    # Data stuff.
    eval_data = get_eval_data(args)

    # Model, optimizer, and learning rate.
    if args.eval_hf:
        from pytorch_pretrained_bert import GPT2LMHeadModel
        from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
        if args.num_layers == 24:
            model_path = args.load
            hfmodel = HFGPT2Model.from_pretrained(model_path, cache_dir='gpt2_weights',
                                                  from_tf=True).cuda()
            model = GPT2LMHeadModel(hfmodel.config)
            model.transformer.load_state_dict(hfmodel.state_dict())
            model.cuda()
        else:
            model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='gpt2_weights').cuda()
    else:
        if args.load_openai:
            from utils import move_weights
            model_path = args.load
            args.load = None
            model = setup_model(args)
            from pytorch_pretrained_bert import GPT2LMHeadModel
            from pytorch_pretrained_bert import GPT2Model as HFGPT2Model
            model_path = 'gpt2'
            from_tf = False
            print('loading openai weights')
            model.cpu()
            if args.num_layers == 24:
                hfmodel = HFGPT2Model.from_pretrained(model_path, cache_dir='gpt2_weights',
                                                      from_tf=True)
                gpt2model = GPT2LMHeadModel(hfmodel.config)
                gpt2model.transformer.load_state_dict(hfmodel.state_dict())
            else:
                gpt2model = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir='gpt2_weights')
            model2fill = model
            while isinstance(model2fill, (DDP, FP16_Module)):
                model2fill = model2fill.module
            move_weights(model2fill, gpt2model)
            model.cuda()
        else:
            model = setup_model(args)

    # Run on test data.
    prefix = "wiki"  # os.path.basename(args.valid_data)
    evaluate_and_print_results(prefix, eval_data, model, args, timers)