def __init__(self, embedding_dir, model_name="bert-base-multilingual-cased", layer=-2):
    super(BertEncoder, self).__init__(embedding_dir)
    # Load the pre-trained model weights and set to evaluation mode (no further training)
    self.model = BertModel.from_pretrained(model_name)
    self.model.eval()
    # Load the WordPiece tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    # Layer from which to extract the embeddings
    self.layer = layer

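# A minimal instantiation sketch for BertEncoder above; 'embeddings/' is a
# hypothetical path, and the (unshown) parent class is assumed to accept it:
encoder = BertEncoder(embedding_dir='embeddings/', layer=-2)
print(encoder.tokenizer.tokenize("hello world"))  # WordPiece tokens
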
def __init__(self, tag_size, top_rnns=False, device='cpu', finetuning=False):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-chinese')
    self.top_rnns = top_rnns
    if top_rnns:
        self.rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768,
                           hidden_size=768 // 2, batch_first=True)
    self.fc = nn.Linear(768, tag_size)
    self.device = device
    self.finetuning = finetuning

def __init__(self, config, static=False):
    super(TextCNN, self).__init__()
    model_bert = BertModel.from_pretrained('bert-base-uncased')
    pre_trained_embed = model_bert.embeddings.word_embeddings.weight
    D = pre_trained_embed.shape[1]   # embedding dimension
    C = config.hidden_size           # output size
    Ci = 1                           # input channels
    Co = config.cnn_kernel_num       # number of kernels per size
    Ks = config.cnn_kernel_sizes     # kernel sizes
    self.embed = nn.Embedding.from_pretrained(pre_trained_embed)
    self.convs = nn.ModuleList([nn.Conv2d(Ci, Co, (K, D)) for K in Ks])
    self.dropout = nn.Dropout(config.cnn_dropout_prob)
    self.fc1 = nn.Linear(len(Ks) * Co, C)
    if static:
        self.embed.weight.requires_grad = False

def __init__(self, top_rnns=False, vocab_size=None, device='cpu', finetuning=False):
    super().__init__()
    self.bert = BertModel.from_pretrained(config.Config.bert_model)
    self.top_rnns = top_rnns
    if top_rnns:
        self.rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768,
                           hidden_size=768 // 2, batch_first=True)  # e.g. [128, 74, 768]
    self.fc = nn.Linear(768, vocab_size)
    self.device = device
    self.finetuning = finetuning

def main():
    args = parser.parse_args()
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    model = BertModel.from_pretrained('bert-large-uncased')
    model.eval().cuda(args.gpu)
    datadir = args.data
    pir_dataset = product_image_retrieval(datadir, args.set)
    loader = torch.utils.data.DataLoader(pir_dataset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=args.workers,
                                         pin_memory=True,
                                         drop_last=False)
    length_dir = len(pd.read_csv(os.path.join(datadir, 'splitted', args.set) + '.csv'))
    feats = np.zeros((length_dir, 1024))  # bert-large hidden size is 1024
    for i, (texts, index) in tqdm(enumerate(loader)):
        tokens_lst = []
        for text in texts:
            tokenized_text = tokenizer.tokenize(text)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            tokens_lst.append(indexed_tokens)
        # Note: torch.tensor() requires every sequence in the batch to have the
        # same length, so inputs must be padded or truncated upstream.
        tokens_tensor = torch.tensor(tokens_lst).cuda(args.gpu, non_blocking=True)
        feats_batch = model(tokens_tensor, output_all_encoded_layers=False)[1]  # pooled output
        feats_batch = feats_batch.detach().cpu().numpy()
        for idx in range(len(index)):
            feats[index[idx], :] = feats_batch[idx, :]
    np.save('expr/features_bert' + args.name + '_' + args.set + '.npy', feats)

def __init__(self, pretrained_model_name_or_path, num_labels, Encoder1, is_lock=False):
    super(ClassifyModel, self).__init__()
    self.bert = BertModel.from_pretrained(pretrained_model_name_or_path)
    self.Encoder = Encoder1
    self.classifier = nn.Linear(768, num_labels)
    self.init_mBloSA()
    self.s2tSA = s2tSA(768)
    if is_lock:
        # Freeze all BERT parameters except the pooler
        for name, param in self.bert.named_parameters():
            if not name.startswith('pooler'):
                param.requires_grad_(False)

def __init__(self, tag_to_ix, hidden_dim=768):
    super(Bert_BiLSTM_CRF, self).__init__()
    self.tag_to_ix = tag_to_ix
    self.tagset_size = len(tag_to_ix)
    # self.hidden = self.init_hidden()
    self.lstm = nn.LSTM(bidirectional=True, num_layers=2, input_size=768,
                        hidden_size=hidden_dim // 2, batch_first=True)
    self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))
    self.hidden_dim = hidden_dim
    self.start_label_id = self.tag_to_ix['[CLS]']
    self.end_label_id = self.tag_to_ix['[SEP]']
    self.fc = nn.Linear(hidden_dim, self.tagset_size)
    self.bert = BertModel.from_pretrained('/root/workspace/qa_project/chinese_L-12_H-768_A-12')
    # self.bert.eval()  # only used to extract BERT embeddings
    # Forbid transitions into [CLS] and out of [SEP]
    self.transitions.data[self.start_label_id, :] = -10000
    self.transitions.data[:, self.end_label_id] = -10000
    self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def __init__(self, max_seq_length, max_q_length, max_a_length, embedding_dim=768,
             prev_history=2, use_gpu=False, bert_model=None):
    super(BaseModel, self).__init__()
    self.bert_model = BertModel.from_pretrained('bert-base-uncased')
    if bert_model is not None:
        self.bert_model.load_state_dict(bert_model)
    # Freeze everything except the top encoder layer
    for name, param in self.bert_model.named_parameters():
        if "encoder.layer.11" not in name:
            param.requires_grad = False
    self.max_seq_length = max_seq_length
    self.max_q_length = max_q_length
    self.max_a_length = max_a_length
    self.embedding_dim = embedding_dim
    self.prev_history = prev_history
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.bi_gru_layer1 = torch.nn.GRU((prev_history * 2 + 1) * embedding_dim,
                                      embedding_dim, batch_first=True, bidirectional=True)
    self.linear_start = torch.nn.Linear((prev_history * 2 + 3) * embedding_dim, 1)
    self.softmax = torch.nn.Softmax(dim=1)
    self.bi_gru_layer2 = torch.nn.GRU(2 * embedding_dim, embedding_dim,
                                      batch_first=True, bidirectional=True)
    self.linear_end = torch.nn.Linear((prev_history * 2 + 3) * embedding_dim, 1)
    self.answer_type_layer = torch.nn.Linear((prev_history * 2 + 3) * embedding_dim, 3)
    self.CUDA = torch.cuda.is_available() and use_gpu

def BERT(textA, textB):
    # Load the pre-trained tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    # Tokenize the input
    tokenized_textA = tokenizer.tokenize(textA)
    tokenized_textB = tokenizer.tokenize(textB)
    # Mask the '$' placeholder token; BertForMaskedLM will predict it back
    masked_index = len(tokenized_textA) + tokenized_textB.index('$')
    tokenized_text = tokenized_textA + tokenized_textB
    tokenized_text[masked_index] = '[MASK]'
    # Convert tokens to vocabulary indices
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # Sentence A and B segment indices (see the BERT paper)
    segments_ids = [0] * len(tokenized_textA) + [1] * len(tokenized_textB)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    # Load the pre-trained model (weights)
    model = BertModel.from_pretrained('bert-large-uncased')
    model.eval()
    # Hidden states for each layer (computed here but unused below;
    # bert-large-uncased has 24 encoder layers)
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    # Load the pre-trained masked-LM model and predict all tokens
    model = BertForMaskedLM.from_pretrained('bert-large-uncased')
    model.eval()
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    # Pick the highest-scoring token at the masked position
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    return predicted_token

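# A usage sketch for BERT() above (the sentences are illustrative, not from the
# source; textB must contain a '$' placeholder marking the token to recover):
predicted = BERT("who was jim henson ?", "jim $ was a puppeteer")
print(predicted)  # BERT's guess for the masked position, e.g. 'henson'
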
def customEmbeddingTest():
    global sortedWordSents
    bertModel = BertModel.from_pretrained('/media/yuan/Samsung_T5/Documents/BERT/bert-base-chinese')
    bertModel.eval()
    tokenizedSents = [tokenizer.tokenize(sent) for sent in origWordSents]
    origSentsLens = [len(sent) for sent in tokenizedSents]
    tokenizedSents = [["[CLS]"] + sent + ["[SEP]"] for sent in tokenizedSents]
    targetedWordIdxs = [sent.index('还') for sent in tokenizedSents]
    wordIdxs = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenizedSents]
    maxLen = max(len(sent) for sent in wordIdxs)
    # Zero-pad to the longest sentence and mask out the padding
    paddedInputIds = [sent + [0] * (maxLen - len(sent)) for sent in wordIdxs]
    paddedInputIds = torch.tensor(paddedInputIds)
    attentionMask = torch.tensor([[float(i > 0) for i in sent] for sent in paddedInputIds])
    with torch.no_grad():
        allLayerEmbeds, _ = bertModel(paddedInputIds, attention_mask=attentionMask,
                                      output_all_encoded_layers=True)
        # Concatenate the last four layers for each token
        concatSentEmbeds = []
        layerIdxs = [-1, -2, -3, -4]
        for sentIdx, sent in enumerate(tokenizedSents):
            selectedLayersForSent = []
            for tokenIdx in range(maxLen):
                # Skip [CLS] (index 0) and [SEP] (index length + 1)
                if tokenIdx == 0 or tokenIdx == origSentsLens[sentIdx] + 1:
                    continue
                selectedLayersForToken = []
                for layerIdx in layerIdxs:
                    layerEmbeds = allLayerEmbeds[layerIdx].detach().cpu()[sentIdx]
                    selectedLayersForToken.append(layerEmbeds[tokenIdx])
                selectedLayersForSent.append(torch.cat(selectedLayersForToken))
            concatSentEmbeds.append(torch.stack(selectedLayersForSent))
    return torch.stack(concatSentEmbeds)

def prepare(params, samples):
    if params['cache'] is None:  # check whether a cache was already provided
        params['cache'] = load_cache(params.model_name, params.current_task,
                                     params.cache_path)  # try to load the cache
    if params['cache'] is None:  # no saved cache, so construct the encoder model
        print("Constructing Encoder Model")
        params['cache'] = {}
        # ====== Construct Model ====== #
        model = BertModel.from_pretrained(args.model_name)
        model = torch.nn.DataParallel(model)
        tokenizer = BertTokenizer.from_pretrained(args.model_name, do_lower_case=True)
        params['model'] = model
        params['tokenizer'] = tokenizer
        params['flag_save'] = True
    # ====== Initialize Counter ====== #
    params['count'] = 0

def __init__(self, opt):
    super(bert_att_mis, self).__init__()
    self.opt = opt
    self.model_name = 'bert_att_mis'
    self.test_scale_p = 0.5
    self.bert_model = BertModel.from_pretrained(opt.bert_path)
    self.bert_model.cuda()
    self.bags_feature = []
    rel_dim = opt.rel_dim
    # Relation embedding matrix and per-relation bias
    self.rel_embs = nn.Parameter(torch.randn(self.opt.rel_num, rel_dim))
    self.rel_bias = nn.Parameter(torch.randn(self.opt.rel_num))
    self.dropout = nn.Dropout(self.opt.drop_out)
    self.init_model_weight()

def __init__(self):
    super(Model4, self).__init__()
    self.bert = BertModel.from_pretrained(
        r'D:\bert_weight_Chinese\chinese_L-12_H-768_A-12\bert-base-chinese.tar')
    for param in self.bert.parameters():
        param.requires_grad = False
    # Conv2d params: input channels, output channels (number of filters),
    # kernel size (H, W), stride
    self.conv1 = nn.Conv2d(1, 100, kernel_size=(1, 16 * 768), stride=1)
    self.conv2 = nn.Conv2d(1, 100, kernel_size=(2, 16 * 768), stride=1)
    self.conv3 = nn.Conv2d(1, 100, kernel_size=(3, 16 * 768), stride=1)
    self.conv4 = nn.Conv2d(1, 100, kernel_size=(4, 16 * 768), stride=1)
    self.conv5 = nn.Conv2d(1, 100, kernel_size=(5, 16 * 768), stride=1)
    self.dp1 = nn.Dropout(0.1)
    self.dense1 = nn.Linear(500 * 4, 500 * 2)
    self.dense2 = nn.Linear(500 * 2, 200)
    self.dense3 = nn.Linear(200, 2)

def vectorize(self, sentence):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    marked_text = "[CLS] " + sentence + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    segments_ids = [1] * len(tokenized_text)  # single-sentence input
    segments_tensors = torch.tensor([segments_ids])
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    model = BertModel.from_pretrained('bert-base-uncased')
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    # Mean-pool the last encoder layer (layer 12 of bert-base, index 11)
    sentence_embedding = torch.mean(encoded_layers[11], 1)
    return sentence_embedding

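# A usage sketch: comparing two sentences via cosine similarity of the
# embeddings returned by vectorize(); `encoder` stands in for an instance of
# the (unnamed) class that defines the method above.
emb_a = encoder.vectorize("the cat sat on the mat")
emb_b = encoder.vectorize("a cat was sitting on a mat")
print(torch.nn.functional.cosine_similarity(emb_a, emb_b))  # tensor of shape [1]
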
def BERT_initializer(line1):
    tokenized_text = tokenizer.tokenize(line1)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1] * len(tokenized_text)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    model = BertModel.from_pretrained('bert-base-multilingual-cased')
    # Put the model in evaluation mode (feed-forward only)
    model.eval()
    # Predict the hidden states for each layer
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
    token_embedding = token_embeddings(encoded_layers)
    return token_embedding, tokenized_text, segments_ids

def query_encow(query_id, sentences, N=5000):
    # GPU available?
    CUDA = torch.cuda.is_available()
    # Initialize the BERT model
    print(f"Initializing BERT model {'with' if CUDA else 'without'} CUDA...", end='')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    if CUDA:
        model = model.to('cuda')
    model.eval()
    print(" OK.")
    aggregated_pairs = []
    for i in range(80):
        vecs, sents = read_pickle(f'encow/encow_sent.txt.{str(i).zfill(3)}')
        aggregated_pairs += query(model, tokenizer, vecs, sents, sentences, CUDA)
        print("Querying " + f'({i + 1}/80)' + ('.' * ((i % 3) + 1)) + ' ', end='\r')
    # sent[6:-6] drops the leading/trailing 6 characters (likely '[CLS] ' and ' [SEP]')
    return query_id, sorted([(sim, sent[6:-6]) for sim, sent in aggregated_pairs],
                            reverse=True, key=lambda x: x[0])[:N]

def main():
    df = pd.read_pickle("../data/contextual_similarity_df")
    print("Initiating CURTIS!...")
    pretrained_model = 'bert-base-uncased'
    tokenizer = initialize_tokenizer(pretrained_model)
    model = BertModel.from_pretrained(pretrained_model)
    model.eval()  # feed-forward only
    print("CURTIS: Hey Kaushik, I am happy to have you here again!")
    _ = input("CURTIS: How are you?\nKaushik: ")
    user_question = input("CURTIS: You are having a rough time \n"
                          "CURTIS: What makes you feel like this?\nKaushik: ")
    user_question_context = input("CURTIS: Please provide more context for your problem\nKaushik: ")
    user_vec = get_contextual_vector(model, tokenizer, user_question, user_question_context)
    for i in range(df.shape[0]):
        try:
            cs_dist = cosine(user_vec, df.loc[i, "contextual_vector"])
        except Exception:
            cs_dist = 1  # maximal distance if the stored vector is unusable
        df.loc[i, "similarity_score"] = 1 - cs_dist
    print("CURTIS:", df[df.similarity_score == df.similarity_score.max()].reset_index().reflection[0])

def define_module(self):
    self.encoder = BertModel.from_pretrained('bert-base-uncased')
    for param in self.encoder.parameters():
        param.requires_grad = False
    self.bert_linear = nn.Linear(768, self.ninput)
    self.drop = nn.Dropout(self.drop_prob)
    if self.rnn_type == 'LSTM':
        # dropout: if non-zero, introduces a dropout layer on the
        # outputs of each RNN layer except the last layer
        self.rnn = nn.LSTM(self.ninput, self.nhidden, self.nlayers,
                           batch_first=True, dropout=self.drop_prob,
                           bidirectional=self.bidirectional)
    elif self.rnn_type == 'GRU':
        self.rnn = nn.GRU(self.ninput, self.nhidden, self.nlayers,
                          batch_first=True, dropout=self.drop_prob,
                          bidirectional=self.bidirectional)
    else:
        raise NotImplementedError

def __init__(self):
    with open('data/sent_example.pickle', 'rb') as handle:
        self.sent_example_map = pickle.load(handle)
    self.target_embedding_map = {}
    self.wikilinks_embedding_map = {}
    self.target_output_embedding_map = {}
    self.wikilinks_output_embedding_map = {}
    self.stop_sign = "STOP_SIGN_SIGNAL"
    self.db_loaded = False
    self.load_sqlite_db('data/bert_cache_2.db')
    self.server_mode = False
    # Load the pre-trained tokenizer and model weights
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertModel.from_pretrained('bert-base-uncased')
    self.model.eval()
    # Put everything on CUDA when a GPU is available
    if torch.cuda.is_available():
        self.model.to('cuda')

def __init__(self, dropout, output_dim):
    """
    Args:
        dropout: Dropout probability
        output_dim: Output classes (Subtask A: 2 = (OFF, NOT))
    """
    super(BertPooling, self).__init__()
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    for param in self.bert.parameters():
        param.requires_grad = False
    self.classifier = nn.Linear(768, output_dim)
    self.dropout = nn.Dropout(dropout)
    nn.init.xavier_normal_(self.classifier.weight)

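# Instantiation sketch: frozen BERT with a single linear head for the binary
# OFF/NOT task (the dropout value here is illustrative, not from the source):
model = BertPooling(dropout=0.3, output_dim=2)
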
def __init__(self, top_rnns=False, vocab_size=None, device='cpu', finetuning=False):
    super().__init__()
    self.bert = BertModel.from_pretrained(
        '/root/workspace/qa_project/chinese_L-12_H-768_A-12')
    self.top_rnns = top_rnns
    if top_rnns:
        self.rnn = nn.LSTM(bidirectional=True, num_layers=2, input_size=768,
                           hidden_size=768 // 2, batch_first=True)  # e.g. [128, 74, 768]
    self.fc = nn.Linear(768, vocab_size)
    self.device = device
    self.finetuning = finetuning

def __init__(self, opt):
    self.opt = opt
    if 'bert' in opt.model_name:
        tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        bert = BertModel.from_pretrained(opt.pretrained_bert_name)
        self.model = opt.model_class(bert, opt).to(opt.device)
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer)
    if self.opt.do_predict:
        self.predictset = ABSADataset(opt.dataset_file['predict'], tokenizer)
    assert 0 <= opt.valset_ratio < 1
    if opt.valset_ratio > 0:
        valset_len = int(len(self.trainset) * opt.valset_ratio)
        self.trainset, self.valset = random_split(
            self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        self.valset = self.testset
        # tmp setting
        self.testset = self.valset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    self._print_args()

def __init__(self, emb_size, hidden_size, vocab_size, nlayers=1,
             bidirectional=True, rec_unit='LSTM', dropout=0.5):
    """
    Based on https://github.com/komiya-m/MirrorGAN/blob/master/model.py
    :param emb_size: size of word embeddings
    :param hidden_size: size of hidden state of the recurrent unit
    :param vocab_size: size of the vocabulary (output of the network)
    :param rec_unit: type of recurrent unit (default='LSTM')
    """
    self.dropout = dropout
    self.nlayers = nlayers
    self.bidirectional = bidirectional
    self.num_directions = 2 if self.bidirectional else 1
    __rec_units = {
        'GRU': nn.GRU,
        'LSTM': nn.LSTM,
    }
    assert rec_unit in __rec_units, 'Specified recurrent unit is not available'
    super().__init__(emb_size)
    self.hidden_linear = nn.Linear(emb_size, hidden_size)
    self.encoder = BertModel.from_pretrained('bert-base-uncased')
    for param in self.encoder.parameters():
        param.requires_grad = False
    self.bert_linear = nn.Linear(768, emb_size)
    self.rnn = __rec_units[rec_unit](emb_size, hidden_size, num_layers=self.nlayers,
                                     batch_first=True, dropout=self.dropout,
                                     bidirectional=self.bidirectional)
    self.out = nn.Linear(self.num_directions * hidden_size, vocab_size)

def __init__(self, model_path='bert-base-uncased', length=None, cased=False):
    self.length = length
    self.cased = cased
    if cased:
        model_path = 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    else:
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertForMaskedLM.from_pretrained(model_path)
    self.base_model = BertModel.from_pretrained(model_path)
    self.model.eval()
    self.base_model.eval()
    self.vocab = dict(self.tokenizer.vocab)
    # Ids of stop words and punctuation present in the BERT vocabulary
    sw_vocab = stop_words.intersection(set(self.vocab.keys()))
    self.sw_indecies = self.tokenizer.convert_tokens_to_ids(list(sw_vocab))
    puncs = list(string.punctuation)
    self.puncs_indecies = self.tokenizer.convert_tokens_to_ids(puncs)

def __init__(self, model_name, layer, use_cache=False):
    super().__init__()
    # Reuse a module-level BERT instance if one was already constructed
    if 'bert' in globals():
        self.bert = globals()['bert']
    else:
        self.bert = BertModel.from_pretrained(model_name)
        globals()['bert'] = self.bert
    for p in self.bert.parameters():
        p.requires_grad = False
    self.layer = layer
    n_layer = 24 if 'large' in model_name else 12
    if self.layer == 'weighted_sum':
        self.weights = nn.Parameter(torch.ones(n_layer, dtype=torch.float))
        self.softmax = nn.Softmax(0)
    self._cache = {} if use_cache else None

def load_bert(base_version=True, lower_case=True, device=None):
    if base_version:
        embedding_dim = BERT_BASE_EMBEDDING_DIM
        bert_name = 'bert-base-uncased' if lower_case else 'bert-base-cased'
    else:
        embedding_dim = BERT_LARGE_EMBEDDING_DIM
        bert_name = 'bert-large-uncased' if lower_case else 'bert-large-cased'
    # Load the pre-trained tokenizer (vocabulary) and model weights
    tokenizer = BertTokenizer.from_pretrained(bert_name)
    model = BertModel.from_pretrained(bert_name)
    return tokenizer, model, device, embedding_dim

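# Usage sketch for load_bert(); BERT_BASE_EMBEDDING_DIM and
# BERT_LARGE_EMBEDDING_DIM are assumed to be defined in the surrounding module:
tokenizer, model, device, dim = load_bert(base_version=True, lower_case=True)
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("bert embeddings"))
with torch.no_grad():
    encoded_layers, pooled = model(torch.tensor([ids]))  # old pytorch_pretrained_bert API
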
def __init__(self, config):
    super().__init__()
    self.batchSize = config['model']['batchSize']
    self.dropout = nn.Dropout(config['model']['dropout'])
    self.device = config['DEVICE']
    # Number of selected feature (kernel) sizes
    self.featureLen = config['model']['featureLen']
    self.hiddenSize = config['model']['hiddenSize']
    self.embeddingSize = 768
    self.positionEncoding = PositionalEncoding(self.embeddingSize, dropout=0.1)
    self.bertModel = BertModel.from_pretrained(config['model']['bert_base_chinese'])
    self.layer = nn.TransformerEncoderLayer(d_model=self.embeddingSize, nhead=4)
    self.encoder = nn.TransformerEncoder(self.layer, num_layers=2)
    self.cnnArr = nn.ModuleList([
        nn.Conv2d(in_channels=1,
                  out_channels=self.hiddenSize // self.featureLen,
                  kernel_size=(i, self.embeddingSize))
        for i in range(2, 2 + self.featureLen)
    ])
    self.fc = nn.Linear(self.hiddenSize, len(tagDict))

def __init__(self, config, ft, num_labels, H, device_num, C, c_adj, alpha):
    super(BertGCN_Cluster, self).__init__(config)
    self.device = torch.device('cuda:' + device_num)
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(0.5)
    self.ft = ft
    self.alpha = alpha
    self.H = get_tensor(H, self.device)                            # m x 3072
    self.C = get_tensor(C, self.device)                            # m x C
    self.c_adj = gen_adj(get_tensor(c_adj, self.device)).detach()  # C x C
    self.num_labels = num_labels
    self.FCN = nn.Linear(768, num_labels)
    self.FCN_gcn = nn.Linear(768, 768)
    self.FCN_H = nn.Linear(H.shape[1], 768)
    self.actv = nn.LeakyReLU(0.2)
    # self.actv = nn.Tanh()
    self.softmax = nn.Softmax(dim=1)
    self.apply(self.init_bert_weights)
    self.W1 = Parameter(torch.Tensor(H.shape[1], 1536))
    self.W2 = Parameter(torch.Tensor(1536, 768))

def main():
    path = os.path.join("data", "LJSpeech-1.1")
    preprocess_ljspeech(path)
    model_bert = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    text_path = os.path.join(hp.dataset_path, "train.txt")
    texts = process_text(text_path)
    if not os.path.exists(hp.bert_embeddings_path):
        os.mkdir(hp.bert_embeddings_path)
    for ind, text in enumerate(texts):
        character = text[0:len(text) - 1]  # drop the trailing character (e.g. newline)
        bert_embedding = get_embedding(character, model_bert, tokenizer)
        np.save(os.path.join(hp.bert_embeddings_path, str(ind) + ".npy"),
                bert_embedding.numpy(), allow_pickle=False)
        if (ind + 1) % 100 == 0:
            print("Done", (ind + 1))

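# `get_embedding` is defined elsewhere in this repo; a plausible minimal sketch
# (an assumption, not the repo's actual implementation) that mean-pools the
# final encoder layer of the old pytorch_pretrained_bert API:
def get_embedding_sketch(text, model, tokenizer):
    tokens = ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]']
    ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    with torch.no_grad():
        encoded_layers, _ = model(ids, output_all_encoded_layers=True)
    return torch.mean(encoded_layers[-1], dim=1).squeeze(0)  # shape [768]
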
def __init__(self, trigger_size=None, entity_size=None, all_postags=None,
             postag_embedding_dim=50, argument_size=None, entity_embedding_dim=50,
             device=torch.device("cpu")):
    super().__init__()
    # self.bert = BertModel.from_pretrained('bert-base-cased')
    self.bert = BertModel.from_pretrained(bert_model_path)
    # hidden_size = 768 + entity_embedding_dim + postag_embedding_dim
    # hidden_size = 768
    hidden_size = 768 * 3
    self.fc1 = nn.Sequential(
        # nn.Dropout(0.5),
        nn.Linear(hidden_size, hidden_size, bias=True),
        nn.ReLU(),
    )
    self.fc_trigger = nn.Sequential(
        nn.Linear(hidden_size, trigger_size),
    )
    self.fc_argument = nn.Sequential(
        nn.Linear(hidden_size * 2, argument_size),
    )
    self.device = device

def __init__(self, emb_size, hidden_size, out_rel, n_layers=1, dropout=0.1,
             emb_drop=0.2, gpu=False, pretrained_emb=None, train_emb=True):
    super(EncoderRNN, self).__init__()
    self.gpu = gpu
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.dropout = dropout
    self.use_cuda = gpu
    self.emb_size = emb_size
    # BERT replaces the original nn.Embedding lookup:
    # self.embedding = nn.Embedding(vocab_size, emb_size, padding_idx=0)
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.entity_classifier = nn.Linear(emb_size, out_rel)
    self.embedding_dropout = nn.Dropout(emb_drop)
    self.rnn = nn.LSTM(emb_size, hidden_size, n_layers, dropout=self.dropout,
                       batch_first=True)
    # The branches below only apply if the commented-out nn.Embedding is restored;
    # with BERT as the encoder, self.embedding does not exist and they would fail.
    if pretrained_emb is not None:
        self.embedding.weight.data.copy_(pretrained_emb)
    if not train_emb:
        self.embedding.weight.requires_grad = False

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(tokenizer.convert_tokens_to_ids('hello'))      # [1044, 1041, 1048, 1048, 1051] - a bare string is treated as a list of characters
print(tokenizer.convert_tokens_to_ids(['hello']))    # [7592]
print(tokenizer.convert_tokens_to_ids(['[hello]']))  # KeyError: '[hello]'; cannot deal with OOV
print(indexed_tokens)
# [101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 103, 2001, 1037, 13997, 11510, 102]

## Define sentence A and B indices associated with the 1st and 2nd sentences (see the paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]  # tokenized_text splits into two sentences: the first 7 tokens and the last 7

##################################################################
## BertModel
## Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]); print(tokens_tensor.shape)  # torch.Size([1, 14])
segments_tensors = torch.tensor([segments_ids])

## Load pre-trained model (weights)
model = BertModel.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()

## Predict hidden states features for each layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
## We get one hidden state per layer: 24 layers for bert-large-uncased
print(len(encoded_layers))      # 24
print(encoded_layers[0].shape)  # torch.Size([1, 14, 1024])

##################################################################
## BertForMaskedLM
model = BertForMaskedLM.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
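
# A continuation of the walkthrough above: use the freshly loaded
# BertForMaskedLM to predict the masked token (id 103 at position 8):
model.eval()
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)  # [1, 14, vocab_size]
predicted_index = torch.argmax(predictions[0, 8]).item()
print(tokenizer.convert_ids_to_tokens([predicted_index])[0])  # expected: 'henson'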