def mine_triples(device, input_file, output_file, use_local_model=False):
    """Score candidate triples from *input_file* and write ranked results.

    Loads a BERT masked-LM and a GPT-2 LM (from local checkpoints when
    *use_local_model* is true, otherwise from the module-level `bert_model`
    / `gpt2_model` names), scores every candidate with the 'coherency'
    (EnumeratedTemplate) miner, and writes tab-separated lines
    ``relation<TAB>head<TAB>tail<TAB>score`` to *output_file*, sorted by
    descending score. Relation names are lower-cased on output.
    """
    if use_local_model:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained("../models/BertForMaskedLM")
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained("../models/GPT2LMHeadModel")
    else:
        print('loading BERT...')
        bert = BertForMaskedLM.from_pretrained(bert_model)
        print('loading GPT2...')
        gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    # NOTE(review): the string literal below is disabled code kept for
    # reference — alternative mining strategies that are not run here.
    """ 'concat': KnowledgeMiner( os.path.join(data_repo, candidate_file), device, DirectTemplate, bert ), 'template': KnowledgeMiner( os.path.join(data_repo, candidate_file), device, PredefinedTemplate, bert, grammar=False, template_loc=os.path.join(template_repo, single_templates) ), 'template_grammar': KnowledgeMiner( os.path.join(data_repo, candidate_file), device, PredefinedTemplate, bert, grammar=True, template_loc=os.path.join(template_repo, single_templates) ), """
    knowledge_miners = {
        'coherency': KnowledgeMiner(input_file, device, EnumeratedTemplate,
                                    bert, language_model=gpt,
                                    template_loc=os.path.join(template_repo, multiple_templates),
                                    use_local_model=use_local_model)
    }
    for template_type in knowledge_miners.keys():
        predictions = run_experiment(template_type, knowledge_miners)
        # Candidate triples are kept on the miner; predictions align by index.
        triples = knowledge_miners[template_type].sentences.tuples
        scored_samples = list(zip(triples, predictions))
        scored_samples.sort(key=lambda x: x[1], reverse=True)
        with open(output_file, "w") as f:
            for triple, pred in scored_samples:
                rel, head, tail = triple
                triple = (rel.lower(), head, tail)
                f.write("\t".join(triple) + "\t" + "{:.5f}".format(pred))
                f.write("\n")
class Classifier(torch.nn.Module):
    """Sequence classifier: frozen locally fine-tuned BERT embeddings fed
    through an LSTM and a linear projection."""

    def __init__(self, hidden_size=768, linear_out=2, batch_first=True):
        """Load the 'lm/' checkpoint and build LSTM(hidden_size -> 300) +
        Linear(300 -> linear_out). `batch_first` is accepted but unused."""
        super(Classifier, self).__init__()
        # Paths of the locally fine-tuned masked-LM checkpoint.
        self.output_model_file = "lm/pytorch_model.bin"
        self.output_config_file = "lm/config.json"
        self.tokenizer = BertTokenizer.from_pretrained("lm", do_lower_case=False)
        self.config = BertConfig.from_json_file(self.output_config_file)
        self.model = BertForMaskedLM(self.config)
        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        # map_location lets CPU-only machines load a GPU-saved checkpoint.
        self.state_dict = torch.load(self.output_model_file, map_location=device)
        self.model.load_state_dict(self.state_dict)
        self.lstm = torch.nn.LSTM(hidden_size, 300)
        self.linear = torch.nn.Linear(300, linear_out)

    def get_embeddings(self, x_instance):
        """Return per-token embeddings for one encoded sentence pair:
        the mean of the last four BERT layers per token.

        x_instance: 1-D tensor of token ids containing a [SEP] (id 102)
        that separates segment 0 from segment 1.
        """
        indexed_tokens = x_instance.tolist()
        # 102 is BERT's [SEP] id; everything up to and including it is segment 0.
        break_sentence = indexed_tokens.index(102)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_ids = [0] * (break_sentence + 1)
        segments_ids += [1] * (len(indexed_tokens) - break_sentence - 1)
        segments_tensors = torch.tensor([segments_ids])
        self.model.eval()
        with torch.no_grad():
            # NOTE(review): `device` here is a module-level global, not the
            # local created in __init__ — confirm it is defined at import time.
            encoded_layers, _ = self.model.bert(tokens_tensor.to(device),
                                                segments_tensors.to(device))
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        # Reorder to (seq_len, num_layers, hidden) so we iterate tokens.
        token_embeddings = token_embeddings.permute(1, 0, 2)
        token_vecs_cat = []
        for token in token_embeddings:
            # Average the last four layers for this token.
            cat_vec = torch.stack((token[-1], token[-2], token[-3], token[-4]))
            mean_vec = torch.mean(cat_vec, 0)
            token_vecs_cat.append(mean_vec)
        token_vecs_cat = torch.stack(token_vecs_cat, dim=0)
        return token_vecs_cat

    def embed_data(self, x):
        """Embed each instance in *x*; returns a (batch, seq, hidden) tensor.

        Assumes all instances in *x* have equal length — TODO confirm,
        otherwise torch.stack raises.
        """
        entries = []
        for entry in x:
            emb = self.get_embeddings(entry.to(device)).to(device)
            entries.append(emb)
        return torch.stack(entries)

    def forward(self, x):
        """Classify a batch: embed, run LSTM (seq-first), project, and
        return predictions permuted back to batch-first."""
        h = self.embed_data(x)
        # LSTM default expects (seq, batch, features).
        h = h.permute(1, 0, 2)
        output, _ = self.lstm(h)
        pred = self.linear(output)
        pred = pred.permute(1, 0, 2)
        return pred
def create_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Build a BertForMaskedLM from *config* in eval mode and return a dict
    with its masked-LM loss (labels supplied) and raw prediction scores.

    `sequence_labels` and `choice_labels` are accepted for signature
    parity with sibling helpers but are not used here.
    """
    model = BertForMaskedLM(config=config)
    model.eval()
    # Dict values are evaluated in order: loss first, then scores.
    return {
        "loss": model(input_ids, token_type_ids, input_mask, token_labels),
        "prediction_scores": model(input_ids, token_type_ids, input_mask),
    }
def get_words_for_blank_slow_decode(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer):
    """Fill every '_' blank in *text* greedily, one blank per BERT pass.

    Each iteration scores all remaining [MASK] positions and commits the
    single most confident prediction, then re-scores the rest.

    Returns (predicted_sentence, top_words_all): the detokenized sentence
    and, for each blank in fill order, its top-20 (token, score) pairs.

    Fixes vs. original: removed a dead `mask_preds = predictions[0,
    mask_positions, :]` assignment that was immediately shadowed inside the
    loop, and hoisted `model.eval()` out of the decode loop (it only needs
    to be set once).
    """
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    mask_positions = []
    tokenized_text = tokenizer.tokenize(text)
    top_words_all = []
    for i in range(len(tokenized_text)):
        if tokenized_text[i] == '_':
            tokenized_text[i] = '[MASK]'
            mask_positions.append(i)
    model.eval()
    while mask_positions:
        top_words = []
        # Convert tokens to vocab indices
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([token_ids])
        # Call BERT to calculate unnormalized probabilities for all pos
        predictions = model(tokens_tensor)  # get predictions
        candidates = []  # (word, prob) — best candidate per mask position
        for mask_pos in mask_positions:
            mask_preds = predictions[0, mask_pos, :]
            top_idxs = mask_preds.detach().numpy().argsort()[::-1]
            top_idx = top_idxs[0]
            top_prob = mask_preds[top_idx]
            top_word = tokenizer.ids_to_tokens[top_idx]
            candidates.append((top_word, top_prob.detach().item()))
            top_words_pos = []
            for i in top_idxs[:20]:
                top_words_pos.append((tokenizer.ids_to_tokens[i],
                                      mask_preds[i].detach().item()))
            top_words.append(top_words_pos)
        # Commit only the single most confident position this round.
        best_candidate = max(candidates, key=lambda x: x[1])
        best_pos = mask_positions[candidates.index(best_candidate)]
        tokenized_text[best_pos] = best_candidate[0]
        mask_positions = [i for i in mask_positions if i != best_pos]
        top_words_all.append(top_words[candidates.index(best_candidate)])
    # Re-join WordPiece continuations ('##') into whole words.
    pred_sent = ' '.join(tokenized_text).replace(' ##', '')
    return (pred_sent, top_words_all)
def predict_missing_word(sentence):
    """Return BERT's best token for the first [MASK] in *sentence*.

    Uses the module-level `tokenizer` and loads bert-base-uncased fresh
    on every call.
    """
    pieces = tokenizer.tokenize(sentence)
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)
    # Single sentence: every position belongs to segment 0.
    ids_tensor = torch.tensor([piece_ids])
    segment_tensor = torch.tensor([[0] * len(pieces)])
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    # Score every vocabulary entry at each position.
    with torch.no_grad():
        scores = model(ids_tensor, segment_tensor)
    mask_at = pieces.index('[MASK]')
    best_id = torch.argmax(scores[0, mask_at]).item()
    return tokenizer.convert_ids_to_tokens([best_id])[0]
def __init__(self, top_k, bert_name):
    """Masked-LM wrapper keeping the top *top_k* predictions of *bert_name*."""
    self.top_k = top_k
    # Uncased checkpoints expect lower-cased input.
    self.do_lower_case = "uncased" in bert_name
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_name, do_lower_case=self.do_lower_case)
    self.model = BertForMaskedLM.from_pretrained(bert_name)
    # Inference only: disable dropout.
    self.model.eval()
def __init__(self, factorize=True):
    """Load bert-base-uncased masked LM + tokenizer and reset weight buffers."""
    self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.factorize = factorize
    # Accumulators populated later during scoring.
    self.weight_of_phrase = []
    self.weight_of_position = []
    self.weight_average = []
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification", base_model=None, base_tokenizer=None, device="cuda", chinese=False):
    """Load a pretrained BERT variant plus tokenizer and move it to *device*.

    When *base_model* is None, downloads 'bert-base-chinese' (chinese=True)
    or 'bert-base-uncased'. The tokenizer comes from *base_tokenizer* when
    given, otherwise from *base_model*. Returns (None, None) and prints an
    error for an unsupported *model_type*.
    """
    if base_model is None:
        # Download from huggingface
        base_model = "bert-base-chinese" if chinese else "bert-base-uncased"
    # Dispatch table instead of an if/elif chain.
    model_classes = {
        "BertForSequenceClassification": BertForSequenceClassification,
        "BertForNextSentencePrediction": BertForNextSentencePrediction,
        "BertForMaskedLM": BertForMaskedLM,
    }
    if model_type not in model_classes:
        print("[Error]: unsupported model type")
        return None, None
    model = model_classes[model_type].from_pretrained(base_model)
    # Load pre-trained model tokenizer (vocabulary): download or local file.
    vocab_source = base_model if base_tokenizer is None else base_tokenizer
    tokenizer = BertTokenizer.from_pretrained(vocab_source)
    model.to(device)
    return model, tokenizer
def __init__(self, model):
    """Hold a masked-LM checkpoint and its tokenizer, both from *model*."""
    # tokenizer
    self.tokenizer = BertTokenizer.from_pretrained(model)
    # Model, switched to inference mode immediately.
    self.bertModel = BertForMaskedLM.from_pretrained(model)
    self.bertModel.eval()
def do_ai_madlib(text_with_blanks, blank_token):
    """Fill each *blank_token* in the text left-to-right with bert-base-cased.

    Each blank is replaced by BERT's top prediction (re-scoring the text
    after every fill); filled words are wrapped in '__' markers.
    """
    bert_version = 'bert-base-cased'
    model = BertForMaskedLM.from_pretrained(bert_version)
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    tokens = tokenizer.tokenize(text_with_blanks)
    # Locate the blanks and swap them for BERT's mask token.
    mask_idxs = [i for i, tok in enumerate(tokens) if tok == blank_token]
    for i in mask_idxs:
        tokens[i] = '[MASK]'
    model.eval()
    for i in mask_idxs:
        # convert tokens to their index in the "vocabulary"
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        # create a tensor for these indices
        tokens_tensor = torch.tensor([token_ids])
        position_scores = model(tokens_tensor)[0, i]
        best_id = torch.argmax(position_scores).item()
        tokens[i] = tokenizer.convert_ids_to_tokens([best_id])[0]
    # Highlight the generated words.
    for i in mask_idxs:
        tokens[i] = '__' + tokens[i] + '__'
    return ' '.join(tokens).replace(' ##', '')
def generate_syntactically_similar_sentences_replace(num_of_perturb, dataset):
    """Generate syntactically similar sentences for each sentence in the dataset.
    For PaInv-Replace
    Returns dictionary of original sentence to list of generated sentences

    num_of_perturb: number of perturbations to make for a word in a sentence.
    dataset: path to a file with one sentence per line.

    Fixes vs. original: the `num_of_perturb` parameter was silently
    overwritten with a hard-coded 50 (ignoring the caller's value), and the
    dataset file handle was never closed.
    """
    # Use nltk treebank tokenizer and detokenizer
    tokenizer = TreebankWordTokenizer()
    detokenizer = TreebankWordDetokenizer()
    # Stopwords from nltk
    stopWords = list(set(stopwords.words('english')))
    # when we use Bert
    berttokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
    bertmodel = BertForMaskedLM.from_pretrained('bert-large-uncased')
    bertmodel.eval()
    dic = {}
    num_sent = 0  # count of inputs that produced at least one perturbation
    # File from which sentences are read; closed deterministically.
    with open(dataset, "r") as file:
        for line in file:
            source_sent = line.split("\n")[0]
            # Generating new sentences using BERT
            new_sents = perturb(source_sent, bertmodel, num_of_perturb)
            dic[line] = new_sents
            if new_sents != []:
                num_sent += 1
    return dic
def example_get_lm(tokens_tensor, segments_tensors, tokenizer): '''how to use BertForMaskedLM''' # Load pre-trained model (weights) model = BertForMaskedLM.from_pretrained('bert-base-uncased') model.eval() # If you have a GPU, put everything on cuda tokens_tensor = tokens_tensor.to('cuda') segments_tensors = segments_tensors.to('cuda') model.to('cuda') # Predict all tokens with torch.no_grad(): predictions = model(tokens_tensor, segments_tensors) masked_index = 8 # confirm we were able to predict 'henson' predicted_index = torch.argmax(predictions[0, masked_index]).item() print("predicted_index") print(predicted_index) predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] print("predicted_token") print(predicted_token) # assert predicted_token == 'henson' return
def predict():
    """Fill every '_' blank in the module-level text[idx] with
    bert-large-uncased (left-to-right greedy decoding) and render the
    completed madlib into the Tk `madlibsframe`."""
    # Load pre-trained model with masked language model head
    bert_version = 'bert-large-uncased'
    model = BertForMaskedLM.from_pretrained(bert_version)
    # Preprocess text
    tokenizer = BertTokenizer.from_pretrained(bert_version)
    tokenized_text = tokenizer.tokenize(text[idx])
    mask_positions = [i for i, tok in enumerate(tokenized_text) if tok == '_']
    for pos in mask_positions:
        tokenized_text[pos] = '[MASK]'
    # Predict missing words from left to right
    model.eval()
    for mask_pos in mask_positions:
        # Convert tokens to vocab indices and score this position.
        token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
        scores = model(torch.tensor([token_ids]))[0, mask_pos]
        best_id = torch.argmax(scores).item()
        # Update text
        tokenized_text[mask_pos] = tokenizer.convert_ids_to_tokens([best_id])[0]
    # Mark the predicted words.
    for mask_pos in mask_positions:
        tokenized_text[mask_pos] = "_" + tokenized_text[mask_pos] + "_"
    madlib = (' '.join(tokenized_text).replace(' ##', ''))
    bottom = Text(madlibsframe, height=10, width=50, wrap=WORD)
    bottom.configure(font=("Times New Roman", 18, "bold"))
    bottom.insert(END, madlib)
    bottom.pack()
def guess_single_word(text):
    """Print BERT's predictions for the single [MASK] token in *text*.

    Debug-style helper: prints each intermediate value (tokens, ids,
    tensors, top-5 candidates, best candidate). Relies on the module-level
    `tokenizer`, `pretrained_model_path`, and `get_top_n_idx`.
    """
    tokenized_text = tokenizer.tokenize(text)
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    print(indexed_tokens)
    masked_index = tokenized_text.index('[MASK]')
    print(masked_index)
    # Create the segments tensors. Single sentence -> all segment 0.
    segments_ids = [0] * len(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    print(tokens_tensor, segments_tensors)
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_path)
    model.eval()
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)
    print(predictions.shape)
    # Top-5 candidate ids at the masked position.
    pre_idxs = get_top_n_idx(predictions[0, masked_index], 5)
    print(pre_idxs)
    print(tokenizer.convert_ids_to_tokens(np.asarray(pre_idxs)))
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    print(predicted_index)
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    print(predicted_token)
def __init__(self, model_path, tokenizer_path):
    """Load a masked-LM checkpoint and its tokenizer from the given paths."""
    super(Bert, self).__init__()
    # Remember the source paths (useful for reloading / debugging).
    self.model_path = model_path
    self.tokenizer_path = tokenizer_path
    self.model = BertForMaskedLM.from_pretrained(model_path)
    self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
def __init__(self, segment_size, output_size, dropout):
    """Punctuation model: BERT MLM logits over a window of *segment_size*
    tokens, flattened, batch-normed, and projected to *output_size* classes."""
    super(BertPunc, self).__init__()
    self.bert = BertForMaskedLM.from_pretrained('bert-base-uncased')
    # Width of the bert-base-uncased MLM head output (vocabulary size).
    self.bert_vocab_size = 30522
    flat_dim = segment_size * self.bert_vocab_size
    self.bn = nn.BatchNorm1d(flat_dim)
    self.fc = nn.Linear(flat_dim, output_size)
    self.dropout = nn.Dropout(dropout)
def mine_from_wikipedia(hardware):
    """Run all four mining strategies over the Wikipedia candidate file
    on the given device."""
    print('loading BERT...')
    bert = BertForMaskedLM.from_pretrained(bert_model)
    print('loading GPT2...')
    gpt = GPT2LMHeadModel.from_pretrained(gpt2_model)
    candidates = data_repo + wikipedia_candidates
    single = template_repo + single_templates
    knowledge_miners = {
        # Plain concatenation of the triple's surface forms.
        'concat': KnowledgeMiner(candidates, hardware, DirectTemplate, bert),
        # Hand-written templates, with and without grammar correction.
        'template': KnowledgeMiner(candidates, hardware, PredefinedTemplate,
                                   bert, grammar=False, template_loc=single),
        'template_grammar': KnowledgeMiner(candidates, hardware,
                                           PredefinedTemplate, bert,
                                           grammar=True, template_loc=single),
        # Multi-template scoring backed by the GPT-2 language model.
        'coherency': KnowledgeMiner(candidates, hardware, EnumeratedTemplate,
                                    bert, language_model=gpt,
                                    template_loc=template_repo + multiple_templates),
    }
    for template_type in knowledge_miners.keys():
        run_experiment(template_type, knowledge_miners)
def __init__(self, use_gpu=False):
    """Load the default BERT weights; use the GPU only when requested
    AND actually available."""
    self.tokenizer = BertTokenizer.from_pretrained(DEFAULT_BERT_WEIGHTS)
    self.model = BertForMaskedLM.from_pretrained(DEFAULT_BERT_WEIGHTS)
    self.model.eval()
    # Honor the flag only on machines where CUDA exists.
    self.device = torch.device(
        "cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
    self.model.to(self.device)
def __init__(self, hidden_size=768, linear_out=2, batch_first=True):
    """Build the classifier: locally fine-tuned BERT ('lm/' checkpoint)
    followed by an LSTM (hidden 300) and a linear output layer.

    hidden_size: BERT hidden width fed into the LSTM.
    linear_out: number of output classes.
    batch_first: accepted for API compatibility; not used below.
    """
    super(Classifier, self).__init__()
    # Paths of the locally fine-tuned masked-LM checkpoint.
    self.output_model_file = "lm/pytorch_model.bin"
    self.output_config_file = "lm/config.json"
    self.tokenizer = BertTokenizer.from_pretrained("lm", do_lower_case=False)
    self.config = BertConfig.from_json_file(self.output_config_file)
    self.model = BertForMaskedLM(self.config)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    # map_location lets CPU-only machines load a GPU-saved checkpoint.
    self.state_dict = torch.load(self.output_model_file, map_location=device)
    self.model.load_state_dict(self.state_dict)
    self.lstm = torch.nn.LSTM(hidden_size, 300)
    self.linear = torch.nn.Linear(300, linear_out)
def __init__(self):
    """Load the bert-large-uncased masked LM onto GPU when available."""
    self.use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda" if self.use_cuda else "cpu")
    self.bertmodel = 'bert-large-uncased'
    self.tokenizer = BertTokenizer.from_pretrained(self.bertmodel)
    # Move to the chosen device, then freeze for inference.
    self.model = BertForMaskedLM.from_pretrained(self.bertmodel).to(self.device)
    self.model.eval()
def initialize_bert_corrector(self):
    """Load the BERT tokenizer and masked LM used for correction,
    reporting the load time."""
    started = time.time()
    self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    print("Loaded model: %s, vocab file: %s, spend: %.3f s." %
          (self.bert_model_dir, self.bert_model_vocab, time.time() - started))
    self.initialized_bert_corrector = True
def load_model(modeldir):
    """Return (model, tokenizer) loaded from *modeldir*; the model is put
    in eval mode and moved to CUDA."""
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained(modeldir)
    # Load pre-trained model (weights), inference-only, on GPU.
    model = BertForMaskedLM.from_pretrained(modeldir)
    model.eval()
    model.to('cuda')
    return model, tokenizer
def __init__(self, train_data=None, dev_data=None):
    """Initialize the base trainer and place a bert-base-uncased masked LM
    on CUDA in eval mode."""
    super().__init__(train_data, dev_data)
    checkpoint = 'bert-base-uncased'
    self.bert = BertForMaskedLM.from_pretrained(checkpoint)
    self.bert.to('cuda')
    self.tokenizer = tokenization.BertTokenizer.from_pretrained(checkpoint)
    self.bert.eval()
def init(self, model_type, model_dir):
    """Load tokenizer + masked LM for *model_type*, caching downloads in
    *model_dir*; moves the model to CUDA when self.gpu is set."""
    self.model_type = model_type
    self.model_dir = model_dir
    self.tokenizer = BertTokenizer.from_pretrained(model_type,
                                                   cache_dir=model_dir)
    self.model = BertForMaskedLM.from_pretrained(model_type,
                                                 cache_dir=model_dir)
    self.model.eval()
    # NOTE: self.gpu is expected to be set elsewhere on the instance.
    if self.gpu:
        self.model.to("cuda")
def __init__(self, weight_name='bert-base-uncased'):
    """Masked-LM scorer: fast tokenizer, cross-entropy loss, model placed
    on the device chosen by self.get_device()."""
    self.tokenizer = BertTokenizerFast.from_pretrained(weight_name,
                                                       do_lower_case=True)
    self.model = BertForMaskedLM.from_pretrained(weight_name)
    self.loss_fct = torch.nn.CrossEntropyLoss()
    self.device = self.get_device()
    self.model = self.model.to(self.device)
    # Inference only: disable dropout.
    self.model.eval()
def load_model(device):
    """Populate the module-level `model` (plain encoder) and `modelp`
    (masked-LM head), both eval-mode on *device*."""
    global model, modelp
    # Plain BERT encoder for embeddings.
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()
    model.to(device)
    # Same checkpoint with the masked-LM head for token prediction.
    modelp = BertForMaskedLM.from_pretrained('bert-base-uncased')
    modelp.eval()
    modelp.to(device)
def __init__(self, model_name: str = 'bert-base-uncased', do_lower_case: bool = True):
    """Load the tokenizer (vocabulary) and masked-LM weights for
    *model_name*, leaving the model in eval mode."""
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = BertTokenizer.from_pretrained(model_name,
                                                   do_lower_case=do_lower_case)
    # Load pre-trained model (weights)
    self.model = BertForMaskedLM.from_pretrained(model_name)
    self.model.eval()
def load_bert():
    """Lazily load the module-level BERT masked-LM and tokenizer.

    The checkpoint name comes from the BERT_MODEL environment variable
    (default 'bert-base-uncased'). No-op if already loaded.
    """
    global bert_tok, bert
    if bert is not None:
        return
    # 'bert-base-uncased', 'bert-base-multilingual-uncased'
    bert_model_str = os.getenv('BERT_MODEL', default='bert-base-uncased')
    bert_tok = BertTokenizer.from_pretrained(bert_model_str)
    bert = BertForMaskedLM.from_pretrained(bert_model_str)
    bert.eval()
def initialize_bert_corrector(self):
    """Load tokenizer + masked LM for correction and cache the [MASK] id."""
    started = time.time()
    self.bert_tokenizer = BertTokenizer(self.bert_model_vocab)
    # Cache the vocabulary id of the mask token for later lookups.
    self.MASK_ID = self.bert_tokenizer.convert_tokens_to_ids([MASK_TOKEN])[0]
    # Prepare model
    self.model = BertForMaskedLM.from_pretrained(self.bert_model_dir)
    logger.debug("Loaded model ok, path: %s, spend: %.3f s." %
                 (self.bert_model_dir, time.time() - started))
    self.initialized_bert_corrector = True
def loadBERT():
    """Populate the module-level `tokenizer` and `model` with
    bert-base-uncased (model left in eval mode)."""
    global tokenizer, model
    print("Loading BERT")
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load pre-trained model (weights)
    model = BertForMaskedLM.from_pretrained('bert-base-uncased')
    model.eval()
    print("Done")
# --- BertModel: inspect hidden states of bert-large-uncased ---------------
model = BertModel.from_pretrained(home + '/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()
## Predict hidden states features for each layer
print(tokens_tensor.shape)  # torch.Size([1, 14])
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)
## We have a hidden states for each of the 24 layers in model bert-large-uncased
print(len(encoded_layers))  # 24
print(encoded_layers[0].shape)  # torch.Size([1, 14, 1024])
x = torch.LongTensor([[1, 2], [3, 4]]); print(x.shape)  # torch.Size([2, 2])
# NOTE(review): `modelfj` is undefined — this line raises NameError and
# stops the script here (possibly an intentional halt marker); confirm.
print(modelfj)
##################################################################
## BertForMaskedLM
model = BertForMaskedLM.from_pretrained('/Users/coder352/datasets/WordVec/pytorch_pretrained_bert/bert-large-uncased/')
model.eval()
## Predict all tokens
with torch.no_grad():
    predictions = model(tokens_tensor, segments_tensors)
print(predictions.shape)  # torch.Size([1, 14, 30522])
## confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item(); print(predicted_index)  # 27227
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)  # ['henson']
##################################################################
## OpenAI GPT2
##################################################################