def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Forward a BertForNextSentencePrediction model and check its outputs.

    Runs one labelled forward pass and asserts that the NSP score tensor
    has shape (batch_size, 2), then delegates the loss check.
    """
    nsp_model = BertForNextSentencePrediction(config=config)
    nsp_model.eval()
    nsp_loss, nsp_score = nsp_model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        next_sentence_label=sequence_labels,
    )
    result = {
        "loss": nsp_loss,
        "seq_relationship_score": nsp_score,
    }
    # NSP is a binary classification: one logit pair per example
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()),
        [self.batch_size, 2],
    )
    self.check_loss_output(result)
def add_sc(data):
    """Annotate each example in *data* with a BERT-NSP semantic-coherence score.

    For every pair of adjacent sentences in ``ex["summary"]`` the BERT
    next-sentence-prediction head judges whether the second sentence follows
    the first; ``ex["coherence"]`` is the fraction of pairs judged coherent
    (probability of "is next" > 0.5).  Summaries with at most one sentence
    get a coherence of 1.

    Fixes vs. original: the first ``[SEP]`` token is now part of segment 0
    (standard BERT convention for token_type_ids); the unused ``output``
    list was removed; the annotated list is returned for convenience
    (the input is still mutated in place, as before).

    :param data: iterable of dicts, each with a "summary" string
    :return: the same list, with "coherence" added to every example
    """
    print("Computing Semantic Coherence")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
    softmax = torch.nn.Softmax(dim=1)
    model.eval()
    for ex in tqdm(data):
        sentences = sent_tokenize(ex["summary"])
        if len(sentences) <= 1:
            # a single sentence is trivially coherent
            ex["coherence"] = 1
            continue
        coherent_pairs = 0
        for prev, curr in zip(sentences, sentences[1:]):
            text = "[CLS] " + prev + " [SEP] " + curr + " [SEP]"
            tokens = tokenizer.tokenize(text)
            # segment 0 covers [CLS] ... first [SEP] inclusive
            boundary = tokens.index("[SEP]") + 1
            segment_ids = [0] * boundary + [1] * (len(tokens) - boundary)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            with torch.no_grad():
                logits = model(
                    torch.tensor([token_ids]),
                    token_type_ids=torch.tensor([segment_ids]),
                )[0]
            # index 0 of the NSP logits is the "is next sentence" class
            if softmax(logits)[0].tolist()[0] > 0.5:
                coherent_pairs += 1
        ex["coherence"] = coherent_pairs / (len(sentences) - 1)
    return data
def main():
    """Score whether the second sentence plausibly follows the first using BERT NSP."""
    model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    premise = 'I like cookies !'
    follow_up = 'Do you like them ?'
    nsp_probs = bert_seq(model, tokenizer, premise, follow_up)
    print(nsp_probs)
def __init__(self, bert_device):
    """Load a pretrained BERT NSP model onto the given CUDA device.

    :param bert_device: CUDA device (index or torch.device) the model is moved to
    """
    self.bert_device = bert_device
    self.model = BertForNextSentencePrediction.from_pretrained(
        'bert-base-uncased').cuda(self.bert_device)
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # TODO: make the maximum sequence length configurable instead of hard-coded
    self.max_seq_len = 128
def load(self, fname=None):
    """Instantiate the NSP model and move it to the configured device.

    Either loads pretrained weights by model-hub name (when
    ``self.pretrained_bert`` is set and is not a local file path) or builds
    a freshly initialized model from a local BERT config file, with dropout
    probabilities derived from the configured keep-probs.

    :param fname: optional path stored as ``self.load_path``
    :raises ConfigError: if neither a pretrained model nor a config file is given
    """
    if fname is not None:
        self.load_path = fname
    if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
        # treated as a model-hub identifier, not a local checkpoint file
        self.model = BertForNextSentencePrediction.from_pretrained(
            self.pretrained_bert, output_attentions=False, output_hidden_states=False)
    elif self.bert_config_file and Path(self.bert_config_file).is_file():
        self.bert_config = BertConfig.from_json_file(str(expand_path(self.bert_config_file)))
        # configs store dropout probabilities; the class exposes keep-probs,
        # hence the 1.0 - x conversions below
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
        # randomly initialized weights: only the architecture comes from the config
        self.model = BertForNextSentencePrediction(config=self.bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")
    self.model.to(self.device)
def __init__(self, categories: list):
    """Store the category list and load a pretrained BERT NSP model in eval mode.

    :param categories: list of category labels used by this classifier
    """
    self.categories = categories
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    # inference only: disable dropout etc.
    self.model.eval()
def main():
    """Explore TED-Ed quiz data and rank candidate transcript sentences with BERT NSP.

    Loads crawled video/question data, keeps videos that contain at least one
    multiple-choice quiz, then for one sample question scores each candidate
    transcript sentence as a continuation of the quiz description.
    """
    path = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    analysis = text_analysis()
    # analysis.read_realtion(path)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    # for item in question:
    #     print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    scripts = analysis.gather_transcripts(path)
    temp_dic = analysis.build_question_transcripts(path_new)
    # keep only videos that have at least one multiple-choice question
    temp = []
    for item in temp_dic:
        print(item)
        for quiz in temp_dic[item]['questions']:
            if quiz['question_type'] == 'multiple-choices':
                temp.append(temp_dic[item])
                break
    new_element = get_question_hint_sentence_x(temp[1])
    #for question in temp[-1]['questions']:
    #    print(question)
    #
    #print(temp[-1])
    #print(len(temp))
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    #tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
    #model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
    print(temp[1])
    for i in range(len(new_element['question'])):
        first_sentence = new_element['question'][i]['quiz_description'].strip(' ')
        next_sentences = new_element['question'][i]['responding_candidate']
        x = []
        for sentence in next_sentences:
            encoding = tokenizer(first_sentence, sentence, return_tensors='pt')
            # label 1 is supplied only so the model returns a loss; the logits are what we use
            outputs = model(**encoding, labels=torch.LongTensor([1]))
            logits = outputs.logits
            # probability that `sentence` is the next sentence (NSP class 0)
            probs = softmax(logits, dim=1)[0][0].item()
            x.append(probs)
        # index of the most likely continuation among the candidates
        a = argmax(x)
        #print(next_sentences)
        print(first_sentence)
        print(next_sentences)
        print(new_element['question'][i]['video_answer_hinted'])
        # NOTE(review): only the first question is processed — confirm this
        # early break is intentional (looks like debugging scaffolding)
        break
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download (if needed) and load the BERT NSP model plus its tokenizer.

    :param cache_dir: directory used to cache the downloaded model
    :param verbose: forwarded to the download helper
    """
    from transformers import BertForNextSentencePrediction, BertTokenizer

    # fetch and unzip the pretrained model into the cache directory
    self.path_model = download_model(
        'bert.botxo.pytorch', cache_dir,
        process_func=_unzip_process_func, verbose=verbose)
    self.tokenizer = BertTokenizer.from_pretrained(self.path_model)
    # output_hidden_states=True makes the model return all hidden states
    self.model = BertForNextSentencePrediction.from_pretrained(
        self.path_model, output_hidden_states=True)
def test_get_probability_of_next_sentence(self):
    """An unrelated follow-up scores low; a natural answer scores high."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
    question = "How old are you?"
    unrelated = "The Eiffel Tower is in Paris"
    answer = "I am 22 years old"
    unrelated_prob = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
    answer_prob = get_probability_of_next_sentence(tokenizer, model, question, answer)
    # reference values pinned from bert-base-cased
    assert_almost_equal(unrelated_prob, 0.0149559)
    assert_almost_equal(answer_prob, 0.9997911)
def test_get_probability_of_next_sentence_portuguesee(self):
    """Portuguese model: a natural answer outranks an unrelated follow-up."""
    tokenizer = BertTokenizer.from_pretrained('models/neuralmind/bert-base-portuguese-cased')
    model = BertForNextSentencePrediction.from_pretrained('models/neuralmind/bert-base-portuguese-cased')
    question = "Quantos anos você tem?"
    unrelated = "A Torre Eiffel fica em Paris"
    answer = "Eu tenho 22 anos"
    unrelated_prob = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
    answer_prob = get_probability_of_next_sentence(tokenizer, model, question, answer)
    # reference values pinned from the Portuguese checkpoint
    assert_almost_equal(unrelated_prob, 0.5229671)
    assert_almost_equal(answer_prob, 0.9979677)
def get_nsp_model_and_optimizer(params, device):
    """Build a BERT next-sentence-prediction model and its optimizer.

    :param params: dict containing at least "model_name" (must be a BERT
        checkpoint) plus whatever the optimizer factory reads
    :param device: torch device the model is moved to
    :return: (nsp_model, optimizer) tuple
    """
    model_name = params["model_name"]
    # BertForNextSentencePrediction only accepts BERT checkpoints.
    # (startswith replaces the fragile model_name[:4] slice comparison)
    assert model_name.startswith("bert"), f"Non-BERT models not supported for NSP. Supplied model name was {model_name}"
    nsp_model = BertForNextSentencePrediction.from_pretrained(model_name)
    nsp_model = nsp_model.to(device)
    optimizer = get_weighted_adam_optimizer(nsp_model, params)
    return nsp_model, optimizer
def __init__(self, **kwargs):
    """Initialize the BERT NSP model.

    :param batch_size: [int] batch size used for BERT (required kwarg)
    """
    super().__init__()
    self.batch_size = kwargs['batch_size']
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    self.model.eval()
    # move to GPU when one is available
    if torch.cuda.is_available():
        self.model.cuda()
def create_and_check_for_next_sequence_prediction(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    """Run BertForNextSentencePrediction and verify the logits shape is (batch, 2)."""
    model = BertForNextSentencePrediction(config=config)
    model.to(torch_device)
    model.eval()
    output = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        next_sentence_label=sequence_labels,
    )
    # binary is-next / not-next classification
    self.parent.assertEqual(output.logits.shape, (self.batch_size, 2))
def __init__(self, surprise_weight: float = 2.0, sentence_weight: float = 1.0):
    """Load the NSP and masked-LM heads of the pretrained model and store weights.

    :param surprise_weight: weight applied to the masked-LM ("surprise") signal
    :param sentence_weight: weight applied to the next-sentence signal
    """
    self.tokenizer = BertTokenizer.from_pretrained(self.PRETRAINED_MODEL_NAME)
    # next-sentence-prediction head, inference mode
    self.sentence_model = BertForNextSentencePrediction.from_pretrained(self.PRETRAINED_MODEL_NAME)
    self.sentence_model.eval()
    # masked-language-model head, inference mode
    self.language_model = BertForMaskedLM.from_pretrained(self.PRETRAINED_MODEL_NAME)
    self.language_model.eval()
    self.surprise_weight = surprise_weight
    self.sentence_weight = sentence_weight
def __init__(self):
    """Initialize chatbot state and load the BERT NSP model + tokenizer."""
    self.filename = 'name'
    self.verbose_response = True
    self.output = ""
    #self.kernel = aiml_std.Kernel()
    self.tree = None          # parsed dialogue tree (set later)
    self.root = None          # root node of the tree
    self.l = []               # working list of candidate nodes
    self.score = []           # NSP scores aligned with self.l
    self.memory = {}          # conversational memory
    self.index = -1           # current position marker
    self.incomplete = False   # whether the last utterance was incomplete
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
def prep_for_training(num_training_steps):
    """Build the requested model, its AdamW optimizer (with weight-decay groups),
    a linear warmup scheduler, and the matching tokenizer.

    :param num_training_steps: total optimizer steps, used to size the schedule
    :return: (model, optimizer, scheduler, tokenizer)
    :raises ValueError: if args.model names an unsupported model
    """
    if args.model == "Albert":
        model = AlbertForDebateSequenceClassification.from_pretrained(
            "albert-base-v2", newly_added_config=args)
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    elif args.model == "BertNSP":
        model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    else:
        raise ValueError("request model is not available")
    model.to(DEVICE)

    # Prepare optimizer: bias and LayerNorm parameters are excluded from weight decay
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    decay_params = []
    no_decay_params = []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": 0.01},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_training_steps * args.warmup_ratio),
        num_training_steps=num_training_steps,
    )
    return model, optimizer, scheduler, tokenizer
def task_2():
    """Task 2: sentence-pair (next-sentence) prediction on two examples."""
    pair_a = ("今天天气怎么样", "今天天气很好")
    pair_b = ("小明今年几岁了", "我不喜欢学习")
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    # sentence pairs can be passed directly as List[Tuple[str, str]]
    sen_code = tokenizer.batch_encode_plus([pair_a, pair_b])
    input_ids = torch.tensor(sen_code["input_ids"])
    model = BertForNextSentencePrediction.from_pretrained(bert_path)
    model.eval()
    outputs = model(input_ids)
    seq_relationship_scores = outputs.logits  # torch.Size([batch, 2])
    # pred_lst = seq_relationship_scores.max(dim=1).indices
    pred_lst = seq_relationship_scores.argmax(axis=1)
    for pred in pred_lst:
        # 0 = "is next sentence", 1 = "is not" (the second pair is unrelated,
        # yet the model may still output 0)
        print(f"预测结果:{pred}")
def create_and_check_bert_for_next_sequence_prediction(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Run a labelled NSP forward pass and verify logits shape plus loss output."""
    model = BertForNextSentencePrediction(config=config)
    model.to(torch_device)
    model.eval()
    result = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        next_sentence_label=sequence_labels,
    )
    # one (is-next, not-next) logit pair per example
    expected_shape = [self.batch_size, 2]
    self.parent.assertListEqual(list(result["logits"].size()), expected_shape)
    self.check_loss_output(result)
def test_get_probability_of_next_sentence(self):
    """WSC analysis prefers the coherent sentence regardless of argument order."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-base-cased')
    wrong_sentence = "How old are you? The Eiffel Tower is in Paris"
    correct_sentence = "How old are you? I am 22 years old"
    # coherent candidate first: full match expected
    full, partial = analyse_single_wsc_bert(model, tokenizer, correct_sentence, wrong_sentence)
    assert full
    assert partial == 0
    # swapped order: no full match expected
    full, partial = analyse_single_wsc_bert(model, tokenizer, wrong_sentence, correct_sentence)
    assert not full
    assert partial == 0
def test_get_probability_of_next_sentence_multilingual(self):
    """Multilingual BERT ranks a natural answer above an unrelated one in EN and PT."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')

    # English pair
    question = "How old are you?"
    unrelated = "The Eiffel Tower is in Paris"
    answer = "I am 22 years old"
    unrelated_prob = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
    answer_prob = get_probability_of_next_sentence(tokenizer, model, question, answer)
    assert_almost_equal(unrelated_prob, 0.5525756)
    assert_almost_equal(answer_prob, 0.9784408)

    # Portuguese pair
    question = "Quantos anos você tem?"
    unrelated = "A Torre Eiffel fica em Paris"
    answer = "Eu tenho 22 anos"
    unrelated_prob = get_probability_of_next_sentence(tokenizer, model, question, unrelated)
    answer_prob = get_probability_of_next_sentence(tokenizer, model, question, answer)
    assert_almost_equal(unrelated_prob, 0.8567284)
    assert_almost_equal(answer_prob, 0.9410717)
def __init__(self, extractor, config):
    """Birch-style reranker that combines top-k BERT NSP scores.

    With ``config["hidden"] == 0`` the combiner is a single bias-free linear
    layer initialized to a uniform average over the top-k scores; otherwise
    it is a small two-layer MLP.  When fine-tuning is disabled, BERT's
    parameters are frozen and the forward pass is wrapped in torch.no_grad().

    (Weights originate from the official apex Birch models, exported to
    the Capreolus/birch-bert-large-* checkpoints.)
    """
    super().__init__()
    self.config = config
    if config["hidden"] == 0:
        self.combine = nn.Linear(config["topk"], 1, bias=False)
        with torch.no_grad():
            # start as an unweighted mean of the top-k scores
            self.combine.weight = nn.Parameter(
                torch.ones_like(self.combine.weight) / config["topk"])
    else:
        assert config["hidden"] > 0
        self.combine = nn.Sequential(
            nn.Linear(config["topk"], config["hidden"]),
            nn.ReLU(),
            nn.Linear(config["hidden"], 1))

    self.bert = BertForNextSentencePrediction.from_pretrained(
        f"Capreolus/birch-bert-large-{config['pretrained']}")
    if not config["finetune"]:
        # BUGFIX: `self.bert.requires_grad = False` was a plain attribute
        # assignment on the Module and froze nothing; each Parameter's
        # requires_grad flag must be cleared individually.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert_context = torch.no_grad
    else:
        self.bert_context = contextlib.nullcontext
def test_get_probability_of_next_sentence_multilingual(self):
    """Multilingual WSC analysis prefers the coherent sentence in EN and PT."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-base-multilingual-cased')

    for wrong_sentence, correct_sentence in [
        ("How old are you? The Eiffel Tower is in Paris",
         "How old are you? I am 22 years old"),
        ("Quantos anos você tem? A Torre Eiffel fica em Paris",
         "Quantos anos você tem? Eu tenho 22 anos"),
    ]:
        # coherent candidate first: full match expected
        full, partial = analyse_single_wsc_bert(model, tokenizer, correct_sentence, wrong_sentence)
        assert full
        assert partial == 0
        # swapped order: no full match expected
        full, partial = analyse_single_wsc_bert(model, tokenizer, wrong_sentence, correct_sentence)
        assert not full
        assert partial == 0
def test_get_probability_of_next_sentence_portuguesee(self):
    """Portuguese WSC analysis prefers the coherent sentence."""
    tokenizer = BertTokenizer.from_pretrained(
        'models/neuralmind/bert-base-portuguese-cased')
    model = BertForNextSentencePrediction.from_pretrained(
        'models/neuralmind/bert-base-portuguese-cased')
    wrong_sentence = "Quantos anos você tem? A Torre Eiffel fica em Paris"
    correct_sentence = "Quantos anos você tem? Eu tenho 22 anos"
    # coherent candidate first: full match expected
    full, partial = analyse_single_wsc_bert(model, tokenizer, correct_sentence, wrong_sentence)
    assert full
    assert partial == 0
    # swapped order: no full match expected
    full, partial = analyse_single_wsc_bert(model, tokenizer, wrong_sentence, correct_sentence)
    assert not full
    assert partial == 0
def main():
    """Load TED-Ed quiz data grouped by subject and print NSP-based stats per subject.

    For each subject category, samples up to 10/20 videos and delegates the
    scoring to print_stats using a BERT-large NSP model.
    """
    path_new = '/home/shuo/Documents/AI_learning/LearningQ/data/teded/teded_crawled_data/'
    correct_path = '/home/shuo/Documents/AI_learning/LearningQ/code/teded/video_hint/question_corrected.txt'
    analysis = text_analysis()
    analysis.read_relation(path_new)
    analysis.read_videoinfo(path_new)
    # questions=analysis.gather_question()
    question = analysis.video_question
    analysis.read_video_questions_from_JSON(correct_path)
    # for item in question:
    #     print(question[item]['quizzes'][0].keys())
    """
    self.video_question[title]: video_link', 'video_title_length', 'video_description', 'quizzes', 'video_youtube_link
    quizzes: quiz_description', 'question_type', 'quiz_options', 'hint', 'answer'
    multiple-choices open-ended
    """
    tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    model = BertForNextSentencePrediction.from_pretrained('bert-large-cased')
    scripts = analysis.gather_transcripts(path_new)
    temp_dic = analysis.build_question_transcripts(path_new)
    temp_dic = analysis.align_subject(temp_dic)
    cateloged = gather_subjects(temp_dic)
    stats = []
    for c in cateloged:
        # small categories are used in full; larger ones are sampled
        if len(cateloged[c]) <= 20:
            temp = cateloged[c]
        else:
            temp = random.sample(cateloged[c], 10)
        print(c + '\n')
        # for i in range(2, 5):
        # NOTE(review): `i` appears unbound here since the loop above is
        # commented out — confirm the intended value (or restore the loop).
        print_stats(temp, model, tokenizer, i, add_answer=False)
        # only the first category is processed — looks like debugging scaffolding
        break
def __init__(self, number_of_sentence, adjust_weight, trained_baseline_model=None, transform=True):
    """Assemble the multi-BERT model: an NSP head plus a sentence-scoring BERT.

    :param number_of_sentence: number of sentences considered per example
    :param adjust_weight: weighting factor applied when combining scores
    :param trained_baseline_model: optional pretrained baseline whose BERT
        and linear layer are reused instead of fresh pretrained weights
    :param transform: when True, adds a bias-free 768x768 linear transform
    """
    super(MultiBERTsModel, self).__init__()
    self.number_of_sentence = number_of_sentence
    self.adjust_weight = adjust_weight
    self.bertNSP = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')
    self.softmax = nn.Softmax(dim=1)
    #self.linear = nn.Linear(768 * self.number_of_sentence, 1)
    #self.bert = BertModel.from_pretrained('bert-base-chinese')
    if trained_baseline_model:
        # reuse the already-trained encoder and scoring layer
        self.bert = trained_baseline_model.bert
        self.sp_linear = trained_baseline_model.linear
    else:
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.sp_linear = nn.Linear(768, 1)
    # NOTE(review): self.transform is only defined when transform=True —
    # callers with transform=False must not touch it; confirm intended.
    if transform:
        self.transform = nn.Linear(768, 768, bias=False)
def predict_next(sen1, sen2, tokenizer, model_path):
    """Run BERT next-sentence prediction on a sentence pair and return the verdict.

    Tokenizes the two sentences, assigns segment ids 0/1, and forwards the
    pair (without [CLS]/[SEP] markers, as in the original) through the model.

    Improvement vs. original: the computed boolean was printed but never
    returned; it is now returned so callers can use it (printing preserved).

    :param sen1: first sentence
    :param sen2: candidate next sentence
    :param tokenizer: BERT tokenizer instance
    :param model_path: checkpoint path/name for BertForNextSentencePrediction
    :return: True when logit index 1 exceeds index 0
        (NOTE(review): in HF's convention index 0 = "is next", so True looks
        like "NOT a continuation" — confirm the intended polarity)
    """
    model = BertForNextSentencePrediction.from_pretrained(model_path)
    tokenized_sen1 = tokenizer.tokenize(sen1)
    tokenized_sen2 = tokenizer.tokenize(sen2)
    tokenized_text = tokenized_sen1 + tokenized_sen2
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    # segment 0 for the first sentence, segment 1 for the second
    segments_ids = [0] * len(tokenized_sen1) + [1] * len(tokenized_sen2)
    segments_tensors = torch.tensor([segments_ids])
    print(segments_tensors)
    with torch.no_grad():
        outputs = model(torch.tensor([indexed_tokens]), token_type_ids=segments_tensors)
    predictions = outputs[0].cpu().numpy()
    result = predictions[0][1] > predictions[0][0]
    print(predictions)
    print(result)
    return result
# Select this worker's 1000-book slice of the test set from the CLI argument.
partition = int(sys.argv[1])
start = partition * 1000
end = (partition + 1) * 1000
test_book_ids = test_book_ids[start:end]
print(len(test_book_ids), 'books')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# special-token ids passed through to the per-book processor
cls, sep = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]"])

# one GPU per partition (offset by 1); falls back to CPU
device = torch.device("cuda:" + str(partition + 1) if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.device_count(), "GPUs")

model = BertForNextSentencePrediction.from_pretrained(model_dir)
model = model.to(device)

for book_id in test_book_ids:
    print(book_id)
    try:
        process_book(bert_tok_dir, pred_scores_dir, model, device, cls, sep, book_id)
    except Exception as e:
        # best-effort batch run: log the failing book and continue
        print(book_id, e)
print('Done!')
def _get_next_sentence_prediction(self):
    """Load the BertForNextSentencePrediction transformer in inference mode."""
    self.nsp = BertForNextSentencePrediction.from_pretrained(self.model)
    # eval() disables dropout for deterministic scoring
    self.nsp.eval()
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, BertForNextSentencePrediction, BertTokenizer # import gpt_2_simple as gpt2 tokenizer_qa = AutoTokenizer.from_pretrained( "deepset/bert-large-uncased-whole-word-masking-squad2") model_qa = AutoModelForQuestionAnswering.from_pretrained( "deepset/bert-large-uncased-whole-word-masking-squad2") print("BERT For Q/A downloaded") model_nsp = BertForNextSentencePrediction.from_pretrained('bert-base-cased') tokenizer_nsp = BertTokenizer.from_pretrained('bert-base-cased') print("BERT NSP downloaded") # sess = gpt2.start_tf_sess() # gpt2.load_gpt2(sess, run_name='run1_topical_token') # print("GPT2 loaded")
# model-related parser.add_argument("--model", default="albert-base-v2", type=str, help="Model Name") parser.add_argument("--batch_size", default=2, type=int, help="Training batch size") # data-related parser.add_argument("--data_path", default="./data/augmented_data.csv", type=str) parser.add_argument("--train_size", default=0.5, type=float, help="Training Size (ratio)") args = parser.parse_args() from transformers import BertForNextSentencePrediction, BertTokenizer args.model = BertForNextSentencePrediction.from_pretrained( "bert-base-uncased") args.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") seed() main(args)