def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Run a forward pass of the next-sentence head and check its outputs.

    A ``BertForNextSentencePrediction`` is built from ``config``, switched to
    eval mode and called positionally with ``input_ids``, ``token_type_ids``,
    ``input_mask`` and ``sequence_labels``. The call must yield exactly two
    values: the loss and the sequence-relationship scores.

    The size of the scores must equal ``[self.batch_size, 2]``; the loss is
    then handed to ``self.check_loss_output`` together with the scores.
    ``token_labels`` and ``choice_labels`` are accepted but not used here.
    """
    nsp_model = BertForNextSentencePrediction(config=config)
    nsp_model.eval()

    outputs = nsp_model(input_ids, token_type_ids, input_mask, sequence_labels)
    loss, seq_relationship_score = outputs
    result = dict(loss=loss, seq_relationship_score=seq_relationship_score)

    # One row per example, two columns (one score per class).
    actual_shape = list(seq_relationship_score.size())
    self.parent.assertListEqual(actual_shape, [self.batch_size, 2])
    self.check_loss_output(result)
def predictor(sentence1, sentence2):
    """Return the next-sentence probability at class index 0 for a sentence pair.

    The pair is encoded as ``[CLS] sentence1 [SEP] sentence2 [SEP]`` with the
    module-level ``tokenizer``, scored by a freshly loaded
    ``bert-base-uncased`` ``BertForNextSentencePrediction`` and the logits are
    turned into probabilities with a softmax over the class dimension.

    Args:
        sentence1: First sentence (segment 0).
        sentence2: Candidate following sentence (segment 1).

    Returns:
        float: Softmax probability at class index 0 for the single pair. In
        the BERT next-sentence convention index 0 is the "sentence2 continues
        sentence1" class (see the model documentation).

    Note:
        The pretrained model is loaded on every call, which is slow; callers
        scoring many pairs may want to load it once outside this function.
    """
    # Nothing may be inserted between "[SEP] " and sentence2: the segment ids
    # below are sized from tokenizing each sentence on its own, so any extra
    # character here would make the token and segment sequences disagree.
    text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Segment 0: [CLS] + sentence1 + first [SEP]  -> len(sentence1 tokens) + 2
    # Segment 1: sentence2 + final [SEP]          -> len(sentence2 tokens) + 1
    first_segment_len = len(tokenizer.tokenize(sentence1)) + 2
    second_segment_len = len(tokenizer.tokenize(sentence2)) + 1
    segments_ids = [0] * first_segment_len + [1] * second_segment_len

    # Use the GPU when there is one, otherwise fall back to the CPU instead of
    # failing on the device transfer.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    model.to(device)

    # Predict the next sentence classification logits
    with torch.no_grad():
        next_sent_classif_logits = model(tokens_tensor, segments_tensors)

    # The first element of the model output holds the logits, one row per pair.
    probabilities = torch.softmax(next_sent_classif_logits[0], dim=1)
    return probabilities[0][0].item()
def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):
    """Score every row of a dialogue CSV with BERT next-sentence prediction.

    ``data`` is read in chunks of ``batchsize`` rows. Each chunk is turned
    into model features, run through ``BertForNextSentencePrediction`` on the
    GPU, and the softmax probability at class index 1 of every row is
    appended to ``dest``, one value per line.

    Args:
        data: Path of the input CSV. It must provide an ``id`` column (used
            only to size the progress bar), a ``context`` column and the
            response column selected by ``dialogue_type``.
        dialogue_type: ``'DB'`` (column ``db_response_new``) or ``'normal'``
            (column ``response``).
        dest: Path of the output file. It is truncated before the first batch.
        batchsize: Number of CSV rows per chunk. A value of 1 routes through
            ``convert_single_example_to_features``; anything else through
            ``convert_examples_to_features``.
        bert_model: Name or path handed to ``from_pretrained`` for the config,
            the tokenizer and the model weights.
        cuda: Not read by this function; kept so existing callers keep
            working. A GPU is always required.

    Raises:
        AssertionError: If CUDA is unavailable, or if ``get_batch`` returns a
            different number of samples than the chunk has rows.
        KeyError: If ``dialogue_type`` is not one of the known keys.
    """
    # Raised explicitly (not via ``assert``) so the check survives ``python -O``
    # while callers still see the same exception type and message.
    if not torch.cuda.is_available():
        raise AssertionError('PyTorch not running on GPU! #sadpanda')

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}
    response_column = dialogue_type_dict[dialogue_type]
    cols = ['context', response_column]

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the checkpoint weights. Constructing the model from ``config``
    # alone would leave every weight randomly initialised and make the
    # probabilities written below meaningless.
    model = BertForNextSentencePrediction.from_pretrained(bert_model, config=config)
    model.cuda()
    model.eval()

    # Rows with a non-null id, used only as the progress-bar total.
    row_count = pd.read_csv(data, usecols=['id']).dropna().shape[0]
    chunk_count = math.ceil(row_count / batchsize)

    # Create/truncate the destination so per-batch appends start from empty.
    with open(dest, 'w+'):
        pass

    # The input handle is closed when the loop finishes or raises.
    with open(data, 'r') as source:
        reader = pd.read_csv(source, usecols=cols, chunksize=batchsize)
        for chunk in tqdm(reader, desc='Batches', total=chunk_count):
            samples = get_batch(chunk, response_column)
            if len(samples) != chunk.shape[0]:
                raise AssertionError('Some samples went missing!')

            if batchsize == 1:
                results = convert_single_example_to_features(samples, tokenizer)
            else:
                results = convert_examples_to_features(samples, tokenizer)

            with torch.no_grad():
                input_ids = torch.tensor([x.input_ids for x in results]).cuda()
                token_type_ids = torch.tensor([x.input_type_ids for x in results]).cuda()
                attention_mask = torch.tensor([x.input_mask for x in results]).cuda()
                outputs = model(input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)[0]
                outputs = torch.softmax(outputs, dim=1)
                db_probs = outputs[:, 1]

            # Appending per batch keeps finished results on disk even if a
            # later batch fails.
            with open(dest, 'a') as f:
                f.write('\n'.join(str(x) for x in db_probs.tolist()) + '\n')
import torch
from pytorch_transformers import BertConfig, BertTokenizer, BertForNextSentencePrediction
import numpy as np
import pandas as pd

# Module-level setup: configuration and tokenizer come from the
# 'bert-base-uncased' checkpoint, while the model is constructed from the
# configuration object alone, put in eval mode and moved to the GPU.
config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction(config)
model.eval()
model.cuda()

# Validation data, loaded once at import time.
df = pd.read_csv('breaker-of-dialogues/validation_db.csv')
max_word_count = 550


class SampleType:
    """Plain record for one sentence pair; class attributes act as defaults."""

    # First text of the pair.
    text_a = ''
    # Second text of the pair; None until assigned.
    text_b = None
    # Identifier of the row the sample was built from.
    unique_id = 0


def get_batch(df):
    """Build one ``SampleType`` per row of ``df`` with fixed placeholder texts.

    Every sample takes its ``unique_id`` from the row's ``id`` field and the
    same two hard-coded sentences as ``text_a`` / ``text_b``.

    NOTE(review): no ``return`` is visible for this function, so as shown the
    collected list is discarded and the call evaluates to ``None``. The
    definition may continue beyond what is visible here; confirm.
    """
    samples = []
    for _, record in df.iterrows():
        sample = SampleType()
        sample.unique_id = record.id
        sample.text_a = 'hello my name is lionel messi'
        sample.text_b = 'and I play football'
        samples.append(sample)
# Ask for the second sentence (the prompt text is user-facing and kept as is).
print("文章2を入力してください")
sentence2 = input()

# Spell the special tokens into the text so a single tokenizer pass yields the
# whole input sequence.
text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"
tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Segment ids: 0 for [CLS] + sentence1 + first [SEP], 1 for sentence2 + last [SEP].
ids1 = [0] * (len(tokenizer.tokenize(sentence1)) + 2)
ids2 = [1] * (len(tokenizer.tokenize(sentence2)) + 1)
ids1.extend(ids2)
segments_ids = ids1

# Batch of one, placed on the GPU together with the model.
tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
segments_tensors = torch.tensor([segments_ids]).to('cuda')

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()
model.to('cuda')

# Predict the next sentence classification logits
with torch.no_grad():
    next_sent_classif_logits = model(tokens_tensor, segments_tensors)

# Class probabilities for the pair, shown three ways: computed on the GPU,
# computed from logits copied to the CPU, and the GPU result copied to the CPU.
a = torch.softmax(next_sent_classif_logits[0], dim=1)
print(a)
print(torch.softmax(next_sent_classif_logits[0].cpu(), dim=1))
print(a.cpu())
import torch from pytorch_transformers import BertForNextSentencePrediction from pytorch_transformers import BertTokenizer from torch.nn.functional import cosine_similarity from torch.nn.functional import softmax from torch.nn.utils.rnn import pad_sequence BERT_MODEL_VERSION = 'bert-base-uncased' MAX_SENTENCE_LENGTH = 512 tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_VERSION) model = BertForNextSentencePrediction.from_pretrained( BERT_MODEL_VERSION, output_hidden_states=True, ) model.eval() if torch.cuda.is_available(): model.cuda() def calculate_similarities( query_embedding, document_embeddings, ): return cosine_similarity( query_embedding, document_embeddings, dim=1,
'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] } outputs = model(**inputs) # logger.info(outputs) # logger.info(torch.argmax(outputs[0],dim=-1)) # 二分类,0代表是下一句 if torch.argmax(F.softmax(outputs[0], dim=-1), dim=-1).item() == 0: logger.info([texts_a, texts_b]) logger.info(F.softmax(outputs[0], dim=-1)) return [texts_b, F.softmax(outputs[0], dim=-1)] if __name__ == "__main__": import pickle import pandas as pd tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True) model = BertForNextSentencePrediction.from_pretrained('out/') model.to('cuda') break_tag = pd.read_excel( '/data/jh/notebooks/wanglei/1688/data/break_tag.xlsx', header=None) break_tag = list(break_tag[0]) ret = [] for tag in break_tag: text_a = ['露出精致的锁骨和优美的天鹅颈'] text_b = [tag] ret.append(test(text_a, text_b, tokenizer, model))