def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    """Run a BertForNextSentencePrediction forward pass and check its outputs.

    A model is built from ``config``, switched to eval mode and called with
    ``input_ids``, ``token_type_ids``, ``input_mask`` and ``sequence_labels``
    (positionally, in that order). The call must yield exactly two values:
    the loss and the sequence-relationship scores.

    The scores are asserted to have size ``[self.batch_size, 2]`` and the
    result dict is then handed to ``self.check_loss_output``.

    ``token_labels`` and ``choice_labels`` are accepted but not used here.
    """
    nsp_model = BertForNextSentencePrediction(config=config)
    nsp_model.eval()

    outputs = nsp_model(input_ids, token_type_ids, input_mask, sequence_labels)
    loss, seq_relationship_score = outputs
    result = dict(loss=loss, seq_relationship_score=seq_relationship_score)

    # One row per batch element, two scores per row.
    expected_size = [self.batch_size, 2]
    actual_size = list(result["seq_relationship_score"].size())
    self.parent.assertListEqual(actual_size, expected_size)

    self.check_loss_output(result)
Пример #2
0
def predictor(sentence1, sentence2):
    """Return the next-sentence-prediction probability at class index 0.

    The two sentences are joined as ``[CLS] s1 [SEP] s2 [SEP]``, tokenized
    with the module-level ``tokenizer`` and scored with the pretrained
    ``bert-base-uncased`` next-sentence-prediction head.

    Args:
        sentence1: First sentence (segment 0).
        sentence2: Candidate following sentence (segment 1).

    Returns:
        float: softmax probability at index 0 of the two NSP logits.
        NOTE(review): the Hugging Face BERT documentation defines index 0
        as "sentence B continues sentence A" -- confirm for the installed
        library version.
    """
    text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"

    # Segment 0 covers [CLS] + sentence1 + the first [SEP] (hence +2);
    # segment 1 covers sentence2 + the final [SEP] (hence +1).
    segments_ids = [0] * (len(tokenizer.tokenize(sentence1)) + 2)
    segments_ids += [1] * (len(tokenizer.tokenize(sentence2)) + 1)

    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Use the GPU when one is present, otherwise fall back to the CPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # NOTE: the model is reloaded on every call, which is slow when this
    # function is called repeatedly.
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    model.to(device)

    # Predict the next sentence classification logits
    with torch.no_grad():
        next_sent_classif_logits = model(tokens_tensor, segments_tensors)

    # The first element of the model output holds the logits, one row per
    # input pair; normalise them across the two classes.
    probabilities = torch.softmax(next_sent_classif_logits[0], dim=1)
    return probabilities[0][0].item()
Пример #3
0
def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):
    """Score every row of a CSV with a BERT next-sentence-prediction model.

    The CSV at ``data`` is read in chunks of ``batchsize`` rows. Each chunk
    is turned into samples via ``get_batch``, converted to model features,
    run through the model on the GPU, and one softmax probability per row
    is appended to ``dest``.

    Args:
        data: Path to the input CSV. It must contain the columns ``id``,
            ``context`` and the response column chosen by
            ``dialogue_type``.
        dialogue_type: ``'DB'`` (column ``db_response_new``) or
            ``'normal'`` (column ``response``).
        dest: Output path. It is truncated first, then receives one
            probability per line.
        batchsize: Rows per chunk. A value of 1 routes through
            ``convert_single_example_to_features``.
        bert_model: Model name or path passed to ``from_pretrained`` for
            the config, tokenizer and model weights.
        cuda: Unused; a CUDA device is always required.

    Raises:
        AssertionError: If CUDA is unavailable, or if ``get_batch`` returns
            a different number of samples than the chunk has rows.
        KeyError: If ``dialogue_type`` is not ``'DB'`` or ``'normal'``.
    """
    assert torch.cuda.is_available(), 'PyTorch not running on GPU! #sadpanda'

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}
    # Resolve the column up front so an unknown dialogue type fails before
    # any model is loaded or the destination file is truncated.
    response_col = dialogue_type_dict[dialogue_type]

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the trained weights. Building the model from the config alone
    # leaves every parameter randomly initialised, so the written
    # probabilities would carry no signal.
    model = BertForNextSentencePrediction.from_pretrained(bert_model,
                                                          config=config)
    model.cuda()
    model.eval()

    # The row count is only used as the progress-bar total.
    row_count = pd.read_csv(data, usecols=['id']).dropna().shape[0]
    chunk_count = math.ceil(row_count / batchsize)

    # Truncate (or create) the destination file.
    with open(dest, 'w+'):
        pass

    cols = ['context', response_col]
    # Keep the source handle in a context manager so it is closed even if
    # a batch fails midway.
    with open(data, 'r') as source:
        reader = pd.read_csv(source, usecols=cols, chunksize=batchsize)
        for chunk in tqdm(reader, desc='Batches', total=chunk_count):
            samples = get_batch(chunk, response_col)

            assert len(samples) == chunk.shape[0], 'Some samples went missing!'

            if batchsize == 1:
                results = convert_single_example_to_features(
                    samples, tokenizer)
            else:
                results = convert_examples_to_features(samples, tokenizer)

            with torch.no_grad():
                input_ids = torch.tensor(
                    [x.input_ids for x in results]).cuda()
                token_type_ids = torch.tensor(
                    [x.input_type_ids for x in results]).cuda()
                attention_mask = torch.tensor(
                    [x.input_mask for x in results]).cuda()

                outputs = model(input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)[0]
                outputs = torch.softmax(outputs, dim=1)
            # Column 1 of the softmax is what gets written out.
            # NOTE(review): in the Hugging Face NSP head, index 1 is the
            # "not the next sentence" class -- confirm this is intended.
            db_probs = outputs[:, 1]

            # Append per batch so finished results are already on disk if
            # a later batch fails.
            with open(dest, 'a') as f:
                f.write('\n'.join([str(x) for x in db_probs.tolist()]) + '\n')
Пример #4
0
import torch
from pytorch_transformers import BertConfig, BertTokenizer, BertForNextSentencePrediction
import numpy as np
import pandas as pd

# Config, tokenizer and model all come from the same checkpoint name.
config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# from_pretrained loads the trained weights. Constructing the class from the
# config alone would leave the network randomly initialised, which makes any
# prediction meaningless.
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased',
                                                      config=config)
model.eval()
model.cuda()

df = pd.read_csv('breaker-of-dialogues/validation_db.csv')
max_word_count = 550


class SampleType:
    """Plain record describing one sample: an id and up to two texts.

    The values below are class-level defaults; code that builds a sample
    overwrites them on the instance.
    """

    # Identifier of the sample; defaults to 0.
    unique_id = 0
    # First text of the pair; defaults to the empty string.
    text_a = ''
    # Second text of the pair; None until assigned.
    text_b = None


def get_batch(df):
    """Build one ``SampleType`` per row of ``df`` and return them as a list.

    Args:
        df: DataFrame with an ``id`` column; each row's ``id`` becomes the
            sample's ``unique_id``.

    Returns:
        list: The samples, in row order. An empty frame gives an empty list.
    """
    samples = []
    for _, row in df.iterrows():
        temp_sample = SampleType()
        temp_sample.unique_id = row.id

        # NOTE(review): both texts are fixed placeholder strings rather than
        # values read from the row -- confirm this is intentional.
        temp_sample.text_a = 'hello my name is lionel messi'
        temp_sample.text_b = 'and I play football'

        samples.append(temp_sample)

    # Without this return the list built above is discarded and callers
    # receive None.
    return samples
Пример #5
0
# Prompt for the second sentence and read it from standard input.
print("文章2を入力してください")
sentence2 = input()

# Single input string in the "[CLS] A [SEP] B [SEP]" layout.
text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"

# Segment 0 covers [CLS] + sentence1 + the first [SEP] (hence +2);
# segment 1 covers sentence2 + the closing [SEP] (hence +1).
segments_ids = (
    [0] * (len(tokenizer.tokenize(sentence1)) + 2)
    + [1] * (len(tokenizer.tokenize(sentence2)) + 1)
)

tokenized_text = tokenizer.tokenize(text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()

# Move both inputs and the model onto the GPU.
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda')

# Predict the next sentence classification logits
with torch.no_grad():
    next_sent_classif_logits = model(tokens_tensor, segments_tensors)

# Softmax over the two classes, printed three ways: computed on the GPU,
# computed from a CPU copy of the logits, and the GPU result moved to CPU.
nsp_logits = next_sent_classif_logits[0]
a = torch.softmax(nsp_logits, dim=1)
print(a)
print(torch.softmax(nsp_logits.cpu(), dim=1))
print(a.cpu())
Пример #6
0
import torch
from pytorch_transformers import BertForNextSentencePrediction
from pytorch_transformers import BertTokenizer
from torch.nn.functional import cosine_similarity
from torch.nn.functional import softmax
from torch.nn.utils.rnn import pad_sequence

# Checkpoint name shared by the tokenizer and the model.
BERT_MODEL_VERSION = 'bert-base-uncased'
# NOTE(review): presumably the longest token sequence fed to the model --
# confirm against the code that uses it.
MAX_SENTENCE_LENGTH = 512

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_VERSION)

# Ask the model to also return its hidden states.
model = BertForNextSentencePrediction.from_pretrained(
    BERT_MODEL_VERSION, output_hidden_states=True)
model.eval()

# Move the model to the GPU only when one is available.
use_gpu = torch.cuda.is_available()
if use_gpu:
    model.cuda()


def calculate_similarities(
    query_embedding,
    document_embeddings,
):

    return cosine_similarity(
        query_embedding,
        document_embeddings,
        dim=1,
Пример #7
0
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
            }
            outputs = model(**inputs)
            # logger.info(outputs)
            # logger.info(torch.argmax(outputs[0],dim=-1))
            # 二分类,0代表是下一句
            if torch.argmax(F.softmax(outputs[0], dim=-1), dim=-1).item() == 0:
                logger.info([texts_a, texts_b])
                logger.info(F.softmax(outputs[0], dim=-1))
    return [texts_b, F.softmax(outputs[0], dim=-1)]


if __name__ == "__main__":
    import pickle
    import pandas as pd

    # Fine-tuned model from the local 'out/' directory, run on the GPU.
    model = BertForNextSentencePrediction.from_pretrained('out/')
    model.to('cuda')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-chinese', do_lower_case=True)

    # The first column of the header-less sheet holds the candidate tags.
    tag_sheet = pd.read_excel(
        '/data/jh/notebooks/wanglei/1688/data/break_tag.xlsx', header=None)
    break_tag = list(tag_sheet[0])

    # Score every tag as the second text against one fixed first text.
    ret = []
    for tag in break_tag:
        text_a = ['露出精致的锁骨和优美的天鹅颈']
        text_b = [tag]
        scored = test(text_a, text_b, tokenizer, model)
        ret.append(scored)