Example #1
import math

import pandas as pd
import torch
from tqdm import tqdm
from pytorch_transformers import BertConfig, BertTokenizer, BertForNextSentencePrediction

# get_batch, convert_single_example_to_features and convert_examples_to_features
# are project-local helpers (a get_batch variant appears in Example #3).


def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):

    assert torch.cuda.is_available(), 'PyTorch not running on GPU! #sadpanda'

    # Fix seeds and disable cuDNN autotuning so results are reproducible.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    # Map the dialogue type to the CSV column holding the candidate response.
    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # from_pretrained loads the trained weights; BertForNextSentencePrediction(config)
    # alone would leave the model randomly initialised.
    model = BertForNextSentencePrediction.from_pretrained(bert_model)
    model.cuda()
    model.eval()

    # Count rows once (ids only) so tqdm can report total progress over batches.
    df = pd.read_csv(data, usecols=['id'])
    df.dropna(inplace=True)
    row_count = df.shape[0]
    del df

    chunk_count = math.ceil(row_count / batchsize)

    # Truncate (or create) the destination file before appending batch results.
    with open(dest, 'w+'):
        pass

    cols = ['context', dialogue_type_dict[dialogue_type]]
    for i, chunk in enumerate(
            tqdm(pd.read_csv(data,
                             usecols=cols,
                             chunksize=batchsize),
                 desc='Batches',
                 total=chunk_count)):
        samples = get_batch(chunk, dialogue_type_dict[dialogue_type])

        assert len(samples) == chunk.shape[0], 'Some samples went missing!'

        if batchsize == 1:
            results = convert_single_example_to_features(samples, tokenizer)
        else:
            results = convert_examples_to_features(samples, tokenizer)

        with torch.no_grad():
            # Build the batch tensors and move them to the GPU.
            input_ids = torch.tensor([x.input_ids for x in results]).cuda()
            token_type_ids = torch.tensor([x.input_type_ids
                                           for x in results]).cuda()
            attention_mask = torch.tensor([x.input_mask
                                           for x in results]).cuda()

            outputs = model(input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)[0]
            # The NSP head returns logits of shape [batch, 2];
            # softmax converts them to per-class probabilities.
            outputs = torch.softmax(outputs, dim=1)
        db_probs = outputs[:, 1]

        with open(dest, 'a') as f:
            f.write('\n'.join([str(x) for x in db_probs.tolist()]) + '\n')
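
A minimal invocation sketch for the function above. The file names here are assumptions for illustration; the input CSV is expected to provide 'id', 'context' and 'response' columns.

start_inference(data='dialogues.csv',        # hypothetical input CSV
                dialogue_type='normal',      # selects the 'response' column
                dest='nsp_scores.txt',       # one probability per line is appended here
                batchsize=32,
                bert_model='bert-base-uncased',
                cuda=True)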
Example #2
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
    model = BertForNextSentencePrediction(config=config)
    model.eval()
    loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
    result = {
        "loss": loss,
        "seq_relationship_score": seq_relationship_score,
    }
    self.parent.assertListEqual(
        list(result["seq_relationship_score"].size()),
        [self.batch_size, 2])
    self.check_loss_output(result)
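
The test above only checks output shapes: the NSP head emits one pair of logits per example, hence the asserted [batch_size, 2]. A small illustrative sketch of turning such logits into per-class probabilities (the tensor values are made up):

import torch

# Fake NSP logits for a batch of two sentence pairs, shape [2, 2].
seq_relationship_score = torch.tensor([[2.0, -1.0],
                                       [-0.5, 1.5]])
# Softmax over the class dimension gives a probability per NSP class.
probs = torch.softmax(seq_relationship_score, dim=1)
print(probs)  # each row sums to 1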
Example #3
import torch
from pytorch_transformers import BertConfig, BertTokenizer, BertForNextSentencePrediction
import numpy as np
import pandas as pd

config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# from_pretrained loads the trained weights; constructing from the config alone
# would leave the NSP head randomly initialised.
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
model.eval()
model.cuda()

df = pd.read_csv('breaker-of-dialogues/validation_db.csv')
max_word_count = 550


class SampleType:
    # Simple container mirroring the fields the feature-conversion helpers expect.
    text_a = ''
    text_b = None
    unique_id = 0


def get_batch(df):
    samples = []
    for _, row in df.iterrows():
        temp_sample = SampleType()
        temp_sample.unique_id = row.id

        # Fixed placeholder sentence pair used in this example.
        temp_sample.text_a = 'hello my name is lionel messi'
        temp_sample.text_b = 'and I play football'

        samples.append(temp_sample)