Example #1
import numpy as np
from fast_bert.prediction import BertClassificationPredictor


def get_sentence_label_array(text, label_list: list,
                             predictor: BertClassificationPredictor):
    '''Given a sentence, return an array of PIO classification scores.'''
    ans_list = predictor.predict(text)
    ans_array = np.zeros(len(label_list))
    for ans in ans_list:
        label = ans[0]
        index = label_list.index(label)
        ans_array[index] = ans[1]
    return ans_array
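A minimal usage sketch for the function above; the model directory, label folder, and PIO label names are placeholders, not part of the original example:

# hypothetical paths: a trained fast-bert model in 'output/model_out' and a
# 'labels' folder containing labels.csv are assumed
predictor = BertClassificationPredictor(
    model_path='output/model_out',
    label_path='labels',
    multi_label=True,
    model_type='bert',
    do_lower_case=True)

label_list = ['P', 'I', 'O']  # assumed label names; must match labels.csv
scores = get_sentence_label_array(
    "Patients received 50 mg of the drug daily for two weeks.",
    label_list, predictor)
print(dict(zip(label_list, scores)))  # label -> predicted probability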
Example #2
    def predict(self, text):
        predictor = BertClassificationPredictor(
            model_path=self.in_dir + '/' + self.model_dir,
            label_path=self.in_dir + '/labels',  # location for labels.csv file
            multi_label=True,
            # model_type='xlnet',
            do_lower_case=True)
        # keep the seven highest-scoring labels and return them as hashtag strings
        prediction = predictor.predict(str(text))[:7]
        rst_list = []
        for i in range(len(prediction)):
            rst_list.append(" #" + str(prediction[i][0]))
        return rst_list
Example #3
from fast_bert.prediction import BertClassificationPredictor


class SentimentAnalyzer(object):
    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
                        model_path=model_path,
                        label_path=label_path, # location for labels.csv file
                        multi_label=False,
                        model_type='bert',
                        do_lower_case=False)
        self.preprocessor = TextPreprocessor()


    def predict_sentiment(self, tweet):
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        for label, confidence in prediction:
            if label == "0" and confidence >= 0.7:
                return "Negative"

            if label == "4" and confidence >= 0.7:
                return "Positive"

        return "Neutral"

    def batch_predict_sentiment(self, tweets):
        processed_tweets = []

        for tweet in tweets:
            processed_tweets.append(self.preprocessor.process(tweet))

        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        results = []

        for prediction in predictions:
            label_to_prob = dict(prediction)

            if label_to_prob["0"] >= 0.7:
                results.append("Negative")
            elif label_to_prob["4"] >= 0.7:
                results.append("Positive")
            else:
                results.append("Neutral")

        return results
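A brief usage sketch for the class above; the paths are placeholders and TextPreprocessor is assumed to be defined elsewhere in the project:

analyzer = SentimentAnalyzer(
    model_path='output/model_out',  # hypothetical trained model directory
    label_path='labels')            # hypothetical folder containing labels.csv

print(analyzer.predict_sentiment("I love this phone, the battery lasts forever!"))
print(analyzer.batch_predict_sentiment([
    "worst customer service ever",
    "it arrived on time, nothing special",
]))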
Example #4
import numpy as np
import torch
from fast_bert.prediction import BertClassificationPredictor


def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    Fast-bert automatically uses GPU if `torch.cuda.is_available() == True`

    Parameters
    -----------
    text : <str or list(str)> for single or batch prediction
    model_path : <str> must contain labels.csv (I've put one in the uploaded version)
            AND all model files (config.json, pytorch_model.bin, special_tokens_map.json, tokenizer_config.json, vocab.txt)

    Returns
    ---------
    dict: label -> probability, if type(text) == str
    list: a list of such dicts, if type(text) == list or numpy array

    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)
    # predictor.to(device)

    if isinstance(text, str):
        # Single prediction
        pred = predictor.predict(text)
        pred = dict(pred)
        # single_prediction = predictor.predict("just get me result for this text")
    elif isinstance(text, list) or isinstance(text, np.ndarray):
        pred = predictor.predict_batch(text)
        # # Batch predictions
        # texts = [
        #     "this is the first text",
        #     "this is the second text"
        #     ]
        for i in range(len(pred)):
            pred[i] = dict(pred[i])

        # multiple_predictions = predictor.predict_batch(texts)
    else:
        raise ValueError("Unexpected type for input argument `text`")
    return pred
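A usage sketch for classify_bert; the model directory is hypothetical and must contain labels.csv plus the exported model files listed in the docstring:

single = classify_bert("A detective hunts a killer through the streets of Paris.",
                       "output/model_out")
print(single)  # dict mapping each genre label to its probability

batch = classify_bert(["first synopsis", "second synopsis"], "output/model_out")
print(batch)   # list of dicts, one per input text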
Example #5
from fast_bert.prediction import BertClassificationPredictor

LABEL_PATH = "label"

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=LABEL_PATH,  # location for labels.csv file
    multi_label=True,
    model_type='bert',
    do_lower_case=True)

# Single prediction
# single_prediction = predictor.predict("where to get food")
# print(single_prediction)

for i in range(10):
    text = input("Enter: ")
    if len(text) < 2: break
    single_prediction = predictor.predict(text)
    print(single_prediction)
    print()

# # Batch predictions
# texts = [
# 	"this is the first text",
# 	"this is the second text"
# 	]

# multiple_predictions = predictor.predict_batch(texts)

# print(multiple_predictions)
Example #6
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os, sys
import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
import pickle
import json

MODEL_PATH = 'output/model_out/'

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path='./',  # location for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

#multi prediction
test_data = pd.read_csv('dev.csv')

x = 0
for item in test_data.text:
    prediction = predictor.predict(item)
    # prediction is a list of (label, probability) tuples, highest probability first
    with open('predictions.tsv', 'a') as fp:
        fp.write(
            str(test_data.id[x]) + '\t' + prediction[0][0] + '\t' +
            str(prediction[0][1]) + '\n')
    x = x + 1
Example #7
from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path

DATA_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/data/')
LABEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/labels/')
MODEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/models/')
LOG_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/logs/')

# location for the pretrained BERT models
BERT_PRETRAINED_PATH = Path(
    '../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH,
                                        multi_label=False)

# Single prediction
single_prediction = predictor.predict("just get me result for this text")

# Batch predictions
texts = ["this is the first text", "this is the second text"]

multiple_predictions = predictor.predict_batch(texts)
Example #8
import random

import numpy as np
import torch
import torch.nn.functional as F
from tqdm import trange
from fairseq.models.bart import BARTModel
from transformers import (AutoModelForCausalLM, AutoModelForTokenClassification,
                          AutoModelWithLMHead, AutoTokenizer)
from fast_bert.prediction import BertClassificationPredictor


class Interaction():
    def __init__(self, args):
        self.gen_model_type = args['gen_model_type']
        self.gen_model_path = args['gen_model_path'].replace('"', '')
        self.conv_line_path = args['conv_line_path'].replace('"', '')
        self.gen_length = args['length']
        self.temperature = args['temperature']
        self.top_k = args['top_k']
        self.top_p = args['top_p']
        self.stop_token = args['stop_token']
        self.repetition_penalty = args['repetition_penalty']
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        #self.device = torch.device("cpu")
        self.gen_model_type = self.gen_model_type.lower()
        self.lookup = {
            '1': 'Fashion',
            '2': 'Politics',
            '3': 'Books',
            '4': 'Sports',
            '5': 'General Entertainment',
            '6': 'Music',
            '7': 'Science & Technology',
            '8': 'Movie',
            '9': 'General'
        }
        self.topic_cls = BertClassificationPredictor(
            model_path=args['topic_cls_path'].replace('"', ''),
            label_path=args['label_dir'].replace(
                '"', ''),  #sys.argv[2], # directory for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=True)

        self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
            "dbmdz/bert-large-cased-finetuned-conll03-english")
        #self.entity_ext_model.to(self.device)
        self.entity_ext_model.to('cpu')
        self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-cased")

        if self.gen_model_type == 'dialogpt':
            self.gen_tokenizer = AutoTokenizer.from_pretrained(
                self.gen_model_path)
            self.gen_model = AutoModelWithLMHead.from_pretrained(
                self.gen_model_path)
            self.gen_model.cuda()
            self.gen_model.eval()
        elif self.gen_model_type == 'bart':
            self.gen_model = BARTModel.from_pretrained(
                self.gen_model_path,
                checkpoint_file='checkpoint_best.pt',
                data_name_or_path=self.gen_model_path)
            self.gen_model.cuda()
            self.gen_model.eval()

        self.conv_line = BARTModel.from_pretrained(
            self.conv_line_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.conv_line_path)
        self.conv_line.cuda()
        self.conv_line.eval()
        self.baseline_tokenizer = AutoTokenizer.from_pretrained(
            args['baseline'])
        self.baseline_model = AutoModelForCausalLM.from_pretrained(
            args['baseline'])
        #self.baseline_model.to('cpu')
        self.baseline_model.cuda()
        self.baseline_model.eval()

    def baseline_decode(self, user_utt):
        print('baseline decode')
        print(user_utt)
        #new_user_input_ids = self.baseline_tokenizer.encode(
        #    user_utt + self.baseline_tokenizer.eos_token, return_tensors='pt', max_length=128).to('cpu')
        new_user_input_ids = self.baseline_tokenizer.encode(
            user_utt + self.baseline_tokenizer.eos_token,
            return_tensors='pt',
            max_length=128).cuda()
        if user_utt == "BEGIN":
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            utterance = utterance.replace('BEGIN', '').strip()
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[0]
        else:
            np.random.seed(4)
            torch.manual_seed(4)
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[1].strip()

        return utterance

    def get_topic(self, utterance):
        '''
        this method calls the topic classifier and returns the utterance's topic
        '''
        print('topic input:', utterance)
        topic = self.lookup[self.topic_cls.predict(utterance)[0][0]]
        print('predict topic:', topic)
        return topic

    def get_entities(self, utterance):
        '''
        this method calls the entity extractor model and returns the utterance's entities
        '''
        entities = ''

        label_list = [
            "O",  # Outside of a named entity
            "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
            "I-MISC",  # Miscellaneous entity
            "B-PER",  # Beginning of a person's name right after another person's name
            "I-PER",  # Person's name
            "B-ORG",  # Beginning of an organisation right after another organisation
            "I-ORG",  # Organisation
            "B-LOC",  # Beginning of a location right after another location
            "I-LOC"  # Location
        ]

        # Bit of a hack to get the tokens with the special tokens
        tokens = self.entity_ext_tokenizer.tokenize(
            self.entity_ext_tokenizer.decode(
                self.entity_ext_tokenizer.encode(utterance)))
        inputs = self.entity_ext_tokenizer.encode(
            utterance, return_tensors="pt").to('cpu')

        outputs = self.entity_ext_model(inputs)[0]
        predictions = torch.argmax(outputs, dim=2)

        entity = [(token, label_list[prediction])
                  for token, prediction in zip(tokens, predictions[0].tolist())
                  ]

        # delete '##' before tokens
        r = []
        r_tags = []
        for i, tpl in enumerate(entity):
            if tpl[0].startswith("##"):
                if r:
                    r[-1] += tpl[0][2:]
            else:
                r.append(tpl[0])
                r_tags.append(tpl[1])

        new_entity_token = [(i, j) for i, j in zip(r, r_tags)]

        # combine tokens into entities
        flag = False
        entities = []
        ent_tags = []
        for i, tpl in enumerate(new_entity_token):
            if tpl[1] == "O":
                flag = False
                continue
            elif tpl[1] == "I-MISC" or tpl[1] == "I-PER" or tpl[
                    1] == "I-ORG" or tpl[1] == "I-LOC":
                if flag == False:
                    flag = True
                    entities.append(tpl[0])
                    ent_tags.append(tpl[1])
                else:
                    entities[-1] += ' '
                    entities[-1] += tpl[0]
            elif tpl[1] == "B-MISC" or tpl[1] == "B-PER" or tpl[
                    1] == "B-ORG" or tpl[1] == "B-LOC":
                entities.append(tpl[0])
                ent_tags.append(tpl[1])

        return entities

    def get_response_keywords(self,
                              utterance,
                              topic,
                              entities,
                              randomness=False):
        '''
        this method calls the conv_line model and returns response keywords 
        '''
        entities_comb = ' # '.join(entities)
        input_conv = topic + ' <EOT> ' + utterance + ' <A0> ' + entities_comb + '<A1>'
        print('input to conv_line')
        print(input_conv)
        if randomness == False:
            np.random.seed(4)
            torch.manual_seed(4)
        elif randomness == True:
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
        maxb = 30  #Can be customized
        minb = 7  #Can be customized
        response = ''
        slines = [input_conv]
        with torch.no_grad():
            #hypotheses = self.conv_line.sample(slines, beam=4, lenpen=2.0, no_repeat_ngram_size=3)
            hypotheses = self.conv_line.sample(slines,
                                               sampling=True,
                                               sampling_topk=5,
                                               temperature=0.7,
                                               lenpen=2.0,
                                               max_len_b=maxb,
                                               min_len=minb,
                                               no_repeat_ngram_size=3)
        hypotheses = hypotheses[0]
        print('keywords hypotheses:', hypotheses)
        response = hypotheses.replace('\n', '')
        keywords = response.replace('<V>', '').replace('<s>', '').split('#')
        print('keywords keywords:', keywords)
        k = []
        for keyword in keywords:
            keyword = keyword.strip()
            k.append(keyword)
        print('keywords k:', k)
        keywords = k

        return keywords

    def top_k_top_p_filtering(self, logits, filter_value=-float('Inf')):
        """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
            Args:
                logits: logits distribution shape (batch size x vocabulary size)
                top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                    Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        top_k = min(self.top_k, logits.size(-1))  # Safety check
        if top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
                                                                      None]
            logits[indices_to_remove] = filter_value

        if self.top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1),
                                            dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > self.top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                ..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # scatter sorted tensors to original indexing
            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=1, index=sorted_indices, src=sorted_indices_to_remove)
            logits[indices_to_remove] = filter_value
        return logits

    def sample_sequence(self, model, context):
        context = torch.tensor(context, dtype=torch.long, device=self.device)
        context = context.unsqueeze(0).repeat(1, 1)
        generated = context
        model.cuda()
        with torch.no_grad():
            for _ in trange(self.gen_length):
                inputs = {'input_ids': generated}
                outputs = model(
                    **inputs
                )  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
                next_token_logits = outputs[0][:, -1, :] / (
                    self.temperature if self.temperature > 0 else 1.)

                # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
                for i in range(1):
                    for _ in set(generated[i].tolist()):
                        next_token_logits[i, _] /= self.repetition_penalty

                filtered_logits = self.top_k_top_p_filtering(next_token_logits)
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)
        return generated

    def get_response(self, user_utterance, user_utt_topic, res_keywords):
        '''
        this method calls the dial_gen model and returns generated utterance 
        '''
        res_keywords = ' # '.join(res_keywords)
        input_dial_gen = (user_utt_topic.strip() + ' <EOT> ' +
                          user_utterance.strip() + ' <V> ' + res_keywords.strip())
        print('input to dialog generation module')
        print(input_dial_gen)
        if self.gen_model_type == 'dialogpt':
            context_tokens = self.gen_tokenizer.encode(
                input_dial_gen, add_special_tokens=False)
            out = self.sample_sequence(model=self.gen_model,
                                       context=context_tokens)
            out = out[:, len(context_tokens):].tolist()
            response = self.gen_tokenizer.decode(
                out[0], clean_up_tokenization_spaces=True)
            response = response[:response.find('\n') if self.stop_token else None]
        elif self.gen_model_type == 'bart':
            np.random.seed(4)
            torch.manual_seed(4)
            maxb = 128  #Can be customized
            minb = 15  #Can be customized
            response = ''
            slines = [input_dial_gen]
            with torch.no_grad():
                hypotheses = self.gen_model.sample(
                    slines,
                    sampling=True,
                    sampling_topk=self.top_k,
                    temperature=self.temperature,
                    lenpen=2.0,
                    max_len_b=maxb,
                    min_len=minb,
                    no_repeat_ngram_size=3)
            hypotheses = hypotheses[0]
            response = hypotheses.replace('\n', '')
        return response
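A hypothetical configuration for the Interaction class above, inferred from the keys read in __init__; every path and value is a placeholder and a CUDA device is assumed:

args = {
    'gen_model_type': 'bart',
    'gen_model_path': '/path/to/dialog_generation_bart',
    'conv_line_path': '/path/to/conv_line_bart',
    'length': 60,
    'temperature': 0.7,
    'top_k': 10,
    'top_p': 0.7,
    'stop_token': '\n',
    'repetition_penalty': 1.2,
    'topic_cls_path': '/path/to/topic_classifier',
    'label_dir': '/path/to/topic_labels',
    'baseline': 'microsoft/DialoGPT-medium',
}
bot = Interaction(args)
print(bot.get_topic("Did you watch the new sci-fi movie last night?"))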