Python BertClassificationPredictor 예제들, fast_bert.prediction.BertClassificationPredictor Python 예제들

예제 #1

0

파일 보기

파일: main.py 프로젝트: mrquiroz/bert-espa-ol

from flask import Flask
import pandas as pd
from fast_bert.prediction import BertClassificationPredictor
from flask import Flask, jsonify, request
import re

# Flask application; configuration is read from this module's own
# upper-case attributes (e.g. MODEL_PATH below).
app = Flask(__name__)
app.config.from_object(__name__)

# Directory holding the fine-tuned model files.
MODEL_PATH = 'model/'

# The predictor is built once at import time and shared by every request.
# NOTE(review): label_path is an empty string here — presumably labels.csv is
# expected in the working directory; confirm against the deployment layout.
predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        label_path='',
                                        multi_label=True,
                                        use_fast_tokenizer=False,
                                        model_type='bert',
                                        do_lower_case=False)


@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text in the POST body and return the top two labels.

    The raw request body is decoded, every run of non-word characters is
    collapsed into a single space, and the text that follows the first
    ``'bertmedicalstring '`` marker (up to the next marker, if any) is passed
    to the predictor.

    Returns:
        A JSON object with ``Clase1``/``Puntaje1`` and ``Clase2``/``Puntaje2``
        taken from the first two entries of the prediction, or a JSON error
        body with HTTP status 400 when the marker is missing from the request.
    """
    # UTF-8 decodes everything the previous ASCII decode accepted and also
    # handles non-ASCII bytes, which used to raise UnicodeDecodeError.
    texto = request.data.decode('utf-8', errors='replace')
    s = re.sub(r'\W+', ' ', texto)

    partes = s.split('bertmedicalstring ')
    if len(partes) < 2:
        # Without the marker there is nothing to classify; answer with a
        # client error instead of crashing with IndexError.
        return jsonify({'error': "missing 'bertmedicalstring ' marker"}), 400

    respuesta = predictor.predict(partes[1])
    return jsonify({
        'Clase1': respuesta[0][0],
        'Puntaje1': respuesta[0][1],
        'Clase2': respuesta[1][0],
        'Puntaje2': respuesta[1][1]
    })

예제 #2

0

파일 보기

파일: deploy.py 프로젝트: jeffreypaul15/reddit_flair_prediction

from fast_bert.prediction import BertClassificationPredictor
import praw
from flask import Flask, render_template
from flask import request
import json
app = Flask(__name__)

MODEL_PATH = 'model_out'  # location for model_out folder

# Single-label predictor, built once at import time.
predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path='',  # location for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

# NOTE(review): client_id and client_secret are hard-coded in source. These
# look like real API credentials; they should be rotated and loaded from the
# environment or a secrets store instead of being committed.
reddit = praw.Reddit(client_id='LQateSKqomx_7A',
                     client_secret='7PIXWoFKM8HZQ7RRNGo5p1ka18s',
                     password='******',
                     user_agent='reddit',
                     username='******')


@app.route('/')
def hello_world():
    """Render and return the ``index.html`` template for the root URL."""
    landing_page = render_template("index.html")
    return landing_page


@app.route('/automated_testing', methods=['GET', 'POST'])
def test_file():

예제 #3

0

파일 보기

파일: predict.py 프로젝트: LaverdeS/Introduction_NLP

    parser.add_argument('--file_out', type=str, default='')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    # Script entry point: load the trained model, classify every row of the
    # input CSV and write one prediction per line to ./predictions/<file_out>.
    args = get_args()
    print(args.file_in)

    OUTPUT_DIR = './output/'
    MODEL_PATH = OUTPUT_DIR + args.model_path + '/model_out'
    LABEL_PATH = './'

    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    # The input file is resolved relative to LABEL_PATH (the working dir).
    df = pd.read_csv(LABEL_PATH + args.file_in, encoding='utf-8')
    texts = df.text.tolist()
    multiple_predictions = predictor.predict_batch(texts)

    # Write with the same encoding the input was read with, so non-ASCII
    # labels do not depend on the platform's default encoding. An f-string is
    # used because '%s' % item raises TypeError when item is a tuple.
    with open('./predictions/' + args.file_out, 'w',
              encoding='utf-8') as filehandle:
        filehandle.writelines(
            f'{listitem}\n' for listitem in multiple_predictions)

예제 #4

0

파일 보기

파일: extract_doc_scores.py 프로젝트: dpappas/pytorch_pacrr_and_posit_drmm

from fast_bert.prediction import BertClassificationPredictor
from pathlib import Path

# Filesystem layout of the fine-tuned document re-ranking model.
DATA_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/data/')
LABEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/labels/')
MODEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/models/')
LOG_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/logs/')

# location for the pretrained BERT models
BERT_PRETRAINED_PATH = Path(
    '../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH,
                                        multi_label=False)

# Single prediction: predict() takes one string.
single_prediction = predictor.predict("just get me result for this text")

# Batch predictions: a list of texts must go through predict_batch();
# predict() is the single-text entry point.
texts = ["this is the first text", "this is the second text"]

multiple_predictions = predictor.predict_batch(texts)

예제 #5

0

파일 보기

파일: test_report.py 프로젝트: LaverdeS/Introduction_NLP

        'AVERAGE_2': {
            'precission': 0.0,
            'recall': 0.0,
            'f1': 0.0
        },
        'micro': {
            'precission': 0.0,
            'recall': 0.0,
            'f1': 0.0
        }
        #'micro_2': {'precission': 0.0, 'recall': 0.0, 'f1': 0.0}
    }

    predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                            label_path=LABEL_PATH,
                                            multi_label=False,
                                            model_type='bert',
                                            do_lower_case=False)

    if args.file_in[-3:] == 'csv':
        df_in = pd.read_csv(args.file_in, encoding='utf-8')
        truth_file = args.truth
        generate_metrics_report(df_in, truth_file, name=args.model_path)
    else:
        *_, truth_files = list(next(os.walk(args.truth)))
        *_, test_files = list(next(os.walk(args.file_in)))
        print(f'lenght of truth_files: {len(truth_files)}')
        print(f'lenght of test_files: {len(test_files)}')
        print(f'tests: {test_files}, \ntruth: {truth_files}')

        TEST_REPORT = pd.DataFrame()

예제 #6

0

파일 보기

# Check how many GPUs are available and record whether there is more than one
device = torch.device('cuda')
args.multi_gpu = torch.cuda.device_count() > 1

# Choose the label set that matches the training granularity
if train_for == 'sentence':
    label_cols = sentence_labels
else:
    label_cols = fine_grained_labels

# Build the prediction model from the trained output directory
predictor = BertClassificationPredictor(
    model_path=args.output_dir / 'model_out',
    label_path=LABEL_PATH,
    multi_label=True,
    model_type=args.model_type,
    do_lower_case=True)

# Load the test texts and run batch prediction on them
output = predictor.predict_batch(
    list(
        pd.read_csv(str(
            DATA_PATH.joinpath('test.csv').absolute()))['text'].values))

# Write the prediction results to disk
output_csv = str(DATA_PATH.joinpath('output_bert.csv').absolute())
pd.DataFrame(output).to_csv(output_csv)

# Read the prediction results back in
results = pd.read_csv(output_csv)

예제 #7

0

파일 보기

                    default=True)
parser.add_argument(
    '--dataset',
    type=str,
    help=
    'which dataset is used in Alexa topical dataset for testing, options can be train, valid_rare, valida_freq, test_freq, test_rare',
    required=True,
    choices=['train', 'valid_rare', 'valid_freq', 'test_freq', 'test_rare'])
args = parser.parse_args()
MODEL_DIR = args.model_dir  #sys.argv[1]

MODEL_PATH = path.join(MODEL_DIR, 'model_out')

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=args.label_dir,  #sys.argv[2], # directory for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=True)
INPUT = os.path.join('data', args.input_name + '.csv')
texts = list(csv.reader(open(INPUT, 'rt')))  # sys.argv[3]
batchsize = args.batch_size
multiple_predictions = []
for i in tqdm(range(1, len(texts), batchsize)):
    batch_texts = []
    if i + batchsize > len(texts):
        for j in range(i, len(texts)):
            batch_texts.append(texts[j][0])
        tmp_pred = predictor.predict_batch(batch_texts)
        multiple_predictions.extend(tmp_pred)
    else:
        for j in range(i, i + batchsize):

예제 #8

0

파일 보기

def emotion_evaluation(path, arc_path=None, binarized=True, method=None):
    """
    Evaluate the emotions of generated stories after training has finished.

    Loads a cached emotion-score array if one exists; otherwise reads the
    tab-separated file at ``path`` (story text in the second column), splits
    each story into three segments, classifies them and saves the scores.
    Depending on the arguments it then adds classifier-probability,
    arc/segment accuracy and comet-based scores to ``metrics``.

    Args:
        path: tab-separated file with the generated stories.
        arc_path: optional file with one whitespace-separated emotion arc per
            line; required by the ``binarized`` and comet sections.
        binarized: when true, compute arc and segment accuracy from the
            arg-max of the classifier scores.
        method: not used anywhere in this function.

    Returns:
        The ``metrics`` mapping after the updates above.

    NOTE(review): ``metrics`` is never assigned in this function — presumably
    a module-level dict; verify it exists before calling.
    """

    #load emotion classifier
    LABEL_PATH = "emotion_classifier/"
    MODEL_PATH = "emotion_classifier/checkpoint/bert/model_out/"

    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=True)

    # load and process generated file[]
    # NOTE(review): the cache that is checked/loaded here
    # ("..._rl_fine.npy") is not the file written further down
    # ("..._rl_base_k40.npy"), so this function never reuses its own output —
    # confirm whether that is intentional.
    if os.path.exists("numpy_files_v3/generated_em_dist_rl_fine.npy"):
        print("Loading computed emotion dist for generated stories...")
        generated_emotion_scores = np.load(
            "numpy_files_v3/generated_em_dist_rl_fine.npy")
    else:
        print("Start loading and processing generated stories...")
        _all_text = []
        with open(path) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            for row in reader:
                # trim prefix context and suffix EOS
                # (str.strip removes any leading/trailing ' ' and '|'
                # characters, not the literal " | " sequence)
                txt = row[1].strip(" | ")
                ind = txt.find(" <|endoftext|>")
                txt = txt[:ind] if ind != -1 else txt
                _all_text.append(txt)

        clf_input = []
        comet_input = []
        for txt in _all_text:
            sample_story = nltk.sent_tokenize(txt)  #should be list of len 5
            # comet gets the untouched sentence list; the classifier input
            # below is normalised to exactly three segments.
            comet_input.append(sample_story)
            # for some reason the model rarely generates not exactly 5 sentences
            if len(sample_story) == 0:
                sample_story = ["", "", ""]
            elif len(sample_story) > 5:
                # first sentence / sentences 2-4 joined / fifth sentence;
                # anything after the fifth sentence is dropped
                sample_story = [
                    sample_story[0],
                    ' '.join(sample_story[j] for j in range(1, 4)),
                    sample_story[4]
                ]
            elif len(sample_story) > 1:
                # first sentence / middle sentences joined / last sentence
                sample_story = [
                    sample_story[0], ' '.join(j for j in sample_story[1:-1]),
                    sample_story[-1]
                ]
            else:
                # a single sentence is repeated for all three segments
                sample_story = [
                    sample_story[0], sample_story[0], sample_story[0]
                ]

            clf_input.append(sample_story[:5])

        print("Start classifying generated stories...")
        generated_emotion_scores = get_emotion_dist(
            predictor, clf_input,
            preprint=False)  # np array (data_size, 3 * 5)
        np.save("numpy_files_v3/generated_em_dist_rl_base_k40.npy",
                generated_emotion_scores)
        print("Classification finished !")

    if arc_path is not None:
        # NOTE(review): the file object opened here is never closed explicitly.
        test_arc = [i.strip().split() for i in open(arc_path)]
        print("Start computing emotion probability score")
        emo_prob_score = get_emotion_prob(generated_emotion_scores,
                                          test_arc,
                                          batch_normalize=True)
        print("clf_prob score: ", emo_prob_score)
        metrics.update({"classifier probablity score: ": emo_prob_score})

    if binarized:
        # NOTE(review): test_arc is only bound when arc_path is not None, and
        # arc_path is sliced below; with the defaults (arc_path=None,
        # binarized=True) this section raises.
        data_size = len(test_arc)
        generated_emotion_scores = np.reshape(generated_emotion_scores,
                                              (data_size, 3, -1))

        # one-hot of the arg-max along the last axis (ties mark every maximum)
        generated_emotion_scores_bn = (generated_emotion_scores.max(
            axis=-1, keepdims=1) == generated_emotion_scores).astype(float)

        # The one-hot ground truth is cached next to the arc file, with the
        # last four characters of arc_path replaced by ".npy".
        if os.path.exists(arc_path[:-4] + ".npy"):
            true_emotion_scores_bn = np.load(arc_path[:-4] + ".npy")
        else:
            true_emotion_scores_bn = np.zeros_like(generated_emotion_scores)
            assert (generated_emotion_scores.shape[:2] == (len(test_arc),
                                                           len(test_arc[0])))
            for i in range(true_emotion_scores_bn.shape[0]):
                for j in range(true_emotion_scores_bn.shape[1]):
                    true_emotion_scores_bn[i][j][EMOTION_MAP[test_arc[i]
                                                             [j]]] = 1.0

            np.save(arc_path[:-4] + ".npy", true_emotion_scores_bn)

        arc_emotion_accuracy, seg_emotion_accuracy = compute_emotion_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn)
        print("arc_emotion_accuracy: {}\n segment_emotion_accuracy: {}".format(
            arc_emotion_accuracy, seg_emotion_accuracy))
        metrics.update({
            "arc_acc": arc_emotion_accuracy,
            "segment_acc": seg_emotion_accuracy
        })

        # NOTE(review): lower-case label_path is not defined in this function
        # (the local is LABEL_PATH) — presumably a module-level name; verify.
        dic_dir = os.path.dirname(label_path)
        per_arc_accuracy = compute_per_arc_accuracy(
            generated_emotion_scores_bn, true_emotion_scores_bn, dic_dir)
        metrics.update(per_arc_accuracy)

    # compute comet-based emotion evaluation metric (Ec-Em)
    if arc_path is not None:
        # NOTE(review): comet_input is only built in the non-cached branch
        # above; when the scores were loaded from the cache this section
        # raises NameError. The arc file is also read a second time here.
        test_arc_file = [i.strip().split() for i in open(arc_path)]
        print("Start generating comet inferences ...")
        comet_prediction = get_comet_prediction(comet_input)
        print("Finished generating comet inferences ...")
        comet_score = compute_edit_distance(comet_prediction,
                                            test_arc_file,
                                            batch_normalize=True)
        print("comet score: {}".format(comet_score))
        metrics.update({"comet_score: ": comet_score})

    return metrics

예제 #9

0

파일 보기

    #     if eos_token_id is not None:
    #         input_ids[i] = input_ids[i] + [eos_token_id]

    length = [len(ids) for ids in input_ids]
    max_length = max(length)
    for i in range(len(input_ids)):
        while len(input_ids[i]) < max_length:
            input_ids[i].append(eos_token_id)

    return np.array(input_ids), np.array(length)


# Load the trained emotion classifier. This runs at import time, so
# FLAGS.clf_output_dir and FLAGS.clf_label_dir are read as soon as the module
# is loaded.
predictor = BertClassificationPredictor(
    model_path=FLAGS.clf_output_dir,
    label_path=FLAGS.clf_label_dir,  # location for labels.csv file
    multi_label=True,
    model_type='bert',
    do_lower_case=True)


def main(_):
    """
    Builds the model and runs
    """
    if FLAGS.distributed:
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)

    if len(config_train.name) > 0:

예제 #10

0

파일 보기

    def __init__(self, args):
        """Load every model used by the dialogue pipeline.

        Args:
            args: mapping providing the keys read below: ``gen_model_type``,
                ``gen_model_path``, ``conv_line_path``, ``length``,
                ``temperature``, ``top_k``, ``top_p``, ``stop_token``,
                ``repetition_penalty``, ``topic_cls_path``, ``label_dir`` and
                ``baseline``. Literal double quotes are stripped from the
                path-like values.
        """
        self.gen_model_type = args['gen_model_type'].lower()
        self.gen_model_path = args['gen_model_path'].replace('"', '')
        self.conv_line_path = args['conv_line_path'].replace('"', '')
        self.gen_length = args['length']
        self.temperature = args['temperature']
        self.top_k = args['top_k']
        self.top_p = args['top_p']
        self.stop_token = args['stop_token']
        self.repetition_penalty = args['repetition_penalty']
        # Generation models follow this device, so the class can also be
        # constructed on a machine without CUDA.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Maps the topic classifier's label ids to readable topic names.
        self.lookup = {
            '1': 'Fashion',
            '2': 'Politics',
            '3': 'Books',
            '4': 'Sports',
            '5': 'General Entertainment',
            '6': 'Music',
            '7': 'Science & Technology',
            '8': 'Movie',
            '9': 'General'
        }
        self.topic_cls = BertClassificationPredictor(
            model_path=args['topic_cls_path'].replace('"', ''),
            label_path=args['label_dir'].replace(
                '"', ''),  # directory for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=True)

        # The entity extractor is pinned to the CPU regardless of self.device.
        self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
            "dbmdz/bert-large-cased-finetuned-conll03-english")
        self.entity_ext_model.to('cpu')
        self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-cased")

        # Response generator; any other gen_model_type leaves gen_model unset.
        if self.gen_model_type == 'dialogpt':
            self.gen_tokenizer = AutoTokenizer.from_pretrained(
                self.gen_model_path)
            self.gen_model = AutoModelWithLMHead.from_pretrained(
                self.gen_model_path)
            self.gen_model.to(self.device)
            self.gen_model.eval()
        elif self.gen_model_type == 'bart':
            self.gen_model = BARTModel.from_pretrained(
                self.gen_model_path,
                checkpoint_file='checkpoint_best.pt',
                data_name_or_path=self.gen_model_path)
            self.gen_model.to(self.device)
            self.gen_model.eval()

        # Keyword ("conversation line") generator.
        self.conv_line = BARTModel.from_pretrained(
            self.conv_line_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.conv_line_path)
        self.conv_line.to(self.device)
        self.conv_line.eval()

        # Baseline causal language model used by baseline_decode.
        self.baseline_tokenizer = AutoTokenizer.from_pretrained(
            args['baseline'])
        self.baseline_model = AutoModelForCausalLM.from_pretrained(
            args['baseline'])
        self.baseline_model.to(self.device)
        self.baseline_model.eval()

예제 #11

0

파일 보기

class Interaction():
    def __init__(self, args):
        """Load every model used by the dialogue pipeline.

        Args:
            args: mapping providing the keys read below: ``gen_model_type``,
                ``gen_model_path``, ``conv_line_path``, ``length``,
                ``temperature``, ``top_k``, ``top_p``, ``stop_token``,
                ``repetition_penalty``, ``topic_cls_path``, ``label_dir`` and
                ``baseline``. Literal double quotes are stripped from the
                path-like values.
        """
        self.gen_model_type = args['gen_model_type'].lower()
        self.gen_model_path = args['gen_model_path'].replace('"', '')
        self.conv_line_path = args['conv_line_path'].replace('"', '')
        self.gen_length = args['length']
        self.temperature = args['temperature']
        self.top_k = args['top_k']
        self.top_p = args['top_p']
        self.stop_token = args['stop_token']
        self.repetition_penalty = args['repetition_penalty']
        # Generation models follow this device, so the class can also be
        # constructed on a machine without CUDA.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # Maps the topic classifier's label ids to readable topic names.
        self.lookup = {
            '1': 'Fashion',
            '2': 'Politics',
            '3': 'Books',
            '4': 'Sports',
            '5': 'General Entertainment',
            '6': 'Music',
            '7': 'Science & Technology',
            '8': 'Movie',
            '9': 'General'
        }
        self.topic_cls = BertClassificationPredictor(
            model_path=args['topic_cls_path'].replace('"', ''),
            label_path=args['label_dir'].replace(
                '"', ''),  # directory for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=True)

        # The entity extractor is pinned to the CPU regardless of self.device.
        self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
            "dbmdz/bert-large-cased-finetuned-conll03-english")
        self.entity_ext_model.to('cpu')
        self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-cased")

        # Response generator; any other gen_model_type leaves gen_model unset.
        if self.gen_model_type == 'dialogpt':
            self.gen_tokenizer = AutoTokenizer.from_pretrained(
                self.gen_model_path)
            self.gen_model = AutoModelWithLMHead.from_pretrained(
                self.gen_model_path)
            self.gen_model.to(self.device)
            self.gen_model.eval()
        elif self.gen_model_type == 'bart':
            self.gen_model = BARTModel.from_pretrained(
                self.gen_model_path,
                checkpoint_file='checkpoint_best.pt',
                data_name_or_path=self.gen_model_path)
            self.gen_model.to(self.device)
            self.gen_model.eval()

        # Keyword ("conversation line") generator.
        self.conv_line = BARTModel.from_pretrained(
            self.conv_line_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.conv_line_path)
        self.conv_line.to(self.device)
        self.conv_line.eval()

        # Baseline causal language model used by baseline_decode.
        self.baseline_tokenizer = AutoTokenizer.from_pretrained(
            args['baseline'])
        self.baseline_model = AutoModelForCausalLM.from_pretrained(
            args['baseline'])
        self.baseline_model.to(self.device)
        self.baseline_model.eval()

    def baseline_decode(self, user_utt):
        print('baseline decode')
        print(user_utt)
        #new_user_input_ids = self.baseline_tokenizer.encode(
        #    user_utt + self.baseline_tokenizer.eos_token, return_tensors='pt', max_length=128).to('cpu')
        new_user_input_ids = self.baseline_tokenizer.encode(
            user_utt + self.baseline_tokenizer.eos_token,
            return_tensors='pt',
            max_length=128).cuda()
        if user_utt == "BEGIN":
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            utterance = utterance.replace('BEGIN', '').strip()
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[0]
        else:
            np.random.seed(4)
            torch.manual_seed(4)
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[1].strip()

        return utterance

    def get_topic(self, utterance):
        '''
        this method calls the topic cls and returns utterace's topic
        '''
        print('topic input:', utterance)
        topic = self.lookup[self.topic_cls.predict(utterance)[0][0]]
        print('predict topic:', topic)
        return topic

    def get_entities(self, utterance):
        '''
        this method calls the entity extractor model and returns utterace's entities
        '''
        entities = ''

        label_list = [
            "O",  # Outside of a named entity
            "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
            "I-MISC",  # Miscellaneous entity
            "B-PER",  # Beginning of a person's name right after another person's name
            "I-PER",  # Person's name
            "B-ORG",  # Beginning of an organisation right after another organisation
            "I-ORG",  # Organisation
            "B-LOC",  # Beginning of a location right after another location
            "I-LOC"  # Location
        ]

        # Bit of a hack to get the tokens with the special tokens
        tokens = self.entity_ext_tokenizer.tokenize(
            self.entity_ext_tokenizer.decode(
                self.entity_ext_tokenizer.encode(utterance)))
        inputs = self.entity_ext_tokenizer.encode(
            utterance, return_tensors="pt").to('cpu')

        outputs = self.entity_ext_model(inputs)[0]
        predictions = torch.argmax(outputs, dim=2)

        entity = [(token, label_list[prediction])
                  for token, prediction in zip(tokens, predictions[0].tolist())
                  ]

        # delete '##' before tokens
        r = []
        r_tags = []
        for i, tpl in enumerate(entity):
            if tpl[0].startswith("##"):
                if r:
                    r[-1] += tpl[0][2:]
            else:
                r.append(tpl[0])
                r_tags.append(tpl[1])

        new_entity_token = [(i, j) for i, j in zip(r, r_tags)]

        # combine tokens into entities
        flag = False
        entities = []
        ent_tags = []
        for i, tpl in enumerate(new_entity_token):
            if tpl[1] == "O":
                flag = False
                continue
            elif tpl[1] == "I-MISC" or tpl[1] == "I-PER" or tpl[
                    1] == "I-ORG" or tpl[1] == "I-LOC":
                if flag == False:
                    flag = True
                    entities.append(tpl[0])
                    ent_tags.append(tpl[1])
                else:
                    entities[-1] += ' '
                    entities[-1] += tpl[0]
            elif tpl[1] == "B-MISC" or tpl[1] == "B-PER" or tpl[
                    1] == "B-ORG" or tpl[1] == "B-LOC":
                entities.append(tpl[0])
                ent_tags.append(tpl[1])

        return entities

    def get_response_keywords(self,
                              utterance,
                              topic,
                              entities,
                              randomness=False):
        '''
        this method calls the conv_line model and returns response keywords 
        '''
        entities_comb = ' # '.join(entities)
        input_conv = topic + ' <EOT> ' + utterance + ' <A0> ' + entities_comb + '<A1>'
        '''
        this method calls the conv_line model and returns response keywords 
        '''
        print('input to conv_line')
        print(input_conv)
        if randomness == False:
            np.random.seed(4)
            torch.manual_seed(4)
        elif randomness == True:
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
        maxb = 30  #Can be customized
        minb = 7  #Can be customized
        response = ''
        slines = [input_conv]
        with torch.no_grad():
            #hypotheses = self.conv_line.sample(slines, beam=4, lenpen=2.0, no_repeat_ngram_size=3)
            hypotheses = self.conv_line.sample(slines,
                                               sampling=True,
                                               sampling_topk=5,
                                               temperature=0.7,
                                               lenpen=2.0,
                                               max_len_b=maxb,
                                               min_len=minb,
                                               no_repeat_ngram_size=3)
        hypotheses = hypotheses[0]
        print('keywords hypotheses:', hypotheses)
        response = hypotheses.replace('\n', '')
        keywords = response.replace('<V>', '').replace('<s>', '').split('#')
        print('keywords keywords:', keywords)
        k = []
        for keyword in keywords:
            keyword = keyword.strip()
            k.append(keyword)
        print('keywords k:', k)
        keywords = k

        return keywords

    def top_k_top_p_filtering(self, logits, filter_value=-float('Inf')):
        """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
            Args:
                logits: logits distribution shape (batch size x vocabulary size)
                top_k > 0: keep only top k tokens with highest probability (top-k filtering).
                top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                    Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        top_k = min(self.top_k, logits.size(-1))  # Safety check
        if top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1,
                                                                      None]
            logits[indices_to_remove] = filter_value

        if self.top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1),
                                            dim=-1)

            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > self.top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                ..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # scatter sorted tensors to original indexing
            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=1, index=sorted_indices, src=sorted_indices_to_remove)
            logits[indices_to_remove] = filter_value
        return logits

    def sample_sequence(self, model, context):
        """Sample ``self.gen_length`` tokens from *model* after *context*.

        Args:
            model: callable taking ``input_ids`` and returning a sequence
                whose first element holds the logits.
            context: list of token ids to condition on.

        Returns:
            A tensor with one row holding the context ids followed by the
            sampled ids.
        """
        context = torch.tensor(context, dtype=torch.long, device=self.device)
        generated = context.unsqueeze(0)  # add the batch dimension
        # Keep the model on the same device as the input ids; a hard-coded
        # .cuda() here would conflict with a CPU self.device.
        model.to(self.device)
        temperature = self.temperature if self.temperature > 0 else 1.
        with torch.no_grad():
            for _ in trange(self.gen_length):
                # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
                outputs = model(input_ids=generated)
                next_token_logits = outputs[0][:, -1, :] / temperature

                # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
                # applied to every token id that was already generated
                for token_id in set(generated[0].tolist()):
                    next_token_logits[0, token_id] /= self.repetition_penalty

                filtered_logits = self.top_k_top_p_filtering(next_token_logits)
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)
        return generated

    def get_response(self, user_utterance, user_utt_topic, res_keywords):
        '''
        this method calls the dial_gen model and returns generated utterance 
        '''
        res_keywords = ' # '.join(res_keywords)
        input_dial_gen = user_utt_topic.strip(
        ) + ' <EOT> ' + user_utterance.strip() + ' <V> ' + res_keywords.strip(
        )
        print('input to dialog generation module')
        print(input_dial_gen)
        if self.gen_model_type == 'dialogpt':
            context_tokens = self.gen_tokenizer.encode(
                input_dial_gen, add_special_tokens=False)
            out = self.sample_sequence(model=self.gen_model,
                                       context=context_tokens)
            out = out[:, len(context_tokens):].tolist()
            response = self.gen_tokenizer.decode(
                out[0], clean_up_tokenization_spaces=True)
            response = response[:response.find('\n') if self.
                                stop_token else None]
        elif self.gen_model_type == 'bart':
            np.random.seed(4)
            torch.manual_seed(4)
            maxb = 128  #Can be customized
            minb = 15  #Can be customized
            response = ''
            slines = [input_dial_gen]
            with torch.no_grad():
                hypotheses = self.gen_model.sample(
                    slines,
                    sampling=True,
                    sampling_topk=self.top_k,
                    temperature=self.temperature,
                    lenpen=2.0,
                    max_len_b=maxb,
                    min_len=minb,
                    no_repeat_ngram_size=3)
            hypotheses = hypotheses[0]
            response = hypotheses.replace('\n', '')
        return response

예제 #12

0

파일 보기

    return app


if __name__ == '__main__':
    # Script entry point: make sure the model weights exist locally
    # (downloading them from Google Cloud Storage if needed), then serve.
    path = 'models/model_out/pytorch_model.bin'
    bucket_path = 'https://storage.cloud.google.com/boast-trained-models/activity_classifier/pytorch_model.bin'

    # fetch model from google storage if not exist
    if bucket_path is not None and not os.path.exists(path):
        # set env key
        if 'GOOGLE_APPLICATION_CREDENTIALS' not in os.environ:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'gcp_auth.json'

        client = storage.Client()
        bucket = client.get_bucket('boast-trained-models')
        blob = bucket.get_blob('activity_classifier/pytorch_model.bin')
        if blob is None:
            # get_blob returns None for a missing object; fail with a clear
            # message instead of an AttributeError further down.
            raise FileNotFoundError(
                'activity_classifier/pytorch_model.bin not found in bucket '
                'boast-trained-models')

        print('Downloading model...')
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Download to a temporary name and rename on success, so an
        # interrupted download is not mistaken for a complete model later.
        partial_path = path + '.part'
        with open(partial_path, 'wb') as file_obj:
            blob.download_to_file(file_obj)
        os.replace(partial_path, path)

    predictor = BertClassificationPredictor(
        model_path='models/model_out',
        label_path='train',
        multi_label=False,
        model_type='distilbert',
        do_lower_case=True)

    serve(create_app(predictor), host='0.0.0.0', port=5000)

예제 #13

0

파일 보기

from fast_bert.prediction import BertClassificationPredictor
import pandas as pd
import csv
import json
import copy

# Multi-label predictor loaded from the fine-tuned model directory.
predictor = BertClassificationPredictor(
    model_type='bert',
    multi_label=True,
    label_path='./Data/labor/new_data',
    model_path='./Data/labor/data/model/keda/model_out')

# Classify every entry of the test set's 'text' column in one batch.
test_frame = pd.read_csv("./Data/labor/new_data/test.csv")
text_list = list(test_frame['text'].values)
output = predictor.predict_batch(text_list)

print(output)

예제 #14

0

파일 보기

def main(model_uri: Param("S3 uri with NLP model", str),
         data_uri: Param("S3 uri with input csv file", str), result_uri: Param(
             "S3 uri where to put output csv file with added \
                                inference columns",
             str), inference_columns: Param(
                 "text columns separated in the csv file on \
                        which inference will be run", str)):
    """Download a model archive and a CSV, run inference, upload the result.

    The model archive is extracted into ``/tmp/model``, whose
    ``model_config.json`` supplies ``model_type``, ``multi_label`` and
    ``do_lower_case``. For every requested column the non-null cells are
    classified and each predicted label becomes a new ``<column>_<label>``
    column. The CSV is rewritten in place and uploaded to ``result_uri``.

    Args:
        model_uri: URI of the gzipped tar archive holding the model.
        data_uri: URI of the input CSV file.
        result_uri: URI the augmented CSV is uploaded to.
        inference_columns: comma-separated names of the text columns.

    Exits with status 2 when a download fails or a requested column is
    missing, and with status 1 when extraction or CSV loading fails.
    """
    # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
    try:
        local_model = download_uri(model_uri)
    except Exception:
        print("Failed to download NLP model. Exiting...")
        sys.exit(2)

    try:
        local_csv = download_uri(data_uri)
    except Exception:
        print("Failed to download input csv file. Exiting...")
        sys.exit(2)

    model_dir = Path("/tmp/model")
    model_dir.mkdir(exist_ok=True)

    tar_proc = subprocess.run(
        ['tar', 'xzf', local_model, '-C', str(model_dir)],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)

    # Judge success by the exit status: tar may print warnings on stderr
    # while still extracting everything correctly.
    if tar_proc.returncode == 0:
        print("Model extacted sucessfully")
    else:
        print(tar_proc.stderr.decode('ascii', errors='replace'))
        print("Model extaction error. Exiting...")
        sys.exit(1)

    model_config = model_dir / 'model_config.json'
    with open(model_config) as f:
        config = json.load(f)

    print("Loading model")

    predictor = BertClassificationPredictor(
        model_path=str(model_dir / 'model_out'),
        label_path=str(model_dir),  # location for labels.csv file
        model_type=config['model_type'],
        multi_label=config['multi_label'],
        do_lower_case=config['do_lower_case'],
    )
    try:
        print("Loading input csv")
        df = pd.read_csv(local_csv)
    except Exception:
        print("Failed to load input csv file. Exiting...")
        sys.exit(1)

    # Validate every requested column before starting any inference.
    inference_columns = inference_columns.split(',')
    for c in inference_columns:
        if c not in df.columns:
            print(f"{c} is not a column name in input csv file. Exiting...")
            sys.exit(2)

    for c in inference_columns:

        print(f"Starting inference for {c} column")

        start = time.time()

        # Only non-null cells are classified; the same mask places the
        # results back on the rows they came from.
        has_text = ~df[c].isna()
        text = df.loc[has_text, c].tolist()

        predictions = predictor.predict_batch(text)
        result = pd.DataFrame(list(map(dict, predictions)))
        for r in result.columns:
            df.loc[has_text, f"{c}_{r}"] = result[r].tolist()

        print(f"Inference time for {len(text)} rows was {time.time() - start}")

    df.to_csv(local_csv, index=False)

    upload_uri(local_csv, result_uri)

    print("We are done with inference!")