import numpy as np
from fast_bert.prediction import BertClassificationPredictor


def get_sentence_label_array(text, label_list: list, predictor: BertClassificationPredictor):
    '''Take a single sentence and return its PIO classification scores as an array.'''
    ans_list = predictor.predict(text)
    ans_array = np.zeros(len(label_list))
    for ans in ans_list:
        label = ans[0]
        index = label_list.index(label)
        ans_array[index] = ans[1]
    return ans_array
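# A minimal usage sketch for get_sentence_label_array. The label list, model
# path, and sentence below are hypothetical placeholders, not from the
# original code.
label_list = ['P', 'I', 'O']
predictor = BertClassificationPredictor(
    model_path='output/model_out',  # hypothetical trained-model directory
    label_path='labels',            # directory containing labels.csv
    multi_label=True,
    model_type='bert',
    do_lower_case=True)
scores = get_sentence_label_array('Patients received a daily dose of aspirin.',
                                  label_list, predictor)
print(dict(zip(label_list, scores)))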
def predict(self, text):
    # Note: the predictor is rebuilt on every call; hoisting it into __init__
    # would avoid reloading the model each time.
    predictor = BertClassificationPredictor(
        model_path=self.in_dir + '/' + self.model_dir,
        label_path=self.in_dir + '/labels',  # location for labels.csv file
        multi_label=True,
        # model_type='xlnet',
        do_lower_case=True)
    # Keep only the seven highest-scoring labels from the ranked prediction list.
    prediction = predictor.predict(str(text))[:7]
    rst_list = []
    for i in range(len(prediction)):
        rst_list.append(" #" + str(prediction[i][0]))
    return rst_list
class SentimentAnalyzer(object):

    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
            model_path=model_path,
            label_path=label_path,  # location for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=False)
        self.preprocessor = TextPreprocessor()

    def predict_sentiment(self, tweet):
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        # Labels "0"/"4" appear to follow the Sentiment140 convention
        # (0 = negative, 4 = positive).
        for label, confidence in prediction:
            if label == "0" and confidence >= 0.7:
                return "Negative"
            if label == "4" and confidence >= 0.7:
                return "Positive"
        return "Neutral"

    def batch_predict_sentiment(self, tweets):
        processed_tweets = []
        for tweet in tweets:
            processed_tweets.append(self.preprocessor.process(tweet))
        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        results = []
        for prediction in predictions:
            label_to_prob = dict(prediction)
            if label_to_prob["0"] >= 0.7:
                results.append("Negative")
            elif label_to_prob["4"] >= 0.7:
                results.append("Positive")
            else:
                results.append("Neutral")
        return results
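# A minimal usage sketch for SentimentAnalyzer. The paths are hypothetical
# placeholders, and TextPreprocessor is assumed to be importable from the
# surrounding project.
analyzer = SentimentAnalyzer(model_path='output/model_out', label_path='labels')
print(analyzer.predict_sentiment("I love this phone!"))
print(analyzer.batch_predict_sentiment(["meh", "worst day of my life"]))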
import numpy as np
import torch
from fast_bert.prediction import BertClassificationPredictor


def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    fast-bert automatically uses the GPU if `torch.cuda.is_available() == True`.

    Parameters
    ----------
    text : str or list(str)
        Single-prediction or multi-prediction input.
    model_path : str
        Must contain labels.csv (I've put one in the uploaded version) AND all
        model files (config.json, pytorch_model.bin, special_tokens_map.json,
        tokenizer_config.json, vocab.txt).

    Returns
    -------
    dict : label -> probability, if type(text) == str
    list of dict : if type(text) == list or numpy array
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)
    # predictor.to(device)
    if isinstance(text, str):
        # Single prediction, e.g.
        # single_prediction = predictor.predict("just get me result for this text")
        pred = predictor.predict(text)
        pred = dict(pred)
    elif isinstance(text, (list, np.ndarray)):
        # Batch predictions, e.g.
        # texts = ["this is the first text", "this is the second text"]
        # multiple_predictions = predictor.predict_batch(texts)
        pred = predictor.predict_batch(text)
        for i in range(len(pred)):
            pred[i] = dict(pred[i])
    else:
        raise ValueError("Unexpected type for input argument `text`")
    return pred
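# A minimal usage sketch for classify_bert; the model path and texts are
# hypothetical placeholders.
single = classify_bert("A detective hunts a killer in 1940s Los Angeles.",
                       model_path="model_out/")   # -> dict of label: probability
batch = classify_bert(["first synopsis", "second synopsis"],
                      model_path="model_out/")    # -> list of dicts
print(single, batch)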
from fast_bert.prediction import BertClassificationPredictor

MODEL_PATH = "model_out"  # placeholder: point this at your trained-model directory
LABEL_PATH = "label"

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=LABEL_PATH,  # location for labels.csv file
    multi_label=True,
    model_type='bert',
    do_lower_case=True)

# Single prediction
# single_prediction = predictor.predict("where to get food")
# print(single_prediction)

# Classify up to ten inputs interactively; stop early on a very short input.
for i in range(10):
    text = input("Enter: ")
    if len(text) < 2:
        break
    single_prediction = predictor.predict(text)
    print(single_prediction)
    print()

# # Batch predictions
# texts = [
#     "this is the first text",
#     "this is the second text"
# ]
# multiple_predictions = predictor.predict_batch(texts)
# print(multiple_predictions)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd

from fast_bert.prediction import BertClassificationPredictor

MODEL_PATH = 'output/model_out/'

predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path='./',  # location for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=False)

# Multi prediction: classify every row of dev.csv and append the top label
# and its probability to predictions.tsv.
test_data = pd.read_csv('dev.csv')
with open('predictions.tsv', 'a') as fp:
    for x, item in enumerate(test_data.text):
        prediction = predictor.predict(item)
        # prediction is a list of (label, probability) tuples, best first.
        fp.write(str(test_data.id[x]) + '\t' + prediction[0][0] + '\t' +
                 str(prediction[0][1]) + '\n')
from pathlib import Path

from fast_bert.prediction import BertClassificationPredictor

DATA_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/data/')
LABEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/labels/')
MODEL_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/models/')
LOG_PATH = Path('/home/dpappas/fast_bert_models/doc_rerank/logs/')

# location for the pretrained BERT models
BERT_PRETRAINED_PATH = Path(
    '../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')

predictor = BertClassificationPredictor(model_path=MODEL_PATH,
                                        pretrained_path=BERT_PRETRAINED_PATH,
                                        label_path=LABEL_PATH,
                                        multi_label=False)

# Single prediction
single_prediction = predictor.predict("just get me result for this text")

# Batch predictions
texts = ["this is the first text", "this is the second text"]
multiple_predictions = predictor.predict_batch(texts)
import random

import numpy as np
import torch
import torch.nn.functional as F
from fairseq.models.bart import BARTModel
from fast_bert.prediction import BertClassificationPredictor
from tqdm import trange
from transformers import (AutoModelForCausalLM, AutoModelForTokenClassification,
                          AutoModelWithLMHead, AutoTokenizer)


class Interaction:

    def __init__(self, args):
        self.gen_model_type = args['gen_model_type']
        self.gen_model_path = args['gen_model_path'].replace('"', '')
        self.conv_line_path = args['conv_line_path'].replace('"', '')
        self.gen_length = args['length']
        self.temperature = args['temperature']
        self.top_k = args['top_k']
        self.top_p = args['top_p']
        self.stop_token = args['stop_token']
        self.repetition_penalty = args['repetition_penalty']
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # self.device = torch.device("cpu")
        self.gen_model_type = self.gen_model_type.lower()
        self.lookup = {
            '1': 'Fashion',
            '2': 'Politics',
            '3': 'Books',
            '4': 'Sports',
            '5': 'General Entertainment',
            '6': 'Music',
            '7': 'Science & Technology',
            '8': 'Movie',
            '9': 'General'
        }
        self.topic_cls = BertClassificationPredictor(
            model_path=args['topic_cls_path'].replace('"', ''),
            label_path=args['label_dir'].replace('"', ''),  # directory for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=True)
        self.entity_ext_model = AutoModelForTokenClassification.from_pretrained(
            "dbmdz/bert-large-cased-finetuned-conll03-english")
        # self.entity_ext_model.to(self.device)
        self.entity_ext_model.to('cpu')
        self.entity_ext_tokenizer = AutoTokenizer.from_pretrained(
            "bert-base-cased")
        if self.gen_model_type == 'dialogpt':
            self.gen_tokenizer = AutoTokenizer.from_pretrained(
                self.gen_model_path)
            self.gen_model = AutoModelWithLMHead.from_pretrained(
                self.gen_model_path)
            self.gen_model.cuda()
            self.gen_model.eval()
        elif self.gen_model_type == 'bart':
            self.gen_model = BARTModel.from_pretrained(
                self.gen_model_path,
                checkpoint_file='checkpoint_best.pt',
                data_name_or_path=self.gen_model_path)
            self.gen_model.cuda()
            self.gen_model.eval()
        self.conv_line = BARTModel.from_pretrained(
            self.conv_line_path,
            checkpoint_file='checkpoint_best.pt',
            data_name_or_path=self.conv_line_path)
        self.conv_line.cuda()
        self.conv_line.eval()
        self.baseline_tokenizer = AutoTokenizer.from_pretrained(
            args['baseline'])
        self.baseline_model = AutoModelForCausalLM.from_pretrained(
            args['baseline'])
        # self.baseline_model.to('cpu')
        self.baseline_model.cuda()
        self.baseline_model.eval()

    def baseline_decode(self, user_utt):
        print('baseline decode')
        print(user_utt)
        # new_user_input_ids = self.baseline_tokenizer.encode(
        #     user_utt + self.baseline_tokenizer.eos_token,
        #     return_tensors='pt', max_length=128).to('cpu')
        new_user_input_ids = self.baseline_tokenizer.encode(
            user_utt + self.baseline_tokenizer.eos_token,
            return_tensors='pt',
            max_length=128).cuda()
        if user_utt == "BEGIN":
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            utterance = utterance.replace('BEGIN', '').strip()
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[0]
        else:
            np.random.seed(4)
            torch.manual_seed(4)
            chat_history_ids = self.baseline_model.generate(
                new_user_input_ids,
                max_length=60,
                top_k=10,
                top_p=0.70,
                pad_token_id=self.baseline_tokenizer.eos_token_id)
            utterance = self.baseline_tokenizer.decode(
                chat_history_ids[0], skip_special_tokens=True)
            if ' <EOT> ' in utterance:
                print('utterance', utterance)
                utterance = utterance.split(' <EOT> ')[1].strip()
        return utterance

    def get_topic(self, utterance):
        '''Call the topic classifier and return the utterance's topic.'''
        print('topic input:', utterance)
        topic = self.lookup[self.topic_cls.predict(utterance)[0][0]]
        print('predict topic:', topic)
        return topic

    def get_entities(self, utterance):
        '''Call the entity extractor model and return the utterance's entities.'''
        label_list = [
            "O",       # Outside of a named entity
            "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
            "I-MISC",  # Miscellaneous entity
            "B-PER",   # Beginning of a person's name right after another person's name
            "I-PER",   # Person's name
            "B-ORG",   # Beginning of an organisation right after another organisation
            "I-ORG",   # Organisation
            "B-LOC",   # Beginning of a location right after another location
            "I-LOC"    # Location
        ]
        # Bit of a hack to get the tokens with the special tokens
        tokens = self.entity_ext_tokenizer.tokenize(
            self.entity_ext_tokenizer.decode(
                self.entity_ext_tokenizer.encode(utterance)))
        inputs = self.entity_ext_tokenizer.encode(
            utterance, return_tensors="pt").to('cpu')
        outputs = self.entity_ext_model(inputs)[0]
        predictions = torch.argmax(outputs, dim=2)
        entity = [(token, label_list[prediction])
                  for token, prediction in zip(tokens, predictions[0].tolist())]
        # Delete '##' before tokens (merge WordPiece sub-tokens back into words).
        r = []
        r_tags = []
        for i, tpl in enumerate(entity):
            if tpl[0].startswith("##"):
                if r:
                    r[-1] += tpl[0][2:]
            else:
                r.append(tpl[0])
                r_tags.append(tpl[1])
        new_entity_token = [(i, j) for i, j in zip(r, r_tags)]
        # Combine tokens into entities.
        flag = False
        entities = []
        ent_tags = []
        for i, tpl in enumerate(new_entity_token):
            if tpl[1] == "O":
                flag = False
                continue
            elif tpl[1] in ("I-MISC", "I-PER", "I-ORG", "I-LOC"):
                if flag == False:
                    flag = True
                    entities.append(tpl[0])
                    ent_tags.append(tpl[1])
                else:
                    entities[-1] += ' '
                    entities[-1] += tpl[0]
            elif tpl[1] in ("B-MISC", "B-PER", "B-ORG", "B-LOC"):
                entities.append(tpl[0])
                ent_tags.append(tpl[1])
        return entities

    def get_response_keywords(self, utterance, topic, entities, randomness=False):
        '''Call the conv_line model and return response keywords.'''
        entities_comb = ' # '.join(entities)
        input_conv = topic + ' <EOT> ' + utterance + ' <A0> ' + entities_comb + '<A1>'
        print('input to conv_line')
        print(input_conv)
        if randomness == False:
            np.random.seed(4)
            torch.manual_seed(4)
        elif randomness == True:
            np.random.seed(random.randint(0, 120))
            torch.manual_seed(random.randint(0, 120))
        maxb = 30  # Can be customized
        minb = 7   # Can be customized
        response = ''
        slines = [input_conv]
        with torch.no_grad():
            # hypotheses = self.conv_line.sample(slines, beam=4, lenpen=2.0,
            #                                    no_repeat_ngram_size=3)
            hypotheses = self.conv_line.sample(slines,
                                               sampling=True,
                                               sampling_topk=5,
                                               temperature=0.7,
                                               lenpen=2.0,
                                               max_len_b=maxb,
                                               min_len=minb,
                                               no_repeat_ngram_size=3)
            hypotheses = hypotheses[0]
        print('keywords hypotheses:', hypotheses)
        response = hypotheses.replace('\n', '')
        keywords = response.replace('<V>', '').replace('<s>', '').split('#')
        print('keywords keywords:', keywords)
        k = []
        for keyword in keywords:
            keyword = keyword.strip()
            k.append(keyword)
        print('keywords k:', k)
        keywords = k
        return keywords

    def top_k_top_p_filtering(self, logits, filter_value=-float('Inf')):
        """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        Args:
            logits: logits distribution, shape (batch size x vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p
                (nucleus filtering). Nucleus filtering is described in Holtzman et al.
                (http://arxiv.org/abs/1904.09751)
        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        top_k = min(self.top_k, logits.size(-1))  # Safety check
        if top_k > 0:
            # Remove all tokens with a probability less than the last token of the top-k
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value
        if self.top_p > 0.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
            # Remove tokens with cumulative probability above the threshold
            sorted_indices_to_remove = cumulative_probs > self.top_p
            # Shift the indices to the right to keep also the first token above the threshold
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0
            # Scatter sorted tensors back to original indexing
            indices_to_remove = sorted_indices_to_remove.scatter(
                dim=1, index=sorted_indices, src=sorted_indices_to_remove)
            logits[indices_to_remove] = filter_value
        return logits

    def sample_sequence(self, model, context):
        context = torch.tensor(context, dtype=torch.long, device=self.device)
        context = context.unsqueeze(0).repeat(1, 1)
        generated = context
        model.cuda()
        with torch.no_grad():
            for _ in trange(self.gen_length):
                inputs = {'input_ids': generated}
                # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL
                # (cached hidden-states)
                outputs = model(**inputs)
                next_token_logits = outputs[0][:, -1, :] / (
                    self.temperature if self.temperature > 0 else 1.)
                # Repetition penalty from CTRL (https://arxiv.org/abs/1909.05858);
                # the batch size is fixed at 1 here.
                for i in range(1):
                    for token_id in set(generated[i].tolist()):
                        next_token_logits[i, token_id] /= self.repetition_penalty
                filtered_logits = self.top_k_top_p_filtering(next_token_logits)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1),
                                               num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)
        return generated

    def get_response(self, user_utterance, user_utt_topic, res_keywords):
        '''Call the dial_gen model and return the generated utterance.'''
        res_keywords = ' # '.join(res_keywords)
        input_dial_gen = (user_utt_topic.strip() + ' <EOT> ' +
                          user_utterance.strip() + ' <V> ' + res_keywords.strip())
        print('input to dialog generation module')
        print(input_dial_gen)
        if self.gen_model_type == 'dialogpt':
            context_tokens = self.gen_tokenizer.encode(
                input_dial_gen, add_special_tokens=False)
            out = self.sample_sequence(model=self.gen_model,
                                       context=context_tokens)
            out = out[:, len(context_tokens):].tolist()
            response = self.gen_tokenizer.decode(
                out[0], clean_up_tokenization_spaces=True)
            response = response[:response.find('\n') if self.stop_token else None]
        elif self.gen_model_type == 'bart':
            np.random.seed(4)
            torch.manual_seed(4)
            maxb = 128  # Can be customized
            minb = 15   # Can be customized
            response = ''
            slines = [input_dial_gen]
            with torch.no_grad():
                hypotheses = self.gen_model.sample(slines,
                                                   sampling=True,
                                                   sampling_topk=self.top_k,
                                                   temperature=self.temperature,
                                                   lenpen=2.0,
                                                   max_len_b=maxb,
                                                   min_len=minb,
                                                   no_repeat_ngram_size=3)
                hypotheses = hypotheses[0]
                response = hypotheses.replace('\n', '')
        return response
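# A minimal usage sketch for Interaction. Every path and value below is a
# hypothetical placeholder; the args dict simply has to carry the keys that
# __init__ reads. The call order (topic -> entities -> keywords -> response)
# mirrors how the methods consume each other's outputs.
args = {
    'gen_model_type': 'bart',                 # or 'dialogpt'
    'gen_model_path': '/models/dialog_gen',   # hypothetical
    'conv_line_path': '/models/conv_line',    # hypothetical
    'length': 60,
    'temperature': 0.7,
    'top_k': 10,
    'top_p': 0.9,
    'stop_token': '\n',
    'repetition_penalty': 1.2,
    'topic_cls_path': '/models/topic_cls',    # hypothetical
    'label_dir': '/models/topic_cls/labels',  # hypothetical
    'baseline': 'microsoft/DialoGPT-medium',
}
bot = Interaction(args)
utt = "Did you watch the match last night?"
topic = bot.get_topic(utt)
entities = bot.get_entities(utt)
keywords = bot.get_response_keywords(utt, topic, entities)
print(bot.get_response(utt, topic, keywords))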