def predict_proba(self, x):
    """Return predicted label probabilities for the texts in ``x``.

    The texts are written to ``test.csv`` under ``DATA_PATH`` and read back
    from that file before being handed to the predictor. The raw predictor
    output is also dumped to ``PRED_PATH / RESULTS_FILENAME``.

    Args:
        x: Texts to score; stored in a ``comment_text`` column.

    Returns:
        A ``pd.Series`` containing the probability table flattened row by
        row (one value per text/label pair).
    """
    params = self.experiment_parameters
    print('\tpredicting probabilities...')

    # Persist the inputs; the predictor consumes what is read back from disk.
    test_csv = params.DATA_PATH/'test.csv'
    pd.DataFrame({'comment_text': x}).to_csv(test_csv)

    # The model directory is specific to the experiment's random state.
    model_subdir = 'output-%d/model_out' % params.random_state
    predictor = BertClassificationPredictor(
        model_path=(params.MODEL_PATH/model_subdir).absolute().as_posix(),
        label_path=params.LABEL_PATH,
        multi_label=True,
        model_type=params.MODEL_TYPE,
        do_lower_case=True)

    texts = list(pd.read_csv(test_csv)['comment_text'].values)
    output = predictor.predict_batch(texts)

    # Keep a copy of the raw predictor output on disk.
    pd.DataFrame(output).to_csv(params.PRED_PATH/params.RESULTS_FILENAME)

    # One row per text, one column per label name.
    rows = []
    for pred in output:
        rows.append({item[0]: item[1] for item in pred})
    preds = pd.DataFrame(rows)
    print(preds.head(5))

    return pd.Series(preds.values.reshape(-1))
def run(model, csvs, threshold, evaluation):
    """Predict emotions for every row of a CSV and optionally evaluate them.

    Each row of ``csvs`` must provide ``id``, ``text`` and one 0/1 column per
    emotion label. Predictions are written next to the input file as
    ``model_output.csv`` with the columns ``id, text, emotions, target``.

    Every input row is predicted and written, so ``id``, ``text`` and
    ``target`` always come from the same row even when a text occurs more
    than once in the file.

    Args:
        model: Unused; see the review note below.
        csvs: Path of the input CSV file.
        threshold: An emotion is predicted when its probability is strictly
            greater than this value.
        evaluation: When truthy, print classification reports comparing the
            predictions with the targets from the CSV.
    """
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love",
        "optimism", "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    # NOTE(review): the ``model`` argument is never used; the model directory
    # comes from the module-level ``args``. Confirm which one is intended.
    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    data = pd.read_csv(csvs)

    # Parallel per-row lists keep ids, texts and targets aligned.
    ids = []
    texts = []
    targets = []
    for _, row in data.iterrows():
        ids.append(row['id'])
        texts.append(row['text'])
        targets.append([label for label in labels if row[label] == 1])

    # Skip the predictor for an empty file so it never receives an empty batch.
    multiple_predictions = predictor.predict_batch(texts) if texts else []

    out_path = os.path.join(os.path.dirname(csvs), "model_output.csv")
    outputs = []
    # The context manager guarantees the file is flushed and closed.
    with open(out_path, "w", encoding="utf-8", newline="") as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(["id", "text", "emotions", "target"])
        for row_id, text, target, scores in zip(ids, texts, targets,
                                                multiple_predictions):
            predicted = [name for name, prob in scores if prob > threshold]
            csv_writer.writerow([row_id, text, predicted, target])
            outputs.append(predicted)

    print("****************\n")
    print("Predictions saved in a file: ", out_path)

    if evaluation:
        print("\n\n Running Model Evaluation\n")
        # One binarizer with a fixed class list gives both matrices the same
        # columns, in the order of ``labels``.
        binarizer = MultiLabelBinarizer(classes=labels)
        y_true_encoded = binarizer.fit_transform(targets)
        y_pred_encoded = binarizer.transform(outputs)
        pprint(classification_report(y_true_encoded, y_pred_encoded))
        pprint(
            classification_report(y_true_encoded,
                                  y_pred_encoded,
                                  target_names=labels))
class TeacherNLPClassifier(torch.nn.Module):
    """Module wrapper around a ``BertClassificationPredictor``.

    ``model_dir`` must contain ``model_config.json`` (providing the keys
    ``model_type``, ``multi_label`` and ``do_lower_case``), the labels file
    and a ``model_out`` sub-directory with the model itself.
    """

    def __init__(self, model_dir, label_to_idx=label_to_idx):
        """Load the predictor described by ``model_dir/model_config.json``.

        Args:
            model_dir: Directory holding the config, labels and model.
            label_to_idx: Mapping from label name to output column index.
        """
        super().__init__()
        root = Path(model_dir)
        with open(root / 'model_config.json') as config_file:
            settings = json.load(config_file)

        self.model = BertClassificationPredictor(
            model_path=str(root / 'model_out'),
            label_path=str(root),  # location for labels.csv file
            model_type=settings['model_type'],
            multi_label=settings['multi_label'],
            do_lower_case=settings['do_lower_case'],
        )
        self.label_to_idx = label_to_idx

    def forward(self, texts: List[str], lengths=None):
        """Return a ``(len(texts), len(label_to_idx))`` tensor of probabilities.

        The predictor yields, for each text, a list of ``(label, probability)``
        pairs; each probability is stored in the column that
        ``label_to_idx`` assigns to its label. ``lengths`` is accepted but
        not used.
        """
        predictions = self.model.predict_batch(texts)
        scores = torch.zeros((len(predictions), len(self.label_to_idx)))
        for row, labelled_probs in enumerate(predictions):
            for label, prob in labelled_probs:
                scores[row, self.label_to_idx[label]] = prob
        return scores
def threshold(model, csvs):
    """Report exact-match accuracy of the predictor for a grid of thresholds.

    For every candidate threshold, an emotion is predicted when its
    probability is ``>=`` the threshold; a text counts as correct when the
    predicted set of emotions equals the target set from the CSV. Texts are
    de-duplicated (the last row for a given text supplies its targets).

    Args:
        model: Unused; see the review note below.
        csvs: Path of a CSV with a ``text`` column and one 0/1 column per
            emotion label.

    Returns:
        Dict mapping ``str(threshold)`` to the fraction of texts predicted
        exactly right. Empty when the CSV has no rows.
    """
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love",
        "optimism", "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    # NOTE(review): the ``model`` argument is never used; the model directory
    # comes from the module-level ``args``. Confirm which one is intended.
    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    # NOTE(review): the grid is not sorted and contains 0.25 between 0.02 and
    # 0.03 -- possibly meant to be 0.025. Left unchanged; confirm.
    thresholds = [
        0.0005, 0.00077, 0.00079, 0.00083, 0.00087, 0.0009, 0.00093, 0.00095,
        0.00099, 0.001, 0.0012, 0.0015, 0.00155, 0.0016, 0.00166, 0.0017,
        0.0019, 0.002, 0.0021, 0.0023, 0.0025, 0.0028, 0.003, 0.0035, 0.0032,
        0.0037, 0.004, 0.0045, 0.0047, 0.0041, 0.005, 0.0053, 0.0055, 0.0062,
        0.009, 0.007, 0.01, 0.011, 0.013, 0.014, 0.012, 0.015, 0.02, 0.25,
        0.03, 0.035, 0.039
    ]

    data = pd.read_csv(csvs)
    inputs = {}
    for _, row in data.iterrows():
        inputs[row['text']] = [label for label in labels if row[label] == 1]

    # Materialise texts and target sets once instead of rebuilding
    # ``list(inputs.values())`` for every text of every threshold.
    texts = list(inputs)
    expected = [set(target) for target in inputs.values()]

    threshold_accs = {}
    if texts:  # nothing to score (and nothing to divide by) for an empty CSV
        multiple_predictions = predictor.predict_batch(texts)
        for th in thresholds:
            correct = sum(
                {name for name, prob in scores if prob >= th} == wanted
                for scores, wanted in zip(multiple_predictions, expected))
            print("Threshold: ", th, "Correct: ", correct)
            threshold_accs[str(th)] = correct / len(texts)

    print(threshold_accs)
    return threshold_accs
class SentimentAnalyzer(object):
    """Map predictor output for tweets to Negative / Positive / Neutral.

    A tweet is "Negative" when label ``"0"`` reaches the confidence
    threshold, "Positive" when label ``"4"`` does, and "Neutral" otherwise.
    The single and batch entry points share the same decision rule.
    """

    NEGATIVE_LABEL = "0"
    POSITIVE_LABEL = "4"
    CONFIDENCE_THRESHOLD = 0.7

    def __init__(self, model_path, label_path):
        """Create the predictor and the text preprocessor.

        Args:
            model_path: Location of the trained model.
            label_path: Location for the labels.csv file.
        """
        self.predictor = BertClassificationPredictor(
            model_path=model_path,
            label_path=label_path,  # location for labels.csv file
            multi_label=False,
            model_type='bert',
            do_lower_case=False)
        self.preprocessor = TextPreprocessor()

    def _to_sentiment(self, prediction):
        """Turn one list of ``(label, confidence)`` pairs into a sentiment."""
        scores = dict(prediction)
        # A label absent from the prediction counts as zero confidence, so a
        # partial prediction yields a sentiment instead of a KeyError.
        if scores.get(self.NEGATIVE_LABEL, 0.0) >= self.CONFIDENCE_THRESHOLD:
            return "Negative"
        if scores.get(self.POSITIVE_LABEL, 0.0) >= self.CONFIDENCE_THRESHOLD:
            return "Positive"
        return "Neutral"

    def predict_sentiment(self, tweet):
        """Return the sentiment of a single tweet.

        The preprocessed tweet and the raw prediction are printed.
        """
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        return self._to_sentiment(prediction)

    def batch_predict_sentiment(self, tweets):
        """Return one sentiment per tweet, in input order.

        Returns an empty list for an empty input without calling the
        predictor. The raw predictions are printed otherwise.
        """
        processed_tweets = [
            self.preprocessor.process(tweet) for tweet in tweets
        ]
        if not processed_tweets:
            return []
        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        return [self._to_sentiment(prediction) for prediction in predictions]
def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    Parameters
    -----------
    text : <str or list(str) or numpy array of str>
        A single text, or several texts for batch prediction.
    model_path : <str>
        Must contain labels.csv AND all model files (config.json,
        pytorch_model.bin, special_tokens_map.json, tokenizer_config.json,
        vocab.txt).

    Returns
    ---------
    dict: label -> probability, if `text` is a str
    list of dict: one label -> probability dict per text, if `text` is a
        list or numpy array

    Raises
    ---------
    ValueError: if `text` is none of the supported types. Raised before the
        model is loaded.
    """
    is_single = isinstance(text, str)
    # Validate first so a bad argument fails before the model is loaded.
    if not is_single and not isinstance(text, (list, np.ndarray)):
        raise ValueError("Unexpected type for input argument `text`")

    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)

    if is_single:
        return dict(predictor.predict(text))

    # Hand the predictor a plain list so it never has to evaluate the
    # truthiness of a numpy array.
    texts = text.tolist() if isinstance(text, np.ndarray) else text
    return [dict(scores) for scores in predictor.predict_batch(texts)]
# Model and label directories for every supported case type.
_CASE_PATHS = {
    'divorce': ('/home/zf/lyy/Data/divorce/data/model/kedaV3/model_out',
                '/home/zf/lyy/Data/divorce/data/new_data'),
    'loan': ('/home/zf/lyy/Data/loan/data/model/keda/model_out',
             '/home/zf/lyy/Data/loan/new_data'),
    'labor': ('/home/zf/lyy/Data/labor/data/model/keda/model_out',
              '/home/zf/lyy/Data/labor/new_data'),
}


def prediction(text_list: list, case_type: str, threshold: float = 0.5):
    """Predict tags for each text and count how often every tag occurs.

    Args:
        text_list: Texts to classify.
        case_type: One of ``'divorce'``, ``'loan'`` or ``'labor'``.
        threshold: A label is kept when its probability is strictly greater
            than this value.

    Returns:
        ``(final_result_dic, detail_dic)`` where ``final_result_dic`` maps a
        tag name to the number of texts it was predicted for, and
        ``detail_dic`` maps each text to its list of predicted tag names.
        Labels whose name contains ``'21'`` are left out of both.

    Raises:
        ValueError: If ``case_type`` is not supported.
    """
    try:
        model_path, label_path = _CASE_PATHS[case_type]
    except KeyError:
        raise ValueError(
            'Unsupported case_type %r; expected one of %s'
            % (case_type, sorted(_CASE_PATHS))) from None

    # Nothing to predict: skip loading the model altogether.
    if not text_list:
        return {}, {}

    tag_dict = build_tags_dict(case_type)
    predictor = BertClassificationPredictor(model_path=model_path,
                                            label_path=label_path,
                                            multi_label=True,
                                            model_type='bert')
    output = predictor.predict_batch(text_list)

    label_counts = {}
    detail_dic = {}
    for text, scores in zip(text_list, output):
        tags = []
        for label, prob in scores:
            if float(prob) <= threshold:
                continue
            label_counts[label] = label_counts.get(label, 0) + 1
            # NOTE(review): substring test -- any label containing '21' is
            # dropped, not only the label '21' itself; confirm intent.
            if '21' not in label:
                tags.append(tag_dict[label])
        detail_dic[text] = tags

    final_result_dic = {
        tag_dict[label]: count
        for label, count in label_counts.items()
        if '21' not in label
    }
    return final_result_dic, detail_dic
def predict_bert(experiment_parameters):
    """Predict labels for ``test.csv`` and write a ground-truth/probability table.

    The texts in the ``comment_text`` column of ``DATA_PATH/test.csv`` are
    scored; the raw predictor output is dumped to
    ``PRED_PATH/RESULTS_FILENAME``. That file is then overwritten with the
    test data merged (by row index) with the predictions, where the first
    label of ``LABEL_COLS`` is reported as ``ground_truth`` (value from the
    test data) and ``pred_prob`` (value from the predictor).

    Args:
        experiment_parameters: Object providing ``MODEL_PATH``,
            ``LABEL_PATH``, ``MODEL_TYPE``, ``DATA_PATH``, ``PRED_PATH`` and
            ``RESULTS_FILENAME``.
    """
    params = experiment_parameters
    test_csv = params.DATA_PATH/'test.csv'
    results_csv = params.PRED_PATH/params.RESULTS_FILENAME

    # Build the predictor from the trained model directory.
    predictor = BertClassificationPredictor(
        model_path=(params.MODEL_PATH/'output/model_out').absolute().as_posix(),
        label_path=params.LABEL_PATH,
        multi_label=True,
        model_type=params.MODEL_TYPE,
        do_lower_case=True)

    # Score the test texts and keep the raw output on disk.
    texts = list(pd.read_csv(test_csv)['comment_text'].values)
    output = predictor.predict_batch(texts)
    pd.DataFrame(output).to_csv(results_csv)

    # One row per text, one column per label name.
    preds = pd.DataFrame([dict((item[0], item[1]) for item in pred)
                          for pred in output])
    print(preds.head())

    df_test = pd.read_csv(test_csv)
    print(df_test.head())

    # Align predictions with the test rows by index; columns present on both
    # sides receive the ``_x`` (test data) / ``_y`` (prediction) suffixes.
    df_pred = pd.merge(df_test, preds, how='left',
                       left_index=True, right_index=True)
    del df_pred['comment_text']

    first_label = LABEL_COLS[0]
    df_pred['ground_truth'] = df_pred.pop('%s_x' % first_label)
    df_pred['pred_prob'] = df_pred.pop('%s_y' % first_label)
    print(df_pred.head())

    # Replace the raw dump with the merged table.
    df_pred.to_csv(results_csv, index=None)
    return
args.multi_gpu = True else: args.multi_gpu = False label_cols = ["functionality", "range_anxiety", "availability", "cost", "ui", "location", "service_time", "dealership"] databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train_final.csv', val_file='valid_final.csv', test_data='test_final.csv', text_col="review", label_col=label_cols, batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type) databunch.train_dl.dataset[0][3] num_labels = len(databunch.labels) print(num_labels) metrics = [] metrics.append({'name': 'accuracy', 'function': accuracy}) metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh}) metrics.append({'name': 'roc_auc', 'function': roc_auc}) metrics.append({'name': 'fbeta', 'function': fbeta}) print(device) MODEL_PATH = '../models/output/model_out/' LABEL_PATH = '.' predictor = BertClassificationPredictor( model_path=MODEL_PATH, label_path=LABEL_PATH, # location for labels.csv file multi_label=True, model_type='xlnet', do_lower_case=False) predictions = predictor.predict_batch(list(pd.read_csv('test_final.csv')['review'].values))
parser.add_argument('--file_out', type=str, default='') args = parser.parse_args() return args if __name__ == '__main__': args = get_args() print(args.file_in) OUTPUT_DIR = './output/' MODEL_PATH = OUTPUT_DIR + args.model_path+'/model_out' LABEL_PATH = './' predictor = BertClassificationPredictor( model_path=MODEL_PATH, label_path=LABEL_PATH, # location for labels.csv file multi_label=False, model_type='bert', do_lower_case=False) df = pd.read_csv(LABEL_PATH + args.file_in, encoding='utf-8') texts = df.text.tolist() multiple_predictions = predictor.predict_batch(texts) # print(multiple_predictions) # print(type(multiple_predictions)) with open('./predictions/'+args.file_out, 'w') as filehandle: for listitem in multiple_predictions: filehandle.write('%s\n' % listitem)
# Build the predictor from the trained model and the labels directory.
predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=args.label_dir,  # directory for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=True)

# Read every CSV row up front; the context manager closes the file handle.
INPUT = os.path.join('data', args.input_name + '.csv')
with open(INPUT, 'rt', newline='') as input_file:
    texts = list(csv.reader(input_file))

batchsize = args.batch_size
multiple_predictions = []
# Row 0 is skipped (presumably the CSV header -- confirm). Slicing clamps at
# the end of the list, so the final, shorter batch needs no special case.
for i in tqdm(range(1, len(texts), batchsize)):
    batch_texts = [row[0] for row in texts[i:i + batchsize]]
    tmp_pred = predictor.predict_batch(batch_texts)
    multiple_predictions.extend(tmp_pred)

# Create a timestamped run directory below the output directory.
curr_time = datetime.datetime.now()
time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d_%H:%M:%S')
time_str = time_str + '-' + args.dataset
run_dir = os.path.join(args.output_dir, time_str)
# makedirs creates the output directory and the run directory in one call,
# without going through a shell.
os.makedirs(run_dir, exist_ok=True)

# NOTE(review): the copy still runs through the shell with an unquoted path;
# paths containing spaces or shell metacharacters will break it.
os.system('cp data/*.csv data/*_idx ' + run_dir)
# Select the label set that matches the training target.
if train_for == 'sentence':
    label_cols = sentence_labels
else:
    label_cols = fine_grained_labels

# Build the prediction model.
predictor = BertClassificationPredictor(
    model_path=args.output_dir / 'model_out',
    label_path=LABEL_PATH,
    multi_label=True,
    model_type=args.model_type,
    do_lower_case=True)

_test_csv = str(DATA_PATH.joinpath('test.csv').absolute())
_output_csv = str(DATA_PATH.joinpath('output_bert.csv').absolute())

# Load the test texts and run the prediction.
output = predictor.predict_batch(list(pd.read_csv(_test_csv)['text'].values))

# Write the raw prediction output to disk.
pd.DataFrame(output).to_csv(_output_csv)

# Read the written predictions back in.
results = pd.read_csv(_output_csv)

# Turn the predictions into a DataFrame: one row per text, one column per
# label name.
preds = pd.DataFrame([dict((item[0], item[1]) for item in pred)
                      for pred in output])
print(preds.head())

test_df = pd.read_csv(_test_csv)
print(test_df.head())
import copy
import csv
import json

import pandas as pd
from fast_bert.prediction import BertClassificationPredictor

# Multi-label BERT predictor for the "labor" data set.
predictor = BertClassificationPredictor(
    model_path='./Data/labor/data/model/keda/model_out',
    label_path='./Data/labor/new_data',
    multi_label=True,
    model_type='bert',
)

# Predict every entry of the ``text`` column of the test file and print the
# raw predictor output.
_test_frame = pd.read_csv("./Data/labor/new_data/test.csv")
text_list = list(_test_frame['text'].values)
output = predictor.predict_batch(text_list)
print(output)
def main(model_uri: Param("S3 uri with NLP model", str),
         data_uri: Param("S3 uri with input csv file", str),
         result_uri: Param(
             "S3 uri where to put output csv file with added "
             "inference columns", str),
         inference_columns: Param(
             "text columns separated in the csv file on "
             "which inference will be run", str)):
    """Run batch inference on CSV columns and upload the enriched CSV.

    Steps: download the model archive and the input CSV, extract the archive
    into ``/tmp/model``, load the predictor described by
    ``model_config.json``, run it on every non-null value of each requested
    column, add one ``<column>_<label>`` column per predicted label, then
    save the CSV in place and upload it to ``result_uri``.

    Args:
        model_uri: URI of the model archive (extracted with ``tar xzf``).
        data_uri: URI of the input CSV file.
        result_uri: URI the resulting CSV is uploaded to.
        inference_columns: Comma-separated names of the text columns to run
            inference on.

    Exits with status 2 when a download fails or a requested column is
    missing, and with status 1 when extraction or CSV loading fails.
    """
    # ``except Exception`` (not a bare ``except``) so that KeyboardInterrupt
    # and SystemExit are not reported as download failures.
    try:
        local_model = download_uri(model_uri)
    except Exception:
        print("Failed to download NLP model. Exiting...")
        sys.exit(2)

    try:
        local_csv = download_uri(data_uri)
    except Exception:
        print("Failed to download input csv file. Exiting...")
        sys.exit(2)

    model_dir = Path("/tmp/model")
    model_dir.mkdir(exist_ok=True)

    extraction = subprocess.run(
        ['tar', 'xzf', local_model, '-C', model_dir],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    # tar may write non-ASCII text (e.g. file names); never fail on decoding.
    tar_messages = extraction.stderr.decode('utf-8', errors='replace')
    # Judge success by the exit status: tar can print warnings on stderr
    # while still extracting everything.
    if extraction.returncode != 0:
        print(tar_messages)
        print("Model extaction error. Exiting...")
        sys.exit(1)
    if tar_messages:
        print(tar_messages)
    print("Model extacted sucessfully")

    with open(model_dir / 'model_config.json') as f:
        config = json.load(f)

    print("Loading model")
    predictor = BertClassificationPredictor(
        model_path=str(model_dir / 'model_out'),
        label_path=str(model_dir),  # location for labels.csv file
        model_type=config['model_type'],
        multi_label=config['multi_label'],
        do_lower_case=config['do_lower_case'],
    )

    try:
        print("Loading input csv")
        df = pd.read_csv(local_csv)
    except Exception:
        print("Failed to load input csv file. Exiting...")
        sys.exit(1)

    inference_columns = inference_columns.split(',')

    # Validate every requested column before any inference starts.
    for c in inference_columns:
        if c not in df.columns:
            print(f"{c} is not a column name in input csv file. Exiting...")
            sys.exit(2)

    for c in inference_columns:
        print(f"Starting inference for {c} column")
        start = time.time()

        # Only rows with a value in this column are scored.
        has_text = ~df[c].isna()
        text = df.loc[has_text, c].tolist()
        if not text:
            # Never hand the predictor an empty batch.
            print(f"No rows to run inference on for {c} column")
            continue

        predictions = predictor.predict_batch(text)
        result = pd.DataFrame([dict(scores) for scores in predictions])
        for r in result.columns:
            df.loc[has_text, f"{c}_{r}"] = result[r].tolist()

        print(f"Inference time for {len(text)} rows was {time.time() - start}")

    df.to_csv(local_csv, index=False)
    upload_uri(local_csv, result_uri)
    print("We are done with inference!")