Example #1
    def predict_proba(self, x):
        print('\tpredicting probabilities...')
        # create pandas dataframe
        df = pd.DataFrame({'comment_text': x})
        df.to_csv(self.experiment_parameters.DATA_PATH/'test.csv')

        # create predictor object
        output_dir = 'output-%d/model_out'%self.experiment_parameters.random_state
        predictor = BertClassificationPredictor(model_path=(self.experiment_parameters.MODEL_PATH/output_dir).absolute().as_posix(),
                                            label_path=self.experiment_parameters.LABEL_PATH,
                                            multi_label=True,
                                            model_type=self.experiment_parameters.MODEL_TYPE,
                                            do_lower_case=True)

        # predict test labels
        output = predictor.predict_batch(list(pd.read_csv(self.experiment_parameters.DATA_PATH/'test.csv')['comment_text'].values))

        # dump results
        pd.DataFrame(output).to_csv(self.experiment_parameters.PRED_PATH/self.experiment_parameters.RESULTS_FILENAME)

        # clean output
        preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
        print(preds.head(5))

        # flatten the per-label probability columns into a single Series
        y_pred_prob = pd.Series(preds.values.reshape(-1))
        return y_pred_prob
Example #2
def run(model, csvs, threshold, evaluation):
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    inputs = {}
    ids = []
    data = pd.read_csv(csvs)
    # print(data.head())
    for idx, row in data.iterrows():
        temp = []
        for label in labels:
            if row[label] == 1:
                temp.append(label)
        inputs[row['text']] = temp  # keyed by text, so duplicate texts would overwrite each other
        ids.append(row['id'])

    multiple_predictions = predictor.predict_batch(list(inputs.keys()))
    outputs = []
    out_file = open(os.path.join(os.path.dirname(csvs), "model_output.csv"),
                    "w",
                    encoding="utf-8",
                    newline="")
    csv_writer = csv.writer(out_file)
    csv_writer.writerow(["id", "text", "emotions", "target"])

    for i, out in enumerate(multiple_predictions):
        temp = []
        for emotion in out:
            if emotion[1] > threshold:  # greater than threshold
                temp.append(emotion[0])
        csv_writer.writerow(
            [ids[i],
             list(inputs.keys())[i], temp,
             list(inputs.values())[i]])
        outputs.append(temp)
    out_file.close()

    print("****************\n")
    print("Predictions saved in a file: ",
          os.path.join(os.path.dirname(csvs), "model_output.csv"))
    if evaluation:
        print("\n\n Running Model Evaluation\n")
        y_true = list(inputs.values())
        y_pred = outputs
        # fit a single binarizer over the full label set so that y_true and
        # y_pred are encoded with identical column ordering
        mlb = MultiLabelBinarizer(classes=labels)
        y_true_encoded = mlb.fit_transform(y_true)
        y_pred_encoded = mlb.transform(y_pred)
        pprint(classification_report(y_true_encoded, y_pred_encoded))
        pprint(
            classification_report(y_true_encoded,
                                  y_pred_encoded,
                                  target_names=labels))
Example #3
class TeacherNLPClassifier(torch.nn.Module):
    def __init__(self, model_dir, label_to_idx=label_to_idx):
        super().__init__()
        model_dir = Path(model_dir)
        model_config = model_dir / 'model_config.json'
        with open(model_config) as f:
            config = json.load(f)

        self.model = BertClassificationPredictor(
            model_path=str(model_dir / 'model_out'),
            label_path=str(model_dir),  # location for labels.csv file
            model_type=config['model_type'],
            multi_label=config['multi_label'],
            do_lower_case=config['do_lower_case'],
        )
        self.label_to_idx = label_to_idx

    def forward(self, texts: List[str], lengths=None):
        results = self.model.predict_batch(texts)
        # results is a List[List[Tuple]] of `label, probability`.
        # convert this to a onehot tensor
        final = torch.zeros((len(results), len(self.label_to_idx)))
        for i, result in enumerate(results):
            for (label, prob) in result:
                idx = self.label_to_idx[label]
                final[i, idx] = prob
        return final
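
A minimal usage sketch for the wrapper above, assuming a fast-bert model directory containing model_out/, model_config.json, and labels.csv, and a label_to_idx mapping that matches labels.csv (all names below are illustrative):

label_to_idx = {'negative': 0, 'neutral': 1, 'positive': 2}
teacher = TeacherNLPClassifier('./teacher_model', label_to_idx=label_to_idx)
probs = teacher(['great product', 'never again'])
print(probs.shape)  # torch.Size([2, 3]): one probability per label per text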
Example #4
def threshold(model, csvs):
    labels = [
        "anger", "anticipation", "disgust", "fear", "joy", "love", "optimism",
        "pessimism", "sadness", "surprise", "trust", "neutral"
    ]

    predictor = BertClassificationPredictor(
        model_path=args.model_dir,
        label_path="D:\\UTD\\Assignment\\NLP\\project\\",  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)
    thresholds = [
        0.0005, 0.00077, 0.00079, 0.00083, 0.00087, 0.0009, 0.00093, 0.00095,
        0.00099, 0.001, 0.0012, 0.0015, 0.00155, 0.0016, 0.00166, 0.0017,
        0.0019, 0.002, 0.0021, 0.0023, 0.0025, 0.0028, 0.003, 0.0035, 0.0032,
        0.0037, 0.004, 0.0045, 0.0047, 0.0041, 0.005, 0.0053, 0.0055, 0.0062,
        0.009, 0.007, 0.01, 0.011, 0.013, 0.014, 0.012, 0.015, 0.02, 0.25,
        0.03, 0.035, 0.039
    ]
    # targets = []
    inputs = {}
    data = pd.read_csv(csvs)
    # print(data.head())
    for idx, row in data.iterrows():
        temp = []
        for label in labels:
            if row[label] == 1:
                temp.append(label)
        inputs[row['text']] = temp

    multiple_predictions = predictor.predict_batch(list(inputs.keys()))
    threshold_accs = {}

    for th in thresholds:
        correct = 0
        # print(list(inputs.values())[0])
        outputs = []
        for out in multiple_predictions:
            temp = []
            for emotion in out:
                if emotion[1] >= th:  # at or above threshold
                    temp.append(emotion[0])
            outputs.append(temp)
        # print(outputs[0])
        for i in range(len(inputs)):
            if (set(outputs[i]) == set(list(inputs.values())[i])):
                correct += 1
        print("Threshold: ", th, "Correct: ", correct)
        threshold_accs[str(th)] = correct / len(inputs)
    print(threshold_accs)
    return threshold_accs
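
threshold() returns the per-threshold exact-match accuracies, so picking the best cutoff is a one-liner. A hedged sketch ('dev.csv' is a placeholder path; model is unused in the body above):

accs = threshold(model=None, csvs='dev.csv')
best = max(accs, key=accs.get)
print('best threshold:', best, 'exact-match accuracy:', accs[best])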
Example #5
class SentimentAnalyzer(object):
    def __init__(self, model_path, label_path):
        self.predictor = BertClassificationPredictor(
                        model_path=model_path,
                        label_path=label_path, # location for labels.csv file
                        multi_label=False,
                        model_type='bert',
                        do_lower_case=False)
        self.preprocessor = TextPreprocessor()

    def predict_sentiment(self, tweet):
        tweet = self.preprocessor.process(tweet)
        print(tweet)
        prediction = self.predictor.predict(tweet)
        print(prediction)
        for label, confidence in prediction:
            if label == "0" and confidence >= 0.7:
                return "Negative"

            if label == "4" and confidence >= 0.7:
                return "Positive"

        return "Neutral"

    def batch_predict_sentiment(self, tweets):
        processed_tweets = []

        for tweet in tweets:
            processed_tweets.append(self.preprocessor.process(tweet))

        predictions = self.predictor.predict_batch(processed_tweets)
        print(predictions)
        results = []

        for prediction in predictions:
            label_to_prob = dict(prediction)

            if label_to_prob["0"] >= 0.7:
                results.append("Negative")
            elif label_to_prob["4"] >= 0.7:
                results.append("Positive")
            else:
                results.append("Neutral")

        return results
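
A hedged usage sketch, assuming a Sentiment140-style model whose labels.csv contains the classes "0" (negative) and "4" (positive), plus the TextPreprocessor used above:

analyzer = SentimentAnalyzer(model_path='./model_out', label_path='./')  # placeholder paths
print(analyzer.predict_sentiment('I love this phone'))             # e.g. "Positive"
print(analyzer.batch_predict_sentiment(['meh', 'best day ever']))  # e.g. ['Neutral', 'Positive']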
Example #6
def classify_bert(text, model_path):
    """Classify genre using fast-bert.

    Fast-bert automatically uses GPU if `torch.cuda.is_available() == True`

    Parameters
    -----------
    text : <str or list(str)> for a single prediction or a batch of predictions
    model_path : <str> must contain labels.csv (I've put one in the uploaded version)
            AND all model files (config.json, pytorch_model.bin, special_tokens_map.json, tokenizer_config.json, vocab.txt)

    Returns
    ---------
    dict : label -> probability, if type(text) == str
    list : a list of such dicts, if type(text) == list or numpy array

    """
    # fast-bert moves the model to GPU automatically when one is available
    predictor = BertClassificationPredictor(
        model_path=model_path,
        label_path=model_path,  # location for labels.csv file
        multi_label=True,
        model_type='bert',
        do_lower_case=False)

    if isinstance(text, str):
        # single prediction: list of (label, probability) tuples -> dict
        pred = dict(predictor.predict(text))
    elif isinstance(text, (list, np.ndarray)):
        # batch prediction: one (label, probability) list per input text
        pred = [dict(p) for p in predictor.predict_batch(list(text))]
    else:
        raise ValueError("Unexpected type for input argument `text`")
    return pred
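
A hedged usage sketch; model_path must contain both the model files and labels.csv, as the docstring above requires:

single = classify_bert('a gritty neo-noir thriller', './model_out')  # dict: label -> probability
batch = classify_bert(['text one', 'text two'], './model_out')       # list of such dicts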
Example #7
def prediction(text_list: list, case_type: str):
    if case_type == 'divorce':
        model_path = '/home/zf/lyy/Data/divorce/data/model/kedaV3/model_out'
        label_path = '/home/zf/lyy/Data/divorce/data/new_data'
    elif case_type == 'loan':
        model_path = '/home/zf/lyy/Data/loan/data/model/keda/model_out'
        label_path = '/home/zf/lyy/Data/loan/new_data'
    elif case_type == 'labor':
        model_path = '/home/zf/lyy/Data/labor/data/model/keda/model_out'
        label_path = '/home/zf/lyy/Data/labor/new_data'
    else:
        raise ValueError(f'Unknown case type: {case_type}')
    tag_dict = build_tags_dict(case_type)
    predictor = BertClassificationPredictor(model_path=model_path,
                                            label_path=label_path,
                                            multi_label=True,
                                            model_type='bert')
    output = predictor.predict_batch(text_list)

    detail_dic = {}
    result_dic = {}
    for i, pred in enumerate(output):
        lab_list = []
        for label, prob in pred:
            if float(prob) > 0.5:
                result_dic[label] = result_dic.get(label, 0) + 1
                if '21' not in label:
                    lab_list.append(tag_dict[label])
        detail_dic[text_list[i]] = lab_list
    final_result_dic = {}
    for label, count in result_dic.items():
        if '21' not in label:  # labels containing '21' are excluded from the final tally
            final_result_dic[tag_dict[label]] = count
    return final_result_dic, detail_dic
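
A hedged invocation sketch; the model paths above are environment-specific, and the strings below stand in for case fact sentences:

totals, details = prediction(['fact sentence 1', 'fact sentence 2'], 'labor')
print(totals)   # {tag_name: count} over labels predicted above 0.5
print(details)  # {sentence: [tag_name, ...]} per input sentence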
Example #8
def predict_bert(experiment_parameters):
    # create predictor object
    predictor = BertClassificationPredictor(model_path=(experiment_parameters.MODEL_PATH/'output/model_out').absolute().as_posix(),
                                        label_path=experiment_parameters.LABEL_PATH,
                                        multi_label=True,
                                        model_type=experiment_parameters.MODEL_TYPE,
                                        do_lower_case=True)

    # predict test labels
    output = predictor.predict_batch(list(pd.read_csv(experiment_parameters.DATA_PATH/'test.csv')['comment_text'].values))

    # dump results
    pd.DataFrame(output).to_csv(experiment_parameters.PRED_PATH/experiment_parameters.RESULTS_FILENAME)

    # clean output
    preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
    print(preds.head())

    # load test data
    df_test = pd.read_csv(experiment_parameters.DATA_PATH/'test.csv')
    print(df_test.head())

    # merge dataframes
    df_pred = pd.merge(df_test, preds, how='left', left_index=True, right_index=True)
    del df_pred['comment_text']

    # keep only the first label column: ground truth comes from test.csv
    # (merge suffix _x), predicted probability from the model output (_y)
    df_pred['ground_truth'] = df_pred['%s_x' % LABEL_COLS[0]]
    df_pred['pred_prob'] = df_pred['%s_y' % LABEL_COLS[0]]
    del df_pred['%s_x' % LABEL_COLS[0]]
    del df_pred['%s_y' % LABEL_COLS[0]]
    print(df_pred.head())

    # write results to file
    df_pred.to_csv(experiment_parameters.PRED_PATH/experiment_parameters.RESULTS_FILENAME, index=None)
    return
Example #9
if torch.cuda.device_count() > 1:  # condition reconstructed; the snippet began mid-statement
    args.multi_gpu = True
else:
    args.multi_gpu = False

label_cols = ["functionality", "range_anxiety", "availability", "cost", "ui", "location", "service_time", "dealership"]

databunch = BertDataBunch(args['data_dir'], LABEL_PATH, args.model_name, train_file='train_final.csv', val_file='valid_final.csv',
                          test_data='test_final.csv',
                          text_col="review", label_col=label_cols,
                          batch_size_per_gpu=args['train_batch_size'], max_seq_length=args['max_seq_length'], 
                          multi_gpu=args.multi_gpu, multi_label=True, model_type=args.model_type)

databunch.train_dl.dataset[0][3]  # peek at the label tensor of the first training example
num_labels = len(databunch.labels)
print(num_labels)
metrics = []
metrics.append({'name': 'accuracy', 'function': accuracy})
metrics.append({'name': 'accuracy_thresh', 'function': accuracy_thresh})
metrics.append({'name': 'roc_auc', 'function': roc_auc})
metrics.append({'name': 'fbeta', 'function': fbeta})
print(device)
MODEL_PATH = '../models/output/model_out/'
LABEL_PATH = '.'
predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=LABEL_PATH,  # location for labels.csv file
    multi_label=True,
    model_type='xlnet',
    do_lower_case=False)
predictions = predictor.predict_batch(list(pd.read_csv('test_final.csv')['review'].values))
Example #10
    parser.add_argument('--file_out', type=str, default='')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_args()
    print(args.file_in)


    OUTPUT_DIR = './output/'
    MODEL_PATH = OUTPUT_DIR + args.model_path+'/model_out'
    LABEL_PATH = './'

    predictor = BertClassificationPredictor(
        model_path=MODEL_PATH,
        label_path=LABEL_PATH,  # location for labels.csv file
        multi_label=False,
        model_type='bert',
        do_lower_case=False)

    df = pd.read_csv(LABEL_PATH + args.file_in, encoding='utf-8')
    texts = df.text.tolist()
    multiple_predictions = predictor.predict_batch(texts)
    # print(multiple_predictions)
    # print(type(multiple_predictions))

    with open('./predictions/'+args.file_out, 'w') as filehandle:
        for listitem in multiple_predictions:
            filehandle.write('%s\n' % listitem)
Example #11
predictor = BertClassificationPredictor(
    model_path=MODEL_PATH,
    label_path=args.label_dir,  #sys.argv[2], # directory for labels.csv file
    multi_label=False,
    model_type='bert',
    do_lower_case=True)
INPUT = os.path.join('data', args.input_name + '.csv')
with open(INPUT, 'rt') as f:  # sys.argv[3]
    texts = list(csv.reader(f))
batchsize = args.batch_size
multiple_predictions = []
for i in tqdm(range(1, len(texts), batchsize)):  # start at 1 to skip the CSV header row
    # texts[i:i + batchsize] handles the final short batch automatically
    batch_texts = [row[0] for row in texts[i:i + batchsize]]
    tmp_pred = predictor.predict_batch(batch_texts)
    multiple_predictions.extend(tmp_pred)
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)
curr_time = datetime.datetime.now()
time_str = datetime.datetime.strftime(curr_time, '%Y-%m-%d_%H:%M:%S')
time_str = time_str + '-' + args.dataset
os.system('mkdir ' + os.path.join(args.output_dir, time_str))
os.system('cp data/*.csv data/*_idx ' +
          os.path.join(args.output_dir, time_str))
Example #12
# choose which label set to use
label_cols = (sentence_labels
              if train_for == 'sentence' else fine_grained_labels)

# build the predictor
predictor = BertClassificationPredictor(model_path=args.output_dir / 'model_out',
                                        label_path=LABEL_PATH,
                                        multi_label=True,
                                        model_type=args.model_type,
                                        do_lower_case=True)

# predict on the test data
output = predictor.predict_batch(
    list(
        pd.read_csv(str(
            DATA_PATH.joinpath('test.csv').absolute()))['text'].values))

# write the predictions out
pd.DataFrame(output).to_csv(
    str(DATA_PATH.joinpath('output_bert.csv').absolute()))

# read the predictions back in
results = pd.read_csv(str(DATA_PATH.joinpath('output_bert.csv').absolute()))

# assemble the predictions into a DataFrame
preds = pd.DataFrame([{item[0]: item[1] for item in pred} for pred in output])
print(preds.head())

test_df = pd.read_csv(str(DATA_PATH.joinpath('test.csv').absolute()))
print(test_df.head())
Example #13
from fast_bert.prediction import BertClassificationPredictor
import pandas as pd

predictor = BertClassificationPredictor(
    model_path='./Data/labor/data/model/keda/model_out',
    label_path='./Data/labor/new_data',
    multi_label=True,
    model_type='bert')

text_list = list(pd.read_csv("./Data/labor/new_data/test.csv")['text'].values)
output = predictor.predict_batch(text_list)

print(output)
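
predict_batch returns one (label, probability) list per input text. A hedged follow-up sketch turning that into a probability table (one column per label; the output path is an assumption):

preds = pd.DataFrame([dict(pred) for pred in output])
preds.insert(0, 'text', text_list)
preds.to_csv('./Data/labor/new_data/predictions.csv', index=False)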
Example #14
def main(model_uri: Param("S3 uri with NLP model", str),
         data_uri: Param("S3 uri with input csv file", str),
         result_uri: Param("S3 uri where to put the output csv file with added inference columns", str),
         inference_columns: Param("comma-separated text columns in the csv file on which inference will be run", str)):
    try:
        local_model = download_uri(model_uri)
    except Exception:
        print("Failed to download NLP model. Exiting...")
        sys.exit(2)

    try:
        local_csv = download_uri(data_uri)
    except Exception:
        print("Failed to download input csv file. Exiting...")
        sys.exit(2)

    model_dir = Path("/tmp/model")
    model_dir.mkdir(exist_ok=True)

    out = subprocess.Popen(['tar', 'xzf', local_model, '-C', model_dir],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)

    stdout, stderr = out.communicate()
    if not stderr:
        print("Model extracted successfully")
    else:
        print(stderr.decode('ascii'))
        print("Model extraction error. Exiting...")
        sys.exit(1)

    model_config = model_dir / 'model_config.json'
    with open(model_config) as f:
        config = json.load(f)

    print("Loading model")

    predictor = BertClassificationPredictor(
        model_path=str(model_dir / 'model_out'),
        label_path=str(model_dir),  # location for labels.csv file
        model_type=config['model_type'],
        multi_label=config['multi_label'],
        do_lower_case=config['do_lower_case'],
    )
    try:
        print("Loading input csv")
        df = pd.read_csv(local_csv)
    except Exception:
        print("Failed to load input csv file. Exiting...")
        sys.exit(1)

    inference_columns = inference_columns.split(',')
    for c in inference_columns:
        if c not in df.columns:
            print(f"{c} is not a column name in input csv file. Exiting...")
            sys.exit(2)

    for c in inference_columns:

        print(f"Starting inference for {c} column")

        start = time.time()

        text = df.loc[~df[c].isna(), c].tolist()

        out = predictor.predict_batch(text)
        result = pd.DataFrame(list(map(dict, out)))
        for r in result.columns:
            df.loc[~df[c].isna(), f"{c}_{r}"] = result[r].tolist()

        print(f"Inference time for {len(text)} rows was {time.time() - start}")

    df.to_csv(local_csv, index=False)

    upload_uri(local_csv, result_uri)

    print("We are done with inference!")