Example #1
class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def __init__(self):
        self.model = None

    def load_model(self):
        if not self.model:
            from simpletransformers.classification import ClassificationModel
            try:
                self.model = ClassificationModel('bert',
                                                 './outputs/',
                                                 use_cuda=False,
                                                 args=args)
            except Exception as ex:
                logger.error(
                    f"could not load model from /outputs due to {str(ex)}, creating new model"
                )
                self.create_new_model()

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        self.load_model()

        titles = self.__convert_to_input(tenders)
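        # predict() returns (predicted labels, raw outputs); tenders predicted as class 1 are kept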
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        self.load_model()

        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))
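        # simpletransformers expects a two-column DataFrame: text first, label second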

        self.model.train_model(data_input)

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
Example #2
def bert_predictions(tweet: pd.DataFrame, model: ClassificationModel):
    """
    Bert Inference for prediction.
    :param tweet: dataframe with tweets
    :param model: Bert Model
    :return: Counter of predicted labels
    """
    tweet = tweet.values.tolist()
    try:
        predictions, raw_outputs = model.predict(tweet)
    except Exception:
        # fall back to per-element prediction to locate the offending input
        for element in tweet:
            model.predict([element])
        print("STOPP")
        raise
    auswertung = collections.Counter(predictions)
    gc.collect()

    # df = pd.DataFrame(raw_outputs)
    # df['predictions'] = pd.DataFrame(predictions)
    # df['tweets'] = pd.DataFrame(tweet)
    # df = df.replace(r'\n', ' ', regex=True)
    # df_softmax = pd.DataFrame(softmax(raw_outputs, axis=1))
    # df['softmax0'] = df_softmax[0]
    # df['softmax1'] = df_softmax[1]
    # db_functions.df_to_sql(df, 'temp_table', 'replace')

    return auswertung
Example #3
def cross_pseudo_labeling(train, pseudo_test, test, params, n_folds,
                          model_name, model_type, lb_hack):
    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1234).split(train["text"],
                                                 train["label"]))
    splits_test = list(
        KFold(n_splits=n_folds, shuffle=True,
              random_state=1234).split(test["jobflag"]))

    y_pred = np.zeros((test.shape[0], n_folds))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], n_folds))
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0

    for fold, (train_idx, valid_idx) in enumerate(splits):
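        # augment each training fold with the pseudo-labelled test set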
        X_train = pd.concat([train.iloc[train_idx], pseudo_test])
        X_valid = train.iloc[valid_idx]
        model = ClassificationModel(model_type=model_type,
                                    model_name=model_name,
                                    num_labels=4,
                                    args=params,
                                    use_cuda=True,
                                    weight=weight.tolist())

        model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test["description"].values)
        # y_pred[:, fold] = hack(raw_outputs)
        y_pred += raw_outputs / n_folds  # accumulate the average of raw outputs across folds

        oof_pred, oof_outputs = model.predict(
            X_valid["text"].values)  # converted to an array because a mysterious bug occurs otherwise
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs
        # oof[valid_idx] = hack(oof_outputs)

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()

    y_pred = hack(y_pred, lb_hack)

    # oof = hack(oof_raw)

    # y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))

    return test_pred, f1_score, oof_pred
Example #4
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def __init__(self):
        self.model = None

    def load(self, name):
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        pass

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()

        print(f"training took {end - start:.1f} seconds")

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)
Example #5
def model(train, test, params, n_folds, model_name, model_type, lb_hack, prediction=False):
    kfold = StratifiedKFold(n_splits=n_folds)

    y_pred = np.zeros((test.shape[0], n_folds))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], n_folds))
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train["text"], train['label'])):
        args = params.copy()
        args["output_dir"] = params["output_dir"] + "_" + str(fold + 1)

        X_train = train.iloc[train_idx]
        X_valid = train.iloc[valid_idx]
        if prediction:
            model_name = args["output_dir"]

        model = ClassificationModel(model_type=model_type, model_name=model_name, num_labels=4,
                                    args=args, use_cuda=True, weight=weight.tolist())

        if not prediction:
            model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test['description'])
        # y_pred[:, fold] = hack(raw_outputs)
        y_pred += raw_outputs / n_folds

        oof_pred, oof_outputs = model.predict(X_valid["text"].values)  # a mysterious bug occurs otherwise
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs
        # oof[valid_idx] = hack(oof_outputs)

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()
    y_pred = hack(y_pred, lb_hack)

    # oof = hack(oof_raw)

    # y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

    test_pred = pd.DataFrame(np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))

    return test_pred, f1_score, oof_pred
Example #6
def predict_export(data):

    X = data[args.predict_partition]['text']
    predictions = {}

    for class_name in ['arousal', 'valence', 'topic']:

        if class_name in ['arousal', 'valence']:
            class_no = 3
        else:
            class_no = 10

        trained_model_path = os.path.join('experiments/best_model/',
                                          class_name + str(False))
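        # a separate fine-tuned model is loaded for each target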
        model = ClassificationModel(args.model_type,
                                    trained_model_path,
                                    num_labels=class_no)
        predictions['prediction_' + class_name], _ = model.predict(X)

    predictions['id'] = data[args.predict_partition]['id']
    predictions['segment_id'] = data[args.predict_partition]['segment_id']

    df = pd.DataFrame.from_dict(predictions)
    header_names = [
        'id', 'segment_id', 'prediction_arousal', 'prediction_valence',
        'prediction_topic'
    ]
    df[header_names].to_csv(output_path + args.predict_partition + '.csv',
                            header=header_names,
                            index=False)
Example #7
class EmpathyClassifier():
    def __init__(self,
                 use_cuda=torch.cuda.is_available(),
                 cuda_device=0,
                 batch_size=16):
        self.model_type = "empathy"
        train_args["eval_batch_size"] = batch_size

        model_path = os.path.join(os.path.dirname(__file__), "models/empathy/")
        model_file = os.path.join(os.path.dirname(__file__),
                                  "models/empathy.tar.gz")
        if not os.path.isdir(model_path):
            model = f'{self.model_type}_model'
            if not os.path.isfile(model_file):
                logger.info(
                    f'Model {self.model_type} does not exist at {model_path}. Attempting to download it.'
                )
                fetch_pretrained_model(model, model_file)
            unzip_simple_transformer_model(model, model_path, model_file)

        # Create a ClassificationModel
        self.model = ClassificationModel('roberta',
                                         model_path,
                                         num_labels=1,
                                         use_cuda=use_cuda,
                                         cuda_device=cuda_device,
                                         args=train_args)

    def predict(self, text):
        if isinstance(text, str):
            # wrap a single string so predict() always receives a list of texts
            text = [text]
        predictions, raw_outputs = self.model.predict(text)
        return raw_outputs
Example #8
def eval_stance_clf(model_path, src_path, gen_path, **kwargs):
    src = open(src_path, 'r').readlines()
    gen = open(gen_path, 'r').readlines()
    gen = [i.strip() for i in gen]
    src = [i.strip() for i in src]

    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 400,
        'max_seq_length': 300,
        "fp16": False
    }

    model = ClassificationModel('roberta',
                                model_path,
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    pairs = [[i, j] for i, j in zip(src, gen)]  # sentence-pair inputs: (source, generated)
    predictions, raw_outputs = model.predict(pairs)
    th = Counter(predictions)
    th = sorted(th.items(), key=lambda x: x[0])
    print(th)
Example #9
def generate_prob_matrix(arguments):
	my_args = {
		"max_seq_length": 256,
		"train_batch_size": 16,
		"eval_batch_size": 16,
		"do_lower_case": True,
		"manual_seed": 17
	}

	model = ClassificationModel('bert', "relation_processing/model/bert", use_cuda=False, args=my_args)
	num_arguments = len(arguments)
	prob_matrix = np.zeros((num_arguments, num_arguments))
	for rel_from in range(1, num_arguments):
		for rel_to in arguments[rel_from].compare_list:
			if rel_from == rel_to:
				continue
			logging.info("calculating: " + str(rel_from) + "-->" + str(rel_to))
			
			timer = datetime.now()
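			# predict() takes [text_a, text_b] pairs; here it scores the relation rel_to --> rel_from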
			predictions, raw_outputs = model.predict([[arguments[rel_to].sentence, arguments[rel_from].sentence]])
			rel = softmax(raw_outputs, axis=1)
			Stats.h_bert_time += datetime.now() - timer
			Stats.h_bert += 1
			
			logging.debug(rel)
			prob_matrix[rel_to][rel_from] = rel[0][1]
	return prob_matrix
Example #10
def main(path, valid_in_cat_path, valid_out_of_cat_path):
    steam_df = load_steam_data()
    i = 1
    print("starting training, using fold " + str(i))

    train, test = load_fold_data(path, i)
    # Train the model using roberta model
    args_dict = {'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(i),
                 'use_cached_eval_features': False,
                 'reprocess_input_data': True,
                 'train_batch_size': 8,
                 'num_train_epochs': 6,
                 'fp16': False,
                 'overwrite_output_dir': True}
    model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(i))
    result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
    print(f"predicting finished - saved to {save_path}")
Example #11
class XLNetDeBERTaClassification:
    def __init__(self, model, model_dir="D:/Language Models/"):
        model_pn = {
            'xlnet': f'{model_dir}{"XLNET-LARGE/"}',
            'deberta': f'{model_dir}{"DEBERTA-LARGE/"}'
        }[model]
        self.model = ClassificationModel(model, model_pn, use_cuda=False)
        self.tokenizer = XLNetTokenizer.from_pretrained(model_pn) if model == 'xlnet' \
            else DebertaTokenizer.from_pretrained(model_pn)
        self._text = None

    @property
    def text(self):
        return self._text

    @text.setter
    def text(self, raw_text):
        if not isinstance(raw_text, str):
            raise TypeError("Error: Invalid Input Type")
        self._text = raw_text

    def check_probs(self):
        _, logits = self.model.predict([self._text])
        probs = softmax(logits[0])
        probs_5dp = ["{:.5f}".format(float(i)) for i in probs]  # probabilities to 5 decimal places
        return probs_5dp, len(
            self.tokenizer.encode(self._text, add_special_tokens=False))
Example #12
def main():
    # load train & test data
    df_train = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    # set the random seed
    seed = 42

    # Train test split
    X_train, X_val, y_train, y_val = train_test_split(df_train['Sentence'],
                                                      df_train['Polarity'],
                                                      test_size=0.10,
                                                      random_state=seed)
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # Load a pre-trained model and train it with our data
    # (all models available: https://huggingface.co/transformers/pretrained_models.html)
    # Create model ... args = parameters
    args = {
        'reprocess_input_data': True,
        'max_seq_length': 300,
        'num_train_epochs': 1,
        'fp16': False,
        'train_batch_size': 4,
        'overwrite_output_dir': True
    }
    my_model = ClassificationModel('roberta',
                                   'distilroberta-base',
                                   num_labels=2,
                                   use_cuda=True,
                                   cuda_device=0,
                                   args=args)
    # Train the model
    my_model.train_model(train_dataset)

    # Evaluate the model
    result, model_outputs, wrong_predictions = my_model.eval_model(
        val_dataset, f1=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    print("F1 Score = {:.6f}\n".format(
        f1_score(y_val, pred_val, average='micro') * 100))

    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # get results on test set
    pred_test, _ = my_model.predict(df_test['Sentence'].tolist())

    # print f1 score
    print(f1_score(df_test.Polarity, pred_test))

    # print accuracy score
    print(accuracy_score(df_test.Polarity, pred_test))

    # save input/ground truth/prediction as one csv
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
Example #13
def predict_df(
    data_pkl,
    model_type,
    model_name,
):
    """
    Apply a fine-tuned regression model to generate predictions.
    The text is given in `data_pkl` and the predictions are generated per row and saved in a 'predictions' column.

    Parameters
    ----------
    data_pkl: str
        path to pickled df with the data, which must contain the column 'text'
    model_type: str
        type of the pre-trained model, e.g. bert, roberta, electra
    model_name: str
        path to a directory containing model file

    Returns
    -------
    None
    """

    # load data
    df = pd.read_pickle(data_pkl)

    # check CUDA
    cuda_available = torch.cuda.is_available()
    if not cuda_available:

        def custom_formatwarning(msg, *args, **kwargs):
            return str(msg) + '\n'

        warnings.formatwarning = custom_formatwarning
        warnings.warn('CUDA device not available; running on a CPU!')

    # load model
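    # the fine-tuned regression head has a single output, hence num_labels=1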
    model = ClassificationModel(
        model_type,
        model_name,
        num_labels=1,
        use_cuda=cuda_available,
    )

    # predict
    print("Generating predictions. This might take a while...")
    txt = df['text'].to_list()
    predictions, _ = model.predict(txt)

    col = f"pred_{Path(model_name).stem}"
    df[col] = predictions

    # pkl df
    df.to_pickle(data_pkl)
    print(
        f"A column with predictions was added.\nThe updated df is saved: {data_pkl}"
    )
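# A minimal usage sketch (hypothetical paths; assumes the pickle holds a df with a 'text' column):
# predict_df("data/notes.pkl", "roberta", "models/regression_model")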
Example #14
def main():
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    train_txt_path = "train.txt"
    test_txt_path = "test.txt"
    result_txt_path = "result.txt"

    relations = [
        "Cause-Effect", "Component-Whole", "Entity-Destination",
        "Product-Producer", "Entity-Origin", "Member-Collection",
        "Message-Topic", "Content-Container", "Instrument-Agency", "Other"
    ]

    file_train = open(train_txt_path)
    train_data = []
    for i in range(6400):
        temp = []
        temp.append(file_train.readline().split('"')[1].strip('"').strip('.'))
        temp.append(relations.index(file_train.readline().strip().split('(')[0]))
        train_data.append(temp)
    #print(train_data)

    train_df = pd.DataFrame(train_data)
    train_df.columns = ["text", "labels"]
    '''
    model_args = ClassificationArgs(num_train_epochs=1)
    model = ClassificationModel(
        'bert',
        'bert-base-cased',
        num_labels=10,
        args=model_args,
        use_cuda=False
    )

    # Train the model
    model.train_model(train_df, output_dir='./model')
    '''
    model = ClassificationModel("bert",
                                "model/checkpoint-800-epoch-1",
                                use_cuda=False)

    file_test = open(test_txt_path)
    test_sentences = []
    for i in range(1600):
        test_sentences.append(
            file_test.readline().split('"')[1].strip('"').strip('.'))

    test_result, raw_result = model.predict(test_sentences)
    print(test_result)
    file_result = open(result_txt_path, 'w+')
    for i in range(1600):
        file_result.write(relations[test_result[i]] + '\n')
    file_train.close()
    file_test.close()
    file_result.close()
Example #15
class Classifier:
    def __init__(self, model_type, model_name, use_cuda=True):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        # Create a ClassificationModel
        self.model_type = model_type
        self.model_name = model_name
        self.use_cuda = use_cuda
        self.dat = {}
        self.rerun = False

    def add(self, X, Y):
        self.dat[Y] = X

    def train(self, split=0.7, num_epochs=10):
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            train_data.extend([[i, self.le.transform([k])[0]]
                               for i in v[:len_train]])

            eval_data.extend([[i, self.le.transform([k])[0]]
                              for i in v[len_train:]])

        print(train_data, eval_data)
        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)
        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)
        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)

    def predict(self, x):
        predictions, raw_outputs = self.model.predict(x)
        return self.le.inverse_transform(predictions)
Example #16
def predict(text):
    model = ClassificationModel("distilbert",
                                './models/DistilBERT/',
                                use_cuda=False,
                                num_labels=11)

    prediction, raw_outputs = model.predict([text])
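    # rank the 11 class scores to recover the top-3 flair candidates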
    top_3 = raw_outputs.argsort()[0][-3:]
    confidence = softmax(raw_outputs[0])
    index = confidence.argsort()[-3:]
    return [flairs[prediction[0]], flairs[top_3[1]], flairs[top_3[0]]], [
        confidence[index[-1]], confidence[index[-2]], confidence[index[-3]]
    ]
Example #17
class PretrainedBert:
    def __init__(self):
        # CHANGE CUDA WHEN NEEDED
        # experiment w these args
        # change bert to other transformer
        # change pretrain to other pretrain
        self.model = ClassificationModel('roberta',
                                         'roberta-base',
                                         use_cuda=False,
                                         num_labels=5,
                                         args={
                                             'max_seq_length': 128,
                                             'save_steps': 10000,
                                             'fp16': False,
                                             'logging_steps': 1,
                                             'train_batch_size': 16,
                                             'num_train_epochs': 1
                                         })

    def run(self, train_df):
        self.model.train_model(train_df)

    def eval(self, eval_df):
        return self.model.predict(eval_df)

    def load(self, path):
        self.model = ClassificationModel('roberta',
                                         path,
                                         num_labels=5,
                                         use_cuda=False,
                                         args={
                                             'max_seq_length': 128,
                                             'save_steps': 10000,
                                             'fp16': False,
                                             'logging_steps': 1,
                                             'train_batch_size': 16,
                                             'num_train_epochs': 1
                                         })

    def test_challenge_set(self, labels_pred, labels_act):
        mae = 0.0
        acc = 0.0
        for i in range(len(labels_act)):
            mae += abs(labels_pred[i] - labels_act[i])
            if labels_pred[i] == labels_act[i]:
                acc += 1
        mae = mae / len(labels_act)
        acc = acc / len(labels_act)
        return mae, acc
Example #18
def use_model(model_type, model_path):
    model = ClassificationModel(model_type, model_path)

    comments = [
        "Good hotel, nothing great but is enough to have a decent experience and fun with the family",
        "Horrible experience, the staff was rude and the food terrible",
        "I truly loved it! Great staff and delicious food. 5/5",
        "nice boutique hotel stayed 5 nights, rooms nice clean place pretty, location good in central singapore",
        "Wonderful place perfectly located. Just opened a month ago. Very helpful staff, reasonable rates, breakfast included. The rooms are nice and completely clean.",
        "The pricing was fine, good location. Food could be better."
    ]

    predictions, raw_outputs = model.predict(comments)
    print('Prediction:', predictions)
    print('Raw Outputs:', raw_outputs)
Example #19
def binatron(request):
    try:
        inport = request.data
        inport = [inport["eligible_text"]]
        # inport = list(inport.values())
        model = ClassificationModel("bert", "outputs", use_cuda=False)
        predictions, raw_outputs = model.predict(inport)
        df = pd.DataFrame(predictions, columns=['Criteria'])
        df = df.replace({1: 'Eligible', 0: 'Others'})
        label = df.to_dict()['Criteria'].get(0)
        return JsonResponse('Text Belongs to Class {}'.format(label), safe=False)
    except ValueError as e:
        return Response(e.args[0], status.HTTP_400_BAD_REQUEST)
Example #20
def binary(train_csv, eval_csv):
    train_df = pd.read_csv(train_csv, skip_blank_lines=True)
    eval_df = pd.read_csv(eval_csv, skip_blank_lines=True)
    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                "roberta-large",
                                num_labels=2,
                                args=model_args)
    # Train the model
    model.train_model(train_df, eval_df=eval_df)
    # evaluate the model
    preds, outputs = model.predict(eval_df.text.tolist())
    targets = eval_df.labels.tolist()
    print("accuracy", accuracy_score(targets, preds))
    print("f1 score", f1_score(targets, preds))
Example #21
def train_model(model_type, model_name):
    print('Starting run:', model_type, model_name)

    train_df = pd.read_csv("data/reviews/train.csv", header=None)
    train_df.columns = ["text", "labels"]

    eval_df = pd.read_csv("data/reviews/test.csv", header=None)
    eval_df.columns = ["text", "labels"]

    # print(eval_df.iloc[0])

    t0 = time.time()
    train_args = {
        'output_dir': f'model-outputs/reviews/{model_name}-outputs',
        'max_seq_length': 256,
        'num_train_epochs': 5,
        'train_batch_size': 16,
        'eval_batch_size': 32,
        'learning_rate': 5e-5,
        'evaluate_during_training': True,
        'evaluate_during_training_steps': 50000,
        'save_model_every_epoch': False,
        'overwrite_output_dir': True,
        'no_cache': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'manual_seed': 4,
        'regression': True,
        'best_model_dir': f'outputs/{model_type}/best_model'
    }
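    # 'regression': True with num_labels=1 fine-tunes the model as a regressor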

    model = ClassificationModel(model_type,
                                model_name,
                                num_labels=1,
                                args=train_args)
    model.train_model(train_df, eval_df=eval_df)
    print('Run finished')
    t1 = time.time()
    total = t1 - t0

    print('Time:', total)
    print('--------------------')

    predictions, raw_outputs = model.predict([
        "Good hotel, nothing great but is enough to have a decent experience and fun with the family"
    ])
    print('Prediction:', predictions)
    print('Raw Outputs:', raw_outputs)
Example #22
def model_predict(text):
    '''
    Takes in an array of text and returns predicted probability of risk.

    Input:
        text (arr): E.g. data[['content']]
    Output: 
        pred (arr): returns label of 0 for low risk and 1 for high risk based on prob_risk
        prob_risk (arr): E.g. data['probability_risk'] = model_predict(data[['content']])
        pred_risk (arr): Risk score for each article
    '''

    #read text file to get model path
    model_txt = open("../automation/curr_model.txt", "r")
    model_path = model_txt.read()
    model_txt.close()

    # loading saved model, specifying same args as model init
    # model names: path to directory containing model files
    # model naming convention : roberta_YYYY_MM
    model_args = ClassificationArgs(num_train_epochs=2, learning_rate=5e-5)
    model = ClassificationModel(model_type='roberta', model_name=model_path,
                                args=model_args, use_cuda=False)

    # Preprocess text
    processed_text = text.apply(
        lambda x: text_processing(x,
                                  lower=False,
                                  remove_url=True,
                                  remove_punctuation=False,
                                  remove_stopwords=False,
                                  replace_entity=True,
                                  replace_hash=True,
                                  split_alphanumeric=False,
                                  lemmatize=False,
                                  stem=False))

    # predict on the preprocessed text (assumes `text` is a Series of strings)
    pred, raw_outputs = model.predict(processed_text.tolist())

    # convert to probability of risk
    prob = softmax(raw_outputs, axis=1)
    prob_risk = [x[1] for x in prob]
    pred_risk = [predicted_risk(x) for x in prob_risk]

    return pred, prob_risk, pred_risk
Example #23
class CamembertModel(BaseEstimator):
    def __init__(self):
        self.le = preprocessing.LabelEncoder()
        self.le.fit(labels)
        self.tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
        self.model = ClassificationModel('camembert', 'camembert-base', num_labels=8)

    def fit(self, X_train, y_train):
        df_train = pd.DataFrame({'text': [pattern.sub(' ', u[1]) for u in X_train], 'labels': self.le.transform(y_train)})
        res = self.model.train_model(df_train, tokenizer=self.tokenizer, output_dir="./model/camembert",
                                     show_running_loss=True, args=args)
        return res

    def predict_proba(self, X_test):
        test_texts = [pattern.sub(' ', u[1]) for u in X_test]
        _, raw_outputs = self.model.predict(test_texts)
        return [softmax(u) for u in raw_outputs]
Example #24
def predict(self, test_file, model_type, model_dir, use_cuda=False):
    # assumes the enclosing class provides self.train_args and self.read_df;
    # outputs go to the model directory (the original referenced an undefined `out_dir`)
    self.train_args["output_dir"] = model_dir
    test_df = self.read_df(test_file)
    model = ClassificationModel(
        model_type,
        f"{model_dir}/",
        num_labels=4,
        args=self.train_args,
        use_cuda=use_cuda,
    )
    predictions, raw_outputs = model.predict(test_df['text'].to_list())
    test_df['predictions'] = predictions
    test_predictions = f"{test_file[:-4]}_predictions_{model_type}.csv"
    write_csv(test_predictions, list(test_df.columns.values), [
        test_df[column].to_list()
        for column in list(test_df.columns.values)
    ])
Example #25
def all_train(train, test, params, model_name, model_type, lb_hack):
    weight = len(train) / train["label"].value_counts().sort_index().values

    model = ClassificationModel(model_type=model_type,
                                model_name=model_name,
                                num_labels=4,
                                args=params,
                                use_cuda=True,
                                weight=weight.tolist())
    model.train_model(train)

    pred, raw_outputs = model.predict(test["description"])

    y_pred = hack(raw_outputs, lb_hack)

    pseudo_idx = (pd.DataFrame(raw_outputs).max(axis=1) > 3)
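    # rows whose top raw logit exceeds 3 are treated as confident pseudo-labels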

    return y_pred, pseudo_idx
Example #26
def run_model(model_name, model_type, params):
    print('model_name:', model_name)
    print('model_type:', model_type)
    lang_dict = {'en': 'English', 'de': 'German',
                 'es': 'Spanish', 'fr': 'French'}
    pprint.pprint(params)
    for i in range(4):
        lang = languages[i]
        params['output_dir'] = 'models/' + model_type + '-' + lang + '/'
        # the model is saved under output_dir, so inference can be run separately from training
        print('train', lang_dict[lang])
        train_path = _path(lang, 'train')
        train_data = pd.read_csv(train_path)
        train_data['jobflag'] -= 1
        train, val = train_test_split(
            train_data, test_size=0.3, random_state=i)

        # the split above is a mistake, carried over while reusing code from elsewhere... awful...

        model = ClassificationModel(model_name, model_type, num_labels=4, weight=[
                                    1.17, 2.10, 0.532, 1.25], args=params, use_cuda=False)

        # the weights were computed with sklearn.utils.class_weight.compute_class_weight

        model.train_model(train)
        losses, model_outputs, _ = model.eval_model(
            val, f1=metric_f1)
        print('example :')
        pprint.pprint(np.argmax(model_outputs, axis=1)[:10] + 1)
        # sanity check, since predictions tended to collapse onto one class
        # ax = sns.countplot(np.argmax(model_outputs, axis=1) + 1)
        # fig = ax.get_figure()
        # fig.savefig(lang + '.png')
        # for checking the prediction distribution
        print('loss:')
        pprint.pprint(losses)
        for lang2 in languages:
            print('predict', lang_dict[lang2])
            test_path = _path(lang2, 'test')
            test = pd.read_csv(test_path)[:2]
            y_pred, _ = model.predict(test['description'])
            result = pd.DataFrame({'jobflag': np.asarray(y_pred) + 1})
            result.to_csv('result/' + model_name +
                          '_' + lang + '_' + lang2 + '.csv', index=False, header=False)
Example #27
def main(argv):
    wandb.login()

    tasks, (train_df, valid_df,
            test_df), transformers = load_molnet_dataset(FLAGS.molnet_dataset,
                                                         tasks_wanted=None)

    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)

    model = ClassificationModel(FLAGS.model_type,
                                FLAGS.model_name,
                                args={
                                    'evaluate_each_epoch': True,
                                    'evaluate_during_training_verbose': True,
                                    'no_save': True,
                                    'num_train_epochs': FLAGS.num_train_epochs,
                                    'auto_weights': True
                                })
    # You can set class weights by using the optional weight argument

    # Check that the train and eval dataframes are set up properly: there should be
    # exactly two columns, the SMILES string and its corresponding label.
    print("Train Dataset: {}".format(train_df.shape))
    print("Eval Dataset: {}".format(valid_df.shape))
    print("TEST Dataset: {}".format(test_df.shape))

    model.train_model(train_df,
                      eval_df=valid_df,
                      output_dir=FLAGS.output_dir,
                      args={'wandb_project': 'project-name'})

    # accuracy
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, acc=sklearn.metrics.accuracy_score)

    # average precision (PRC)
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df, ap=sklearn.metrics.average_precision_score)

    # Let's input a molecule with a toxicity value of 1
    predictions, raw_outputs = model.predict(['C1=C(C(=O)NC(=O)N1)F'])
    print(predictions)
    print(raw_outputs)
Example #28
def train(
    arch,
    model_name,
):
    model_args = ClassificationArgs(
        num_train_epochs=5,
        output_dir="./models",
        evaluate_during_training_steps=1000,
        train_batch_size=64,
        reprocess_input_data=True,
        evaluate_during_training=True,
        eval_batch_size=32,
        save_model_every_epoch=False,
        overwrite_output_dir=True,
        learning_rate=7e-5,
        save_eval_checkpoints=False,
        best_model_dir=f"./models/{model_name}/best_model",
        use_early_stopping=True,
        early_stopping_delta=1e-2,
        early_stopping_metric="mcc",
        tensorboard_dir='./runs/',
        early_stopping_metric_minimize=False,
        wandb_project='my_roberta',
        manual_seed=69,
        early_stopping_patience=5,
    )
    model = ClassificationModel(arch,
                                model_name,
                                args=model_args,
                                use_cuda=True)

    model.train_model(
        train_df,
        eval_df=test,
        accuracy=lambda x, y: accuracy_score(x, [round(a) for a in y]),
    )

    result, model_output, top_loss = model.eval_model(test)
    print(result)
    print(top_loss)

    pred, _ = model.predict(["thanks for bearing with us"])
    print(pred)
Example #29
def test_multiclass_classification(model_type, model_name):
    # Train and evaluation data need to be in a Pandas DataFrame containing at
    # least two columns. If the DataFrame has a header, it should contain a 'text'
    # and a 'labels' column. If no header is present, the first column must be
    # the text (type str) and the second the label (type int).
    train_data = [
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
        ["Example eval senntence belonging to class 2", 2],
    ]
    train_df = pd.DataFrame(train_data, columns=["text", "labels"])

    eval_data = [
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
        ["Example eval senntence belonging to class 2", 2],
    ]
    eval_df = pd.DataFrame(eval_data, columns=["text", "labels"])

    # Create a ClassificationModel
    model = ClassificationModel(
        model_type,
        model_name,
        num_labels=3,
        args={
            "no_save": True,
            "reprocess_input_data": True,
            "overwrite_output_dir": True,
            "max_seq_length": 20,
        },
        use_cuda=False,
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)

    predictions, raw_outputs = model.predict(["Some arbitrary sentence"])
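# Hypothetical invocation; any supported (model_type, model_name) pair should work, e.g.:
# test_multiclass_classification("bert", "bert-base-cased")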
Example #30
def predict_sarcasm(data_path, results, model_loc, model):
    # Bringing in the test data
    with open(data_path, 'r') as json_file:
        json_list = list(json_file)
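        # each element of json_list is one JSON record (JSON Lines format)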

    pred = []
    for json_str in json_list:
        pred.append(json.loads(json_str))

    pred_response = [
        remove_stopwords(convert_emojis(pred[i]['response']))
        for i in range(len(pred))
    ]
    pred_id = [pred[i]['id'] for i in range(len(pred))]

    model = ClassificationModel(model, model_loc, use_cuda=False)

    predictions, raw_outputs = model.predict(pred_response)

    pred_bert = pd.DataFrame({'id': pred_id, 'label': predictions})

    pred_bert['label'] = pred_bert['label'].replace([1, 0],
                                                    ['SARCASM', 'NOT_SARCASM'])
    pd.DataFrame(pred_bert).to_csv(results, header=False, sep=',', index=False)