Example #1
def main(path, valid_in_cat_path, valid_out_of_cat_path):
    steam_df = load_steam_data()
    i = 1
    print("starting training, using fold " + str(i))

    train, test = load_fold_data(path, i)
    # Train the model using roberta model
    args_dict = {'output_dir': '../../models/roberta-base-bs8-e6-fold' + str(i),
                 'use_cached_eval_features': False,
                 'reprocess_input_data': True,
                 'train_batch_size': 8,
                 'num_train_epochs': 6,
                 'fp16': False,
                 'overwrite_output_dir': True}
    model = ClassificationModel('roberta', 'roberta-base', num_labels=2, args=args_dict)
    model.train_model(train)
    print("done training model fold " + str(i))
    result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score, f1=f1_score)
    acc = result['acc']
    f1 = result['f1']
    print(f"acc: {acc} , f1: {f1}")

    # Make predictions with the model
    save_path = '../../reports/steam-prediction.csv'
    print("predicting...")
    predictions, raw_outputs = model.predict(steam_df["sentence"].tolist())
    print(f"predicting finished - saved to {save_path}" )
    steam_df['prediction'] = predictions
    steam_df.to_csv(save_path, index=False)
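The train/test frames returned by load_fold_data (not shown here) must follow the simpletransformers input convention: a two-column DataFrame with the text first and an integer label second, or explicit 'text'/'labels' columns. A minimal sketch of that shape, with made-up rows:

import pandas as pd

# Two-column format accepted by train_model()/eval_model():
# first column text (str), second column label (int).
train = pd.DataFrame(
    [["a review from the positive class", 1],
     ["a review from the negative class", 0]],
    columns=["text", "labels"],
)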
Example #2
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'max_seq_length': 512,
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)

    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    # print('Evaluation results =', result)

    return result, model_outputs, wrong_predictions
Example #3
    def train(self, train_data: object, eval_data: object) -> object:
        """
        Create and train the chosen model based on the args

        Parameters
        ----------
        train_data : object
            train split of the train_data.
        eval_data : object
            validation split of the train_data.

        Returns
        -------
        object
            model.

        """

        # Create a ClassificationModel
        model = ClassificationModel(
            self.model_name,
            self.model_type,
            args=self.model_args,
            use_cuda=self.cuda,
            num_labels=len(self.labels) - 1,
        )
        # Train the model
        model.train_model(train_df=train_data,
                          eval_df=eval_data,
                          accuracy=accuracy_score)
        return model
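The extra accuracy=accuracy_score keyword passed to train_model only takes effect when evaluation runs during training; each such keyword is called as metric(true_labels, predictions) and reported alongside the loss. A sketch of the model_args this assumes (hypothetical values):

# Evaluation must be enabled for eval_df and the accuracy metric
# to be used during training.
model_args = {
    'evaluate_during_training': True,
    'evaluate_during_training_steps': 500,
    'overwrite_output_dir': True,
}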
Example #4
def train(train_df, max_sub_len, output_dir):
    model_type = 'distilbert'
    lr = 2e-5
    sent_length = max_sub_len
    OUTPUT_DIR = output_dir \
                 + str(datetime.datetime.now())[:19] + '_' + model_type + '_' + str(sent_length) + '_' + str(lr)
    print("model is saved at: {}".format(OUTPUT_DIR))
    training_config = {
        'output_dir': OUTPUT_DIR,
        'reprocess_input_data': True,
        'overwrite_output_dir': True,
        'num_train_epochs': 2,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'learning_rate': lr,
        'max_seq_length': sent_length
    }
    logging.basicConfig(level=logging.INFO)
    transformers_logger = logging.getLogger("transformers")
    transformers_logger.setLevel(logging.WARNING)
    model = ClassificationModel(model_type,
                                'distilbert-base-cased',
                                num_labels=4,
                                args=training_config)
    torch.cuda.empty_cache()
    model.train_model(train_df)
    return model
Example #5
def train_model(train_df, num_labels):
    model_type, model_name = MODELNAME.split(";")
    model_output = 'models/{}-{}-{}'.format(TAG, model_type, model_name.replace("/", "-"))
    if OVERWRITE is False and os.path.exists(model_output):
        logging.info("Skipping training of {}".format(model_name))
        sys.exit(0)
    logging.info("Starting training of {}".format(model_name))
    run = wandb.init(project=model_output.split("/")[-1], reinit=True)

    model = ClassificationModel(
        model_type, model_name, num_labels=num_labels, args={
            'output_dir': model_output,
            'overwrite_output_dir': OVERWRITE,
            'best_model_dir': '{}/best'.format(model_output),
            'evaluate_during_training': False,
            'manual_seed': 42,
            'num_train_epochs': 4,
            # 'learning_rate': 2e-5,  # For BERT, 5e-5, 3e-5, 2e-5
            # For BERT 16, 32. It could be 128, but with gradient_acc_steps set to 2 is equivalent
            'train_batch_size': 8 if "large" in model_name else 32,
            'eval_batch_size': 8 if "large" in model_name else 32,
            # Doubles train_batch_size, but gradients and weights are calculated once every 2 steps
            'gradient_accumulation_steps': 2 if "large" in model_name else 1,
            'max_seq_length': 256,
            'sliding_window': False,
            'wandb_project': model_output.split("/")[-1],
            # "adam_epsilon": 3e-5,  # 1e-8
            "silent": False,
            "fp16": False,  # By default it uses 32 bit floating point
            "n_gpu": 1,
        })
    # train the model
    model.train_model(train_df)
    return model, run
Example #6
class TransformerModel:
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def load_model(self):
        if not self.model:
            from simpletransformers.classification import ClassificationModel
            try:
                self.model = ClassificationModel('bert',
                                                 './outputs/',
                                                 use_cuda=False,
                                                 args=args)
            except Exception as ex:
                logger.error(
                    f"could not load model from /outputs due to {str(ex)}, creating new model"
                )
                self.create_new_model()

    def __init__(self):
        self.model = None

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        self.load_model()

        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        self.load_model()

        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        self.model.train_model(data_input)

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=False,
                                         args=args)
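Loading from './outputs/' works because train_model writes the fine-tuned weights to outputs/ by default, so a previously trained model can be reloaded by pointing the model name at that directory. A minimal sketch:

from simpletransformers.classification import ClassificationModel

# Reload a fine-tuned model from its output directory
# (assumes an earlier training run wrote to ./outputs/).
model = ClassificationModel('bert', './outputs/', use_cuda=False)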
Example #7
def transformer(train_df, eval_df, datafile):

    #tokenizer = BertTokenizer.from_pretrained("bert-base-dutch-cased")
    model = ClassificationModel(
        "bert", "bert-base-dutch-cased", use_cuda=False, num_labels=2
    )  # You can set class weights by using the optional weight argument

    # Train the model
    model.train_model(train_df)

    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
    print(model_outputs)

    predlist = []
    model1_outputs = model_outputs.tolist()
    for output in model1_outputs:
        if output[0] > output[1]:
            prediction = 0
        else:
            prediction = 1
        predlist.append(prediction)

    labels = eval_df["labels"].tolist()
    print(labels)
    print(predlist)

    print(classification_report(labels, predlist))
    print(confusion_matrix(labels, predlist))
    print(accuracy_score(labels, predlist))
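The manual loop over model_outputs can be collapsed with np.argmax, since each row holds one raw score per class. A sketch under the same assumptions (binary labels, one row per example):

import numpy as np

# Equivalent to the loop above: pick the class with the highest raw score.
predlist = np.argmax(model_outputs, axis=1).tolist()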
Example #8
def run_trainers(bucket_dir, train_args=None):

    os.makedirs('irl_models', exist_ok=True)

    if not os.path.isfile('completed_irl.txt'):
        open('completed_irl.txt', 'a').close()
    with open("completed_irl.txt", 'r') as f:
        done = [d.replace('\n', '') for d in f.readlines()]
    for train_file in os.listdir(bucket_dir):
        print(train_file[5:])
        print(done)
        if train_file[5:] not in done:
            train_df = pd.read_csv(bucket_dir + '/' + train_file +
                                   '/data_all.tsv',
                                   sep='\t')
            train_args['output_dir'] = f'irl_models/{train_file[5:]}/'
            train_args['cache_dir'] = f'cache_{train_file[5:]}/'

            train_args.update({'wandb_kwargs': {'name': train_file[5:]}})

            model = ClassificationModel('roberta',
                                        'roberta-base',
                                        args=train_args)
            print(train_df.head())
            model.train_model(train_df)

            with open("completed_irl.txt", 'a') as f:
                f.write(f"{train_file[5:]}\n")
            exit()

    with open("done.runs", 'w') as f:
        f.write(f"Done at {datetime.datetime.now()}")
Example #9
def objective(args):
    pbar.update(1)
    try:
        # cast np values to python and convert list to dict
        args = list(map(int, args[:3])) + list(map(float, args[3:]))
        args = dict(
            zip([
                'train_batch_size', 'gradient_accumulation_steps',
                'weight_decay', 'learning_rate', 'learning_rate',
                'adam_epsilon', 'warmup_ratio', 'max_grad_norm'
            ], args))
        args['overwrite_output_dir'] = True
        args['eval_batch_size'] = args['train_batch_size']
        model = ClassificationModel('albert', 'albert-base-v1', num_labels=5)

        # train model, find reverse f1, force garbage collection
        model.train_model(train, args=args)
        result, *_ = model.eval_model(test,
                                      f1=f1_multiclass,
                                      acc=accuracy_score)
        del model
        return 1. - result['f1']
    except Exception:
        print('skip')
        return 1.
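f1_multiclass is not defined in this snippet; a common definition (an assumption here, including the averaging mode) wraps sklearn's f1_score so it works with multi-class labels:

from sklearn.metrics import f1_score

def f1_multiclass(labels, preds):
    # hypothetical helper matching the eval_model call above
    return f1_score(labels, preds, average='micro')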
Example #10
def main():
    # load train & test data
    df_train = pd.read_csv("sentiment_train.csv")
    df_test = pd.read_csv("sentiment_test.csv")

    #set random seed
    random = 42

    # Train test split
    X_train, X_val, y_train, y_val = train_test_split(df_train['Sentence'],
                                                      df_train['Polarity'],
                                                      test_size=0.10,
                                                      random_state=random)
    train_dataset = pd.concat([X_train, y_train], axis=1)
    val_dataset = pd.concat([X_val, y_val], axis=1)

    # Load a pre-trained model, and train it with our data | See all models available: https://huggingface.co/transformers/pretrained_models.html
    # Create model ... args = parameters
    args = {
        'reprocess_input_data': True,
        'max_seq_length': 300,
        'num_train_epochs': 1,
        'fp16': False,
        'train_batch_size': 4,
        'overwrite_output_dir': True
    }
    my_model = ClassificationModel('roberta',
                                   'distilroberta-base',
                                   num_labels=2,
                                   use_cuda=True,
                                   cuda_device=0,
                                   args=args)
    # Train the model
    my_model.train_model(train_dataset)

    # Evaluate the model
    result, model_outputs, wrong_predictions = my_model.eval_model(
        val_dataset, f1=f1_score)
    pred_val = np.argmax(model_outputs, axis=1).tolist()

    print("Results on evaluation:")
    print("----------------------")
    print("F1 Score = {:.6f}\n".format(
        f1_score(y_val, pred_val, average='micro') * 100))

    print(classification_report(y_val, pred_val))
    print(confusion_matrix(y_val, pred_val))

    # get results on test set
    pred_test, _ = my_model.predict(df_test['Sentence'].tolist())

    # print f1 score
    print(f1_score(df_test.Polarity, pred_test))

    # print accuracy score
    print(accuracy_score(df_test.Polarity, pred_test))

    # save input/ground truth/prediction as one csv
    df_test['prediction'] = pred_test
    df_test.to_csv('q3_ans.csv', index=False)
Example #11
def fake_classify(train_set, eval_set, test_set, seed):

    # Create a TransformerModel

    model = ClassificationModel('bert',
                                'bert-base-multilingual-uncased',
                                args={
                                    'num_train_epochs': 3,
                                    'overwrite_output_dir': True,
                                    'manual_seed': seed
                                },
                                use_cuda=True)
    print(model.args)
    # Train the model
    model.train_model(train_set)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_set,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score)
    # print('Evaluation results =', result)

    # No need to save the model manually; it is written to the output dir
    # during training.

    return result, model_outputs, wrong_predictions
Example #12
def main():
    f_path = 'Breast Cancer(Raw_data_2_Classes).csv'
    data = loadDataAsDataFrame(f_path)
    X = data
    y = data['Class'].tolist()
    training_set_size = int(0.8 * len(X))
    training_rows, test_rows, training_classes, test_classes = train_test_split(
        X, y, train_size=training_set_size, random_state=42069)
    model_args = {'overwrite_output_dir': True}
    # Create a TransformerModel
    model = ClassificationModel('roberta',
                                'roberta-base',
                                use_cuda=False,
                                args=model_args)
    #model = ClassificationModel('roberta', 'roberta-base', use_cuda=True, args=model_args)

    #change our data into a format that simpletransformers can process
    training_rows['text'] = training_rows['Text']
    training_rows['labels'] = training_rows['Class']
    test_rows['text'] = test_rows['Text']
    test_rows['labels'] = test_rows['Class']

    # Train the model
    model.train_model(training_rows)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_rows)

    print("f1 score")
    precision = result['tp'] / (result['tp'] + result['fp'])
    recall = result['tp'] / (result['tp'] + result['fn'])
    f1score = 2 * precision * recall / (precision + recall)
    print(f1score)
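For binary tasks eval_model returns the tp/tn/fp/fn counts used above; the same F1 can also be obtained by passing the metric through directly. A sketch:

from sklearn.metrics import f1_score

# Alternative to the manual precision/recall arithmetic above.
result, model_outputs, wrong_predictions = model.eval_model(test_rows, f1=f1_score)
print(result['f1'])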
Example #13
def train(human_file, gen_file, our_gen_file, output_dir):
    data = []
    with open(human_file) as f:
        data += [(line.strip(), 1) for line in f]
    with open(gen_file) as f:
        data += [(line.strip(), 0) for line in f]
    with open(our_gen_file) as f:
        data += [(line.strip(), 0) for line in f]

    all_df = pd.DataFrame(data)

    train_args = {
        'overwrite_output_dir': True,
        'num_train_epochs': 10,
        'process_count': 10,
        'train_batch_size': 10,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        'reprocess_input_data': True,
        'learning_rate': 1e-5,
        'evaluate_during_training': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'early_stopping_metric': 'eval_loss',
        'early_stopping_metric_minimize': True,
        'no_cache': True,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta', "roberta-base", args=train_args) # You can set class weights by using the optional weight argument

    # Train the model

    model.train_model(all_df)
    print("finish the training")
Example #14
def train_model(args, output_dir, cache_dir):
    """
    Train a SimpleTransformers model based on the given arguments, save and return it.
    :param args: Arguments as processed by parse_args() containing architecture and epochs.
    :param output_dir: Path to the directory in which the model should be stored.
    :param cache_dir: Path to the directory in which the cache should be stored.
    :return: SimpleTransformers model trained based on the given arguments.
    """
    print('=> Training model...')

    # Set model arguments
    model_args = {
        'num_train_epochs': args.num_epochs,
        'train_batch_size': 32,
        'eval_batch_size': 32,
        'output_dir': output_dir,
        'cache_dir': cache_dir
    }

    # Train the model
    pretrained = get_transformer_model(args.arch)
    model = ClassificationModel(args.arch,
                                pretrained,
                                use_cuda=True,
                                args=model_args)
    train = load_corpus('train')
    model.train_model(train)

    return model
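get_transformer_model is not shown; it presumably maps the architecture key to a pretrained checkpoint name. A hypothetical stand-in, purely for illustration:

def get_transformer_model(arch):
    # hypothetical mapping; the real helper may differ
    return {
        'bert': 'bert-base-uncased',
        'roberta': 'roberta-base',
        'distilbert': 'distilbert-base-uncased',
    }[arch]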
Example #15
            def train():
                wandb.init(WAND_PROJECT_NAME)
                modelArgs = { "max_seq_length": self.maxSeqLength, "output_dir": self.modelOutputDir, "overwrite_output_dir": True, "best_model_dir": self.bestModelOutputDir,
                              "wandb_project": WAND_PROJECT_NAME, "num_training_epochs": wandb.config.epochs, "learning_rate": wandb.config.learning_rate,
                              "do_lower_case": True, "cache_dir": self.modelCacheDir, "encoding": "utf-8", "train_batch_size": 5, "eval_batch_size": 5,
                              "evaluate_during_training_steps": 50, "evaluate_during_training_verbose": True, "logging_steps": 5, "sliding_window": True,
                              "reprocess_input_data": True, "evaluate_during_training": True, "use_multiprocessing": True,
                              "labels_list": SECTOR_LABELS }

                model = ClassificationModel(self.modelType,
                                            self.modelNameOrPath,
                                            args=modelArgs,
                                            sweep_config=wandb.config,
                                            use_cuda=torch.cuda.is_available(),
                                            num_labels=len(SECTOR_LABELS))

                # Training and evaluation
                try:
                    log.info(f"Started training/finetuning BERT on multi-class classification task..")
                    model.train_model(train_df=self.trainDataset, eval_df=self.evalDataset, show_running_loss=True,
                                      output_dir=self.modelOutputDir,
                                      mcc=sklearn.metrics.matthews_corrcoef,
                                      acc=sklearn.metrics.balanced_accuracy_score, )
                    log.info(f"Finished finetuning and evaluating our fine-tuned model on multi-class classification task. Check the folder '{self.modelOutputDir}' for finetuned weights.")
                    log.info(f"It took {round((time.time() - startTime) / 3600, 1)} hours to finetune and evaluate our fine-tuned model on multi-class classification task.")
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    err = f"Error occurred while training and evaluating the finetuned model on multi-class classification task. Error is: {exc_type}; {exc_value}."
                    log.error(err)

                wandb.join()
Example #16
def test_binary_classification(model_type, model_name):
    # Train and Evaluation data needs to be in a Pandas Dataframe of two columns.
    # The first column is the text with type str, and the second column is the
    # label with type int.
    train_data = [
        ["Example sentence belonging to class 1", 1],
        ["Example sentence belonging to class 0", 0],
    ]
    train_df = pd.DataFrame(train_data)

    eval_data = [
        ["Example eval sentence belonging to class 1", 1],
        ["Example eval sentence belonging to class 0", 0],
    ]
    eval_df = pd.DataFrame(eval_data)

    # Create a ClassificationModel
    model = ClassificationModel(
        model_type,
        model_name,
        use_cuda=False,
        args={
            "reprocess_input_data": True,
            "overwrite_output_dir": True
        },
    )

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(eval_df)
Example #17
def train_stance_clf(data_dir, output_dir, **kwargs):
    headlines, bodies, labels = fnc(
        os.path.join(data_dir, 'combined_stances_train.csv'),
        os.path.join(data_dir, 'combined_bodies_train.csv'))

    list_of_tuples = list(zip(headlines, bodies, labels))
    df = pd.DataFrame(list_of_tuples, columns=['text_a', 'text_b', 'label'])
    train_df, val_df = train_test_split(df, random_state=123)
    train_args = {
        'learning_rate': 3e-5,
        'num_train_epochs': 5,
        'reprocess_input_data': True,
        'overwrite_output_dir': False,
        'process_count': 10,
        'train_batch_size': 4,
        'eval_batch_size': 20,
        'max_seq_length': 300,
        "fp16": False,
        'output_dir': output_dir
    }

    model = ClassificationModel('roberta',
                                "roberta-base",
                                num_labels=4,
                                use_cuda=True,
                                cuda_device=0,
                                args=train_args)

    model.train_model(train_df)
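The text_a/text_b/label column names are what enable sentence-pair classification here: simpletransformers builds paired inputs whenever a DataFrame has those three columns. Prediction then takes a list of [text_a, text_b] pairs; a sketch with made-up inputs:

# Sentence-pair prediction: one [headline, body] pair per example.
predictions, raw_outputs = model.predict(
    [["a headline", "the body text it should be checked against"]]
)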
Example #18
def cross_pseudo_labeling(train, pseudo_test, test, params, n_folds,
                          model_name, model_type, lb_hack):
    splits = list(
        StratifiedKFold(n_splits=n_folds, shuffle=True,
                        random_state=1234).split(train["text"],
                                                 train["label"]))
    splits_test = list(
        KFold(n_splits=n_folds, shuffle=True,
              random_state=1234).split(test["jobflag"]))

    y_pred = np.zeros((test.shape[0], n_folds))
    oof = np.zeros(train.shape[0])
    oof_raw = np.zeros((train.shape[0], n_folds))
    weight = len(train) / train["label"].value_counts().sort_index().values

    f1_score = 0

    for fold, (train_idx, valid_idx) in enumerate(splits):
        X_train = pd.concat([train.iloc[train_idx], pseudo_test])
        X_valid = train.iloc[valid_idx]
        model = ClassificationModel(model_type=model_type,
                                    model_name=model_name,
                                    num_labels=4,
                                    args=params,
                                    use_cuda=True,
                                    weight=weight.tolist())

        model.train_model(X_train)

        result, model_outputs, wrong_predictions = model.eval_model(
            X_valid, f1=metric_f1)
        print(result)
        f1_score += result["f1"] / n_folds

        fold_pred, raw_outputs = model.predict(test["description"].values)
        # y_pred[:, fold] = hack(raw_outputs)
        y_pred += raw_outputs / n_folds  # accumulate the average of raw outputs over folds

        oof_pred, oof_outputs = model.predict(
            X_valid["text"].values)  # converting because a mysterious bug occurs otherwise
        oof[valid_idx] = oof_pred
        oof_raw[valid_idx, :] = oof_outputs
        # oof[valid_idx] = hack(oof_outputs)

    print(f"mean f1_score: {f1_score}")

    raw_pred = y_pred.copy()

    y_pred = hack(y_pred, lb_hack)

    # oof = hack(oof_raw)

    # y_pred = stats.mode(y_pred, axis=1)[0].flatten().astype(int)

    test_pred = pd.DataFrame(
        np.concatenate([y_pred.reshape(-1, 1), raw_pred], 1))
    oof_pred = pd.DataFrame(np.concatenate([oof.reshape(-1, 1), oof_raw], 1))

    return test_pred, f1_score, oof_pred
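The weight list is inverse class frequency: len(train) divided by each class count, so rarer classes get larger loss weights. A toy check of the same computation:

import pandas as pd

train = pd.DataFrame({"label": [0] * 700 + [1] * 300})
weight = len(train) / train["label"].value_counts().sort_index().values
print(weight)  # [~1.43, ~3.33]: the rarer class 1 weighs more in the loss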
Example #19
def main(source=source,
         data_dir='data',
         checkpoint_dir="outputs/eval2/roberta_finetune_nogptneo",
         best_model_dir='outputs/eval2/best_model_roberta_finetune_nogptneo',
         n_train=240000,
         n_valid=4000,
         n_test=4000,
         n_epochs=10,
         learning_rate=4e-05,
         train_batch_size=64,
         eval_batch_size=64,
         evaluate_during_training=True,
         evaluate_during_training_steps=2000,
         reprocess_input=True,
         overwrite_output_dir=True,
         n_gpu=2):

    # import pdb; pdb.set_trace()
    train_df = data_loading.load_split(data_dir, source, 'train', n=n_train)
    valid_df = data_loading.load_split(data_dir,
                                       source_test,
                                       'valid',
                                       n=n_valid)
    test_df = data_loading.load_split(data_dir, source_test, 'test', n=n_test)

    # Optional model configuration
    model_args = ClassificationArgs(
        num_train_epochs=n_epochs,
        evaluate_during_training=evaluate_during_training,
        evaluate_during_training_steps=evaluate_during_training_steps,
        best_model_dir=best_model_dir,
        manual_seed=0,
        train_batch_size=train_batch_size,
        eval_batch_size=eval_batch_size,
        overwrite_output_dir=overwrite_output_dir,
        n_gpu=n_gpu,
        output_dir=checkpoint_dir,
        reprocess_input_data=reprocess_input,
        learning_rate=learning_rate)

    # Create a ClassificationModel
    model = ClassificationModel("roberta",
                                model_name="roberta-large",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df,
                      eval_df=valid_df,
                      f1=sklearn.metrics.f1_score,
                      acc=sklearn.metrics.accuracy_score,
                      eer=eer)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(
        test_df,
        f1=sklearn.metrics.f1_score,
        acc=sklearn.metrics.accuracy_score,
        eer=eer)
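Unlike the dict-based examples above, this one configures the model through the ClassificationArgs dataclass, which newer simpletransformers versions accept interchangeably with a plain dict. A minimal sketch of the equivalence:

from simpletransformers.classification import ClassificationArgs, ClassificationModel

# Dataclass and dict forms of args are interchangeable here.
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True)
model = ClassificationModel("roberta", "roberta-base", args=model_args, use_cuda=False)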
Example #20
class TransformerModel(TenderClassClassifier):
    """
    This class provides the Machine Learning model and classifies tenders based on previous training data.
    """
    def __init__(self):
        self.model = None

    def load(self, name):
        self.model = ClassificationModel('bert',
                                         './outputs/',
                                         use_cuda=cuda_available,
                                         args=args)

    def save(self, name):
        pass

    def __convert_to_input(self, tenders):
        titles = list(map(lambda x: x.get_title("DE"), tenders))
        return titles

    def classify(self, tenders):
        titles = self.__convert_to_input(tenders)
        predictions, raw_output = self.model.predict(titles)
        tuples = zip(tenders, predictions)

        selected_tenders = [t for t, p in tuples if p == 1]
        return selected_tenders

    def train(self, labelled_tenders):
        tenders = [i for i, j in labelled_tenders]
        tenders = self.__convert_to_input(tenders)
        labels = [j for i, j in labelled_tenders]

        tenders_train, tenders_test, labels_train, labels_test = train_test_split(
            tenders, labels, test_size=0.1, random_state=42)

        data_input = pd.DataFrame(zip(tenders_train, labels_train))

        start = time.time()
        self.model.train_model(data_input)
        end = time.time()

        print(end - start)

        labels_pred, raw_output = self.model.predict(tenders_test)
        tn, fp, fn, tp = confusion_matrix(labels_test, labels_pred).ravel()
        logger.info(f"tn: {tn} fp: {fp}")
        logger.info(f"fn: {fn} tp:{tp}")

        logger.info(
            f"Accuracy Score: {accuracy_score(labels_test, labels_pred)}")

    def create_new_model(self):
        from simpletransformers.classification import ClassificationModel
        self.model = ClassificationModel('bert',
                                         'bert-base-german-cased',
                                         use_cuda=cuda_available,
                                         args=args)
Example #21
def finalmodel(outfolder):  # type: (str) -> None
    """
  Trains the BERT model using the parameters currently set in
  buildbertargs().  The parameters have been explored with a
  train/test split, so this training is with the full dataset.

  :param outfolder: the folder to write the model to
  :type outfolder: str
  """

    rawdata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(rawdata.shape))
    rawdata.set_index('Clause ID', inplace=True)
    # sourcedata = helpers.dedupdf.copy(deep=True)
    # print('Deduped data: ' + str(sourcedata.shape))
    sourcedata = helpers.refdf.copy(deep=True)
    print('Raw data: ' + str(sourcedata.shape))
    sourcedata = sourcedata[sourcedata['Clause Text'].map(helpers.goodsize)]
    print('Sized data: ' + str(sourcedata.shape))
    sourcedata.set_index('Clause ID', inplace=True)

    traindata = pd.DataFrame(
        {
            'text': sourcedata['Clause Text'],
            'labels': sourcedata['Classification']
        },
        index=sourcedata.index)

    evaldata = pd.DataFrame(
        {
            'text': rawdata['Clause Text'],
            'labels': rawdata['Classification']
        },
        index=rawdata.index)

    print('Data for BERT: ' + str(traindata.shape))

    accargs = buildbertargs()
    accargs.output_dir = outfolder
    accmodel = ClassificationModel('roberta',
                                   'roberta-base',
                                   args=accargs,
                                   weight=[2, 1])
    accmodel.train_model(traindata)

    print('---------------')
    print('Training Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(traindata)
    print(result)

    print('---------------')
    print('Full Data Eval:')

    result, model_outputs, wrong_predictions = accmodel.eval_model(evaldata)
    # {'mcc': 0.9062028924099057, 'tp': 4835, 'tn': 1368, 'fp': 74, 'fn': 140, 'eval_loss': 0.18330956540325125}
    print(result)
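buildbertargs() is not shown, but since output_dir is set by attribute it evidently returns a ClassificationArgs-style object; weight=[2, 1] doubles the loss contribution of class 0 relative to class 1. A hypothetical stand-in for the builder, only to make the snippet self-contained:

from simpletransformers.classification import ClassificationArgs

def buildbertargs():
    # hypothetical stand-in; the real builder sets tuned parameters
    args = ClassificationArgs()
    args.num_train_epochs = 3
    args.overwrite_output_dir = True
    return args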
Example #22
            def train():
                wandb.init(WAND_PROJECT_NAME)
                modelArgs = {
                    "max_seq_length": self.maxSeqLength,
                    "output_dir": self.modelOutputDir,
                    "overwrite_output_dir": True,
                    "best_model_dir": self.bestModelOutputDir,
                    "wandb_project": WAND_PROJECT_NAME,
                    "num_training_epochs": wandb.config.epochs,
                    "learning_rate": wandb.config.learning_rate,
                    "do_lower_case": True,
                    "cache_dir": self.modelCacheDir,
                    "encoding": "utf-8",
                    "train_batch_size": 5,
                    "eval_batch_size": 5,
                    "evaluate_during_training_steps": 50,
                    "evaluate_during_training_verbose": True,
                    "logging_steps": 5,
                    "sliding_window": True,
                    "reprocess_input_data": True,
                    "evaluate_during_training": True,
                    "use_multiprocessing": False,
                    "regression": True
                }
                model = ClassificationModel(self.modelType,
                                            self.modelNameOrPath,
                                            args=modelArgs,
                                            sweep_config=wandb.config,
                                            use_cuda=torch.cuda.is_available(),
                                            num_labels=1)

                # Training
                try:
                    log.info(
                        "Started finetuning BERT on sentiment analysis/regression task..."
                    )
                    model.train_model(
                        train_df=self.trainDataFrame,
                        eval_df=self.evalDataFrame,
                        show_running_loss=True,
                        output_dir=self.modelOutputDir,
                        mse=sklearn.metrics.mean_squared_error,
                        r2Score=sklearn.metrics.r2_score,
                    )
                    log.info(
                        f"Finished training and evaluation of our finetuned model on sentiment analysis/regression task. Check the folder '{self.modelOutputDir}' for finetuned weights."
                    )
                    log.info(
                        f"It took {round((time.time() - startTime) / 3600, 1)} hours to train/finetune BERT model on sentiment analysis/regression task."
                    )
                except Exception:
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    err = f"Error occurred while training finetuned model on sentiment analysis/regression task. Error is: {str(exc_type)}; {str(exc_value)}."
                    log.error(err)

                wandb.join()
Example #23
    def run_trainer(self):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        print('output dir: ' + self.output_dir)

        model_args = {
            'max_seq_length': self.max_seq_length,
            'learning_rate': 4e-5,
            'num_train_epochs': self.epochs,
            'reprocess_input_data': True,
            'overwrite_output_dir': True,
            'evaluate_during_training': True,
            'evaluate_during_training_steps': 800,  # 400
            'best_model_dir': '{}/best-models'.format(self.output_dir),
            'logging_steps': 100,  # 50
            'do_lower_case': True,
            'train_batch_size': self.batch_size,
            'use_batch_norm': False,
            'tensorboard_dir': '{}/runs'.format(self.output_dir),
            'early_stopping_patience': 1,
            'save_only_best': True,
            'overwrite_last_saved': True,
            'save_steps': 0,
            'wandb_project': 'gallery',
        }
        # Create a ClassificationModel
        model = ClassificationModel(self.model_name,
                                    self.model_name + "-base-uncased",
                                    num_labels=self.num_labels,
                                    args=model_args,
                                    use_cuda=self.use_cuda)

        # Train the model
        # model.train_model(self.train_df)
        # model.train_model(self.train_df, output_dir=output_dir, eval_df=test_x, acc=accuracy_score)
        model.train_model(self.train_df,
                          output_dir=self.output_dir,
                          eval_df=self.eval_df,
                          acc=accuracy_score)

        # Evaluate the model
        # eval_df, multi_label=False, output_dir=None, verbose=True, silent=False, wandb_log=True, **kwargs
        result, model_outputs, wrong_predictions = model.eval_model(
            eval_df=self.eval_df,
            multi_label=False,
            output_dir=self.output_dir,
            verbose=True,
            silent=False,
            wandb_log=True)

        print("result: ", result)
        return model
Example #24
def trainer(train_df, OUTPUT_DIR, preproc, args):
    script_dir = os.path.dirname(__file__)
    abs_file_path = os.path.join(script_dir, args.modelConf)
    with open(abs_file_path) as f:
        model_param = json.loads(f.read())
    model_param['output_dir'] = OUTPUT_DIR
    print(model_param)
    model_name = 'bert-large-cased'
    model = ClassificationModel('bert', model_name, args=model_param, num_labels=3)
    model.train_model(train_df)
    return model
Example #25
def get_bert_base(train_sequences,
                  dev_sequences,
                  train_targets,
                  dev_targets,
                  time_constraint=1,
                  num_cpu=1,
                  max_features=1000,
                  model="bert-base",
                  weights_dir="transformers_trained",
                  cuda=False):

    # simpletransformers expects 'text' and 'labels' columns, built below.
    total_sequences_training = (train_sequences.values.tolist() +
                                dev_sequences.values.tolist())

    total_labels_training = train_targets.tolist() + dev_targets.tolist()

    train_df = pd.DataFrame()
    train_df['text'] = total_sequences_training
    train_df['labels'] = total_labels_training

    # Create a ClassificationModel
    if model == "bert-base":
        model = ClassificationModel('bert',
                                    'bert-base-cased',
                                    num_labels=len(set(total_labels_training)),
                                    args={
                                        'reprocess_input_data': True,
                                        'overwrite_output_dir': True,
                                        "output_hidden_states": True
                                    },
                                    use_cuda=cuda)

    elif model == "roberta-base":
        model = ClassificationModel('roberta',
                                    'roberta-base',
                                    num_labels=len(set(total_labels_training)),
                                    args={
                                        'output_hidden_states': True,
                                        'reprocess_input_data': True,
                                        'overwrite_output_dir': True
                                    },
                                    use_cuda=cuda)

    model.args['num_train_epochs'] = 1
    model.args['max_seq_length'] = 256
    model.args['save_eval_checkpoints'] = False
    model.args['save_model_every_epoch'] = False
    model.args['output_dir'] = weights_dir
    model.args['save_steps'] = 400

    # Train the model
    model.train_model(train_df)
    return model
Example #26
class Classifier:
    def __init__(self, model_type, model_name, use_cuda=True):
        logging.basicConfig(level=logging.INFO)
        transformers_logger = logging.getLogger("transformers")
        transformers_logger.setLevel(logging.WARNING)

        # Create a ClassificationModel
        self.model_type = model_type
        self.model_name = model_name
        self.use_cuda = use_cuda
        self.dat = {}
        self.rerun = False

    def add(self, X, Y):
        self.dat[Y] = X

    def train(self, split=0.7, num_epochs=10):
        self.le = preprocessing.LabelEncoder()
        print(list(self.dat.keys()))
        self.le.fit(list(self.dat.keys()))

        train_data = []
        eval_data = []
        for k, v in self.dat.items():
            len_train = int(round(len(v) * split))
            train_data.extend([[i, self.le.transform([k])[0]]
                               for i in v[:len_train]])

            eval_data.extend([[i, self.le.transform([k])[0]]
                              for i in v[len_train:]])

        print(train_data, eval_data)
        train_df = pd.DataFrame(train_data)
        eval_df = pd.DataFrame(eval_data)
        train_args = {
            'overwrite_output_dir': True,
            'num_train_epochs': num_epochs,
        }
        self.model = ClassificationModel(self.model_type,
                                         self.model_name,
                                         num_labels=len(list(self.dat.keys())),
                                         use_cuda=self.use_cuda,
                                         cuda_device=0,
                                         args=train_args)
        # Train the model
        self.model.train_model(train_df, eval_df=eval_df)

        # Evaluate the model
        result, model_outputs, wrong_predictions = self.model.eval_model(
            eval_df, acc=sklearn.metrics.accuracy_score)

    def predict(self, x):
        predictions, raw_outputs = self.model.predict(x)
        return self.le.inverse_transform(predictions)
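A usage sketch for the class above, on toy data (split=0.5 keeps both the train and eval sets non-empty):

clf = Classifier("bert", "bert-base-cased", use_cuda=False)
clf.add(["great product", "love it", "works well", "superb"], "positive")
clf.add(["terrible", "waste of money", "broke quickly", "awful"], "negative")
clf.train(split=0.5, num_epochs=1)
print(clf.predict(["this is great"]))  # e.g. ['positive']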
Example #27
def train(train_dataset, valid_dataset):
    logging.debug("Training is going to start")
    args = {}
    args.update(global_args["model_params"])
    model = ClassificationModel(
        "roberta",
        "models/best_model/",
        use_cuda=False,
        args=args,
    )
    model.train_model(train_dataset, eval_df=valid_dataset)
    logging.debug("Training is done")
Example #28
def main():
    script_info = pd.read_csv('./data/IMSDB/final_movie_budgets.csv', sep=',')
    script_info['Budget'] = [
        int(bud.replace(',', '')) for bud in script_info['Budget']
    ]  # reformatting budget

    # creating Budget Categories by quartile
    script_info['Bud_Cat'] = pd.qcut(script_info['Budget'], 2, labels=[0, 1])

    # get list of scripts from data folder
    scripts = []
    for file in script_info['Filename']:
        with open(file, 'r') as txt:
            scripts.append(txt.read().replace('\n', ''))

    X_train, X_test, y_train, y_test = train_test_split(scripts,
                                                        script_info['Bud_Cat'],
                                                        test_size=0.2,
                                                        random_state=0)

    docs = [
        ' '.join(tokenize_script(script, stop_words=True))
        for script in X_train
    ]
    train_docs = [list(x) for x in zip(docs, y_train)]

    train_df = pd.DataFrame(train_docs)
    train_df.columns = ["text", "labels"]

    docs = [
        ' '.join(tokenize_script(script, stop_words=True)) for script in X_test
    ]
    test_docs = [list(x) for x in zip(docs, y_test)]

    test_df = pd.DataFrame(test_docs)
    test_df.columns = ["text", 'labels']

    model_args = ClassificationArgs(sliding_window=True,
                                    overwrite_output_dir=True,
                                    num_train_epochs=3)

    model = ClassificationModel("roberta",
                                "roberta-base",
                                args=model_args,
                                use_cuda=True)

    # Train the model
    model.train_model(train_df)

    # Evaluate the model
    result, model_outputs, wrong_predictions = model.eval_model(test_df)

    print(result)
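sliding_window=True is what makes whole-script classification feasible here: each long text is split into overlapping chunks of max_seq_length tokens and the chunk predictions are pooled. The overlap is controlled by the stride argument (a fraction of max_seq_length by default); a sketch:

# Assumed refinement of the args above: windows step 80% of
# max_seq_length, i.e. 20% overlap between consecutive windows.
model_args = ClassificationArgs(sliding_window=True,
                                stride=0.8,
                                overwrite_output_dir=True)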
Example #29
def run(model_name=("distilbert", "distilbert-base-uncased")):
    # TODO: make directories in VM
    training_data, test_data = get_dataset()
    model = ClassificationModel(model_name[0], model_name[1])
    output_dir_train = "./saved_states/category3/" + model_name[0]
    output_dir_eval = "./results/category3/" + model_name[0]
    # create paths if they do not exist
    from pathlib import Path
    Path(output_dir_train).mkdir(parents=True, exist_ok=True)
    Path(output_dir_eval).mkdir(parents=True, exist_ok=True)
    model.train_model(training_data, args={"overwrite_output_dir": True}, output_dir=output_dir_train)
    result, model_outputs, wrong_predictions = model.eval_model(test_data, output_dir=output_dir_eval)
Example #30
def train_model(model_type, model_name, training_size):
    print('Starting run:', model_type, model_name, training_size)

    train_df = pd.read_csv("data/train_" + training_size + ".csv", header=None)

    train_df["text"] = train_df.iloc[:, 1] + " " + train_df.iloc[:, 2]
    train_df = train_df.drop(train_df.columns[[1, 2]], axis=1)
    train_df.columns = ["labels", "text"]

    train_df["text"] = train_df["text"].apply(lambda x: x.replace("\\", " "))
    train_df["labels"] = train_df["labels"].apply(lambda x: x - 1)

    eval_df = pd.read_csv("data/test.csv", header=None)
    eval_df["text"] = eval_df.iloc[:, 1] + " " + eval_df.iloc[:, 2]
    eval_df = eval_df.drop(eval_df.columns[[1, 2]], axis=1)
    eval_df.columns = ["labels", "text"]

    eval_df["text"] = eval_df["text"].apply(lambda x: x.replace("\\", " "))
    eval_df["labels"] = eval_df["labels"].apply(lambda x: x - 1)

    t0 = time.time()
    train_args = {
        'output_dir':
        f'model-outputs/{model_type}-{model_name}-{training_size}-outputs',
        'max_seq_length': 256,
        'num_train_epochs': 3,
        'train_batch_size': 16,
        'eval_batch_size': 32,
        'gradient_accumulation_steps': 1,
        'learning_rate': 5e-5,
        'save_steps': 50000,
        'evaluate_during_training': True,
        'evaluate_during_training_steps': 1000,
        'reprocess_input_data': True,
        'save_model_every_epoch': False,
        'overwrite_output_dir': True,
        'no_cache': True,
        'use_early_stopping': True,
        'early_stopping_patience': 3,
        'manual_seed': 4,
    }

    model = ClassificationModel(model_type,
                                model_name,
                                num_labels=4,
                                args=train_args)
    model.train_model(train_df, eval_df=eval_df)
    print('Run finished')
    t1 = time.time()
    total = t1 - t0

    print('Time:', total)
    print('--------------------')