def __init__(self):
        """Configure the stacking meta-classifier and its base estimators."""
        # Hyper-parameters for the meta-level logistic regression.
        meta_params = dict(
            penalty='l2',
            dual=False,
            tol=0.0001,
            C=100,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight='balanced',
            solver='sag',
            max_iter=20000,
            multi_class='multinomial',
            random_state=0,
            verbose=0,
            warm_start=False,
            n_jobs=get_n_jobs(),
        )
        self.params = {
            'description': 'Meta Stacking',
            'meta_params': meta_params,
        }

        # (predictor, flag) pairs that make up the base level of the stack.
        self.stack = [
            (LogRegIVector(), False),
            (LogRegOpensmile(), False),
            (LogRegBertClsToken(), False),
            (LogRegNGram(), False),
            (LogRegLIWC(), False),
        ]

        # Record each base estimator's own params for reporting.
        self.params['base_estimators'] = [
            predictor.params for predictor, _ in self.stack
        ]
Пример #2
0
    def __init__(self):
        """Configure logistic regression over concatenated feature sets."""
        # Logistic-regression hyper-parameters for this predictor.
        log_reg_params = dict(
            penalty='l2',
            dual=False,
            tol=0.0001,
            C=0.0001,
            fit_intercept=True,
            intercept_scaling=1,
            class_weight='balanced',
            solver='sag',
            max_iter=20000,
            multi_class='multinomial',
            random_state=0,
            verbose=0,
            warm_start=False,
            n_jobs=get_n_jobs(),
        )
        self.params = {
            'description':
            'Logistic Regression with LIWC + Opensmile + TFIDF n-gram + IVectors + Authors + BERT_CLS_TOKEN concatenated features',
            # Scale every feature into [0, 1] before fitting.
            'min_max_scaler_params': dict(feature_range=(0, 1)),
            # Word n-grams from unigrams up to 4-grams.
            'tfidf_params': dict(ngram_range=(1, 4)),
            'log_reg': log_reg_params,
        }

        # Client for the BERT service.
        self.bert_client = BertClient()
Пример #3
0
def grid_search(clf, params, x, y, positions):
    """Exhaustively search `params` for `clf` and report the best setting.

    Fits a 2-fold accuracy-scored grid search on (x, y), prints the best
    parameter set and score, and returns (best_estimator, best_score).
    """
    searcher = GridSearchCV(clf,
                            params,
                            n_jobs=get_n_jobs(),
                            scoring='accuracy',
                            verbose=1,
                            cv=2)
    # NOTE(review): `positions` is passed positionally, so GridSearchCV.fit
    # receives it as its `groups` argument — confirm this is intended and not
    # meant to reach the estimator as a fit parameter (other call sites use
    # fit_params={'train_positions': ...}).
    searcher.fit(x, y, positions)

    print("Best parameters:")
    print(searcher.best_params_)
    print("Best score: %0.3f" % searcher.best_score_)

    return searcher.best_estimator_, searcher.best_score_
Пример #4
0
 def __init__(self):
     """Configure logistic regression over author features."""
     self.params = {
         'description': 'Logistic Regression - Authors',
         'log_reg': dict(
             penalty='l2',
             dual=False,
             tol=0.001,
             C=0.001,
             fit_intercept=True,
             intercept_scaling=1,
             class_weight='balanced',
             solver='sag',
             max_iter=20000,
             multi_class='multinomial',
             random_state=0,
             verbose=0,
             warm_start=False,
             n_jobs=get_n_jobs(),
         ),
     }
Пример #5
0
 def __init__(self):
     """Configure logistic regression over Opensmile ComParE features."""
     # Scale acoustic features into [0, 1] before fitting.
     scaler_params = dict(feature_range=(0, 1))
     self.params = {
         'description': 'Logistic Regression with Opensmile ComParE',
         'min_max_scaler_params': scaler_params,
         'log_reg': dict(
             penalty='l2',
             dual=False,
             tol=0.0001,
             C=0.0001,
             fit_intercept=True,
             intercept_scaling=1,
             class_weight='balanced',
             random_state=0,
             solver='sag',
             max_iter=10000,
             multi_class='multinomial',
             verbose=0,
             warm_start=False,
             n_jobs=get_n_jobs(),
         ),
     }
 def __init__(self):
     """Configure logistic regression over character n-gram TF-IDF features."""
     # Character n-grams of length 2 through 7.
     tfidf = dict(ngram_range=(2, 7), analyzer='char')
     self.params = {
         'description': 'Logistic Regression - Character NGram TF-IDF',
         'tfidf_params': tfidf,
         'log_reg': dict(
             penalty='l2',
             dual=False,
             tol=0.0001,
             C=1,
             fit_intercept=True,
             intercept_scaling=1,
             class_weight='balanced',
             solver='sag',
             max_iter=20000,
             multi_class='multinomial',
             random_state=0,
             verbose=0,
             warm_start=False,
             n_jobs=get_n_jobs(),
         ),
     }
Пример #7
0
def main(estimator=MLP,
         cv_split=5,
         with_cross_validation=True,
         with_validation=False,
         with_test=False,
         with_external_data=False,
         validate_on_external=False,
         with_grid_search=False,
         with_full_data_tfidf=False,
         train_with_validation=False,
         predict_breaches=True,
         train_on_breach=True,
         cutoff=None):
    """Train and evaluate `estimator` according to the given flags.

    Loads training data from one of three sources (external file, breach
    directory, or the main training directory), then optionally runs
    cross-validation, a grid search, a validation pass, and/or a test pass,
    prints a results summary, and persists it when configured to.

    `cutoff`, when set, truncates the training (and validation) data to the
    first `cutoff` examples.
    """

    # --- Data loading: exactly one of the three branches runs. ---
    if validate_on_external:
        train_x, validation_x, train_y, validation_y = get_external_data(
            TRAINING_EXTERNAL_FILE, 3000, 1500)
        train_positions = []
    elif train_on_breach:
        train_x, train_y, train_positions, train_file_names = get_data(
            main_dir=BREACH_DIR, external_file=None, breach=True)
    else:
        train_x, train_y, train_positions, train_file_names = get_data(
            main_dir=TRAINING_DIR,
            external_file=TRAINING_EXTERNAL_FILE
            if with_external_data else None)

        # return print_splits(train_x[:3000], train_positions[:3000])

        # Optionally fold the validation set into the training data.
        if train_with_validation:
            validation_x, validation_y, validation_positions, validation_file_names = get_data(
                main_dir=VALIDATION_DIR)
            train_x.extend(validation_x)
            train_y.extend(validation_y)

        if cutoff:
            train_x = train_x[:cutoff]
            train_y = train_y[:cutoff]
            train_positions = train_positions[:cutoff]

    print("Training on {0} examples".format(len(train_x)))

    # Result slots: cross-validation, validation, grid-search summaries.
    clf, cv, val, gs = None, None, None, None

    if estimator:
        clf = estimator()

    if with_cross_validation:
        if with_full_data_tfidf:
            # Manual stratified K-fold so the classifier can see the fold's
            # test texts during fitting (fit_with_test).
            skf = StratifiedKFold(n_splits=cv_split,
                                  random_state=42,
                                  shuffle=True)
            all_acc = []
            X = np.array(train_x)
            y = np.array(train_y)
            for train_index, test_index in skf.split(X, y):
                y_train, y_test = y[train_index], y[test_index]
                X_train, X_test = X[train_index], X[test_index]
                print(X_train.shape)

                # NOTE(review): the full `train_positions` list is passed here
                # rather than the fold's subset — verify fit_with_test expects
                # positions aligned with X_train.
                clf.fit_with_test(X_train.tolist(), y_train, train_positions,
                                  X_test.tolist())
                predictions = clf.predict(X_test.tolist())
                all_acc.append(accuracy_score(y_test, predictions))

            print("Accuracies:", all_acc)
            print("Mean:", np.mean(all_acc))
            print("Stdev:", np.std(all_acc))

        elif train_on_breach:
            # Breach-style CV: evaluate both the binary change prediction and
            # the derived breach-position predictions per fold.
            skf = StratifiedKFold(n_splits=cv_split,
                                  random_state=42,
                                  shuffle=True)
            f_scores = []
            diff = []
            r = []
            p = []
            all_acc = []
            X = np.array(train_x)
            y = np.array(train_y)
            pos = np.array(train_positions)
            for train_index, test_index in skf.split(X, y):
                y_train, y_test = y[train_index], y[test_index]
                X_train, X_test = X[train_index], X[test_index]
                pos_train, pos_test = pos[train_index], pos[test_index]
                print(X_train.shape)

                # NOTE(review): full `train_positions` is passed instead of
                # `pos_train` (computed above but unused here) — confirm this
                # is intended.
                clf.fit_with_test(X_train.tolist(), y_train, train_positions,
                                  X_test.tolist())

                change_predictions = clf.predict(X_test.tolist())
                tn, fp, fn, tp = confusion_matrix(y_test,
                                                  change_predictions).ravel()
                print('tn: {}, fp: {}, fn: {}, tp: {}'.format(tn, fp, fn, tp))
                all_acc.append(accuracy_score(y_test, change_predictions))
                predictions = get_breach_predictions(clf, X_test.tolist(),
                                                     change_predictions)
                # Window-based diff/recall/precision/F metrics for breaches.
                totalWinDiff, totalWinR, totalWinP, totalWinF, outStr = evaluate(
                    X_test, pos_test, predictions)
                print("%s" % outStr)
                diff.append(totalWinDiff)
                r.append(totalWinR)
                p.append(totalWinP)
                f_scores.append(totalWinF)

            print("Mean diff:", np.mean(diff))
            print("Mean r:", np.mean(r))
            print("Mean p:", np.mean(p))
            print("Mean f:", np.mean(f_scores))

            print("Accuracies:", all_acc)
            print("Mean:", np.mean(all_acc))
            print("Stdev:", np.std(all_acc))

            # for m in all_measures:
            #     print("%s" % m)
            #     print('----------------------------------')

        else:
            # Plain scikit-learn cross-validation path.
            cv = cross_validate(
                estimator=clf,
                X=train_x,
                y=train_y,
                fit_params={'train_positions': train_positions},
                cv=cv_split,
                scoring="accuracy",
                n_jobs=get_n_jobs(),
                return_train_score=True)

    if with_grid_search:
        clf, best_score = __grid_search(clf, clf.get_grid_params(), train_x,
                                        train_y)
        gs = {'accuracy': best_score}

    if with_validation:
        # When validating on external data, validation_x/y were already set
        # in the loading branch above.
        if not validate_on_external:
            validation_x, validation_y, validation_positions, validation_file_names = get_data(
                main_dir=VALIDATION_DIR)

        if cutoff:
            validation_x = validation_x[:cutoff]
            validation_y = validation_y[:cutoff]

        # Time the fit + predict pass for the results summary.
        t_start = time.time()
        if with_full_data_tfidf:
            clf.fit_with_test(train_x, train_y, train_positions, validation_x)
        else:
            clf.fit(train_x, train_y, train_positions)

        if predict_breaches:
            # NOTE(review): `validation_y` is passed where other call sites
            # pass change predictions — confirm get_breach_predictions'
            # expected third argument.
            predictions = get_breach_predictions(clf, validation_x,
                                                 validation_y)
        else:
            predictions = clf.predict(validation_x)
        t_end = time.time()

        if predict_breaches:
            persist_output(OUTPUT_DIR,
                           predictions,
                           validation_file_names,
                           breach=predict_breaches)
            print("%s" %
                  evaluate(validation_x, validation_positions, predictions))
        else:
            print(ConfusionMatrix(validation_y, predictions))

            val = {
                'accuracy': accuracy_score(validation_y, predictions),
                'time': t_end - t_start
            }

    if with_test:
        test(clf, train_x, train_y, train_positions, with_full_data_tfidf)

    # Aggregate everything that was computed and report it.
    results = get_results(len(train_x),
                          clf_params=clf.params,
                          cv=cv,
                          val=val,
                          gs=gs)
    print(results)

    if config_local().get('persist_results', False):
        write_results_to_file(results)