Exemplo n.º 1
0
def train(train, test, config, seed=42, regularization=True):
    """Fit a logistic-regression classifier and score it on the test split.

    Args:
        train: ``(train_x, train_y)`` pair used for fitting.
        test: ``(test_x, test_y)`` pair used for evaluation.
        config: Object whose ``get(section, option)`` supplies the
            ``'Logistic Regression'`` options ``MaxIterations`` and ``GridCV``.
        seed: Random state passed to the SAGA solver so that repeated runs
            produce the same fit.
        regularization: When true, grid-search ``C`` for an L1-penalised
            model using stratified cross-validation and keep the best
            estimator; otherwise fit a single model without a grid search.

    Returns:
        Tuple ``(clf, test_stat_dict, weights)``: the fitted estimator, the
        value returned by ``get_stat_dict`` for the test split, and
        ``clf.coef_`` as a NumPy array.
    """
    n_iter = int(config.get('Logistic Regression', 'MaxIterations'))
    num_cv = int(config.get('Logistic Regression', 'GridCV'))

    train_x, train_y = train
    test_x, test_y = test

    # SAGA shuffles the data internally, so it needs an explicit random state
    # for reproducible coefficients.
    base = LogisticRegression(solver="saga", max_iter=n_iter,
                              random_state=seed)

    if regularization:
        grid = {"C": np.logspace(-3, 3, 50), "penalty": ["l1"]}
        search = GridSearchCV(base,
                              grid,
                              cv=StratifiedKFold(num_cv).split(
                                  train_x, train_y))
        search.fit(train_x, train_y)
        clf = search.best_estimator_
    else:
        clf = base
        clf.fit(train_x, train_y)

    # Compute the probabilities once and derive the predictions from them.
    test_probs = clf.predict_proba(test_x)
    # NOTE(review): argmax yields a column index, which equals the class label
    # only if labels are encoded 0..n_classes-1 - confirm against callers.
    test_preds = np.argmax(test_probs, axis=1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)

    weights = np.array(clf.coef_)

    return clf, test_stat_dict, weights
Exemplo n.º 2
0
 def test(self, test):
     """Evaluate ``self.model`` on a test split.

     Args:
         test: ``(test_x, test_y)`` pair. ``test_y`` is reduced to class
             indices with an argmax over axis 1 (presumably one-hot
             encoded - confirm against the caller).

     Returns:
         The value returned by ``get_stat_dict`` for the true class
         indices, the predicted probabilities and the predicted classes.
     """
     test_x, test_y = test
     probs = self.model.predict(test_x)
     preds = np.argmax(probs, axis=1)
     return get_stat_dict(np.argmax(test_y, axis=1), probs, preds)
Exemplo n.º 3
0
def train(train, test, config, seed=42):
    """Fit a random forest and score it on the test split.

    Args:
        train: ``(x, y)`` pair used for fitting.
        test: ``(test_x, test_y)`` pair used for evaluation.
        config: Object whose ``get(section, option)`` supplies the ``'RF'``
            option ``NumberTrees``.
        seed: Random state passed to the forest so that repeated runs build
            the same trees.

    Returns:
        Tuple ``(clf, test_stat_dict, feature_importance)``: the fitted
        forest, the value returned by ``get_stat_dict`` for the test split,
        and ``clf.feature_importances_``.
    """
    number_trees = int(config.get('RF', 'NumberTrees'))

    x, y = train
    test_x, test_y = test

    clf = RandomForestClassifier(n_estimators=number_trees,
                                 n_jobs=-1,
                                 random_state=seed)
    clf.fit(x, y)

    feature_importance = clf.feature_importances_

    # predict_proba already returns array data; no per-row copy is needed.
    test_probs = np.asarray(clf.predict_proba(test_x))
    test_pred = np.argmax(test_probs, axis=-1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_pred)

    return clf, test_stat_dict, feature_importance
Exemplo n.º 4
0
def train(train, test, config, seed=42, gaussian=False):
    """Grid-search an SVM classifier and score it on the test split.

    With more than two classes the SVC is wrapped in a
    ``OneVsRestClassifier`` and fitted on binarized labels; otherwise a plain
    SVC is fitted on the labels as given. Candidates are ranked by
    ``roc_auc`` under stratified cross-validation.

    Args:
        train: ``(train_x, train_y)`` pair used for fitting.
        test: ``(test_x, test_y)`` pair used for evaluation.
        config: Object whose ``get(section, option)`` supplies the ``'SVM'``
            options ``GridCV`` and ``MaxIterations``.
        seed: Random state passed to the SVC; it controls the internal
            shuffling used for the ``probability=True`` estimates.
        gaussian: When true, RBF-kernel candidates are searched in addition
            to the linear ones and no weights are returned.

    Returns:
        Tuple ``(best_estimator, test_stat_dict, weights)``. ``weights`` is
        ``None`` when ``gaussian`` is true; otherwise it is the best
        estimator's ``coef_`` (flattened to 1-D in the two-class case).
    """
    num_cv = int(config.get('SVM', 'GridCV'))
    max_iter = int(config.get('SVM', 'MaxIterations'))

    train_x, train_y = train
    test_x, test_y = test
    classes = np.unique(train_y)
    multiclass = len(classes) > 2

    # Inside OneVsRestClassifier the SVC parameters are addressed through the
    # "estimator__" prefix; a bare SVC takes them unprefixed.
    prefix = "estimator__" if multiclass else ""
    c_values = [1, 10, 100, 1000]
    grid = [{prefix + "kernel": ["linear"], prefix + "C": c_values}]
    if gaussian:
        # RBF candidates go first, matching the original search order.
        grid.insert(0, {
            prefix + "kernel": ["rbf"],
            prefix + "gamma": [1e-3, 1e-4],
            prefix + "C": c_values,
        })

    svc = SVC(probability=True, max_iter=max_iter, random_state=seed)
    if multiclass:
        estimator = OneVsRestClassifier(svc)
        fit_y = label_binarize(train_y, classes=classes)
    else:
        estimator = svc
        fit_y = train_y

    # Folds are always stratified on the original (non-binarized) labels.
    clf = GridSearchCV(estimator,
                       grid,
                       cv=StratifiedKFold(num_cv).split(train_x, train_y),
                       scoring="roc_auc",
                       n_jobs=-1)
    clf.fit(train_x, fit_y)

    # Probability estimation is expensive for SVC, so run it only once.
    test_probs = clf.predict_proba(test_x)
    test_preds = np.argmax(test_probs, axis=1)

    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)

    best = clf.best_estimator_
    if gaussian:
        weights = None
    else:
        weights = np.array(best.coef_)
        if not multiclass:
            weights = weights.reshape(-1)

    return best, test_stat_dict, weights
Exemplo n.º 5
0
        if normalization == "Standard":
            test = np.clip(scaler.transform(log_external_sub), -3, 3)

        external_labels = []
        for lab in external_labels_df:
            external_labels.append(np.where(label_set == lab)[0])
        external_labels = np.array(external_labels).reshape(-1)
        external_labels_oh = np.eye(n_values)[external_labels]

        external_data = (test, external_labels)
        external_data_oh = (test, external_labels_oh)

        if train_rf == "True":
            pred = np.argmax(rf_clf.predict_proba(test), axis=1)
            probs = rf_clf.predict_proba(test)
            rf_stats = get_stat_dict(external_labels, probs, pred)
            results_external_sub_df.loc["AUC"]["RF"] = "{:.2f}".format(
                rf_stats["AUC"])
            results_external_sub_df.loc["MCC"]["RF"] = "{:.2f}".format(
                rf_stats["MCC"])
            results_external_sub_df.loc["Precision"]["RF"] = "{:.2f}".format(
                rf_stats["Precision"])
            results_external_sub_df.loc["Recall"]["RF"] = "{:.2f}".format(
                rf_stats["Recall"])
            results_external_sub_df.loc["F1"]["RF"] = "{:.2f}".format(
                rf_stats["F1"])

        if train_svm == "True":
            pred = np.argmax(svm_clf.predict_proba(test), axis=1)
            probs = svm_clf.predict_proba(test)
            svm_stats = get_stat_dict(external_labels, probs, pred)
Exemplo n.º 6
0
def tune_mlpnn(train, test, config, train_weights=None):
    """Pick MLP hyper-parameters with a two-stage grid search.

    Stage one varies dropout rate and L2 strength with the layer count and
    width held at their starting values (2 layers, 128 nodes). Stage two
    varies layer count and width using the dropout/L2 pair chosen in stage
    one. A candidate replaces the current best only when its AUC is strictly
    higher than the best AUC seen so far across both stages.

    NOTE(review): candidates are ranked by AUC on ``test``, so the chosen
    hyper-parameters are tuned to that split - confirm this is intended.

    Args:
        train: ``(train_x, train_y)`` pair; ``train_y.shape[1]`` is used as
            the number of output classes.
        test: ``(test_x, test_y)`` pair; ``test_y`` is reduced to class
            indices with an argmax over axis 1.
        config: Object whose ``get(section, option)`` supplies the
            ``'MLPNN'`` options ``Patience``, ``BatchSize`` and
            ``LearningRate``.
        train_weights: Accepted for interface compatibility; not used.

    Returns:
        Tuple ``(best_layer, best_nodes, best_l2, best_drop)``.
    """
    train_x, train_y = train
    test_x, test_y = test
    num_class = train_y.shape[1]

    # Read loop-invariant settings once, before either search stage.
    patience = int(config.get('MLPNN', 'Patience'))
    batch_size = int(config.get('MLPNN', 'BatchSize'))
    learning_rate = float(config.get('MLPNN', 'LearningRate'))

    test_labels = np.argmax(test_y, axis=1)

    dropout_grid = [0.1, 0.3, 0.5]
    l2_grid = [0.01, 0.001, 0.0001]
    layer_grid = [1, 2]
    nodes_grid = [32, 64, 128]

    best_l2 = 0.0001
    best_drop = 0.5
    best_layer = 2
    best_nodes = 128

    best_stat = 0

    def fit_and_score(n_layers, n_nodes, drop_rate, l2_strength, verbose):
        """Train one candidate network and return its AUC on the test split."""
        reg = tf.keras.regularizers.l2(l2_strength)
        model = tf.keras.Sequential()

        for i in range(n_layers):
            model.add(
                tf.keras.layers.Dense(n_nodes,
                                      activation='relu',
                                      kernel_regularizer=reg,
                                      bias_regularizer=reg,
                                      name="fc_" + str(i)))
            model.add(tf.keras.layers.Dropout(drop_rate))

        model.add(
            tf.keras.layers.Dense(num_class,
                                  activation='softmax',
                                  kernel_regularizer=reg,
                                  bias_regularizer=reg,
                                  name="output"))

        es_cb = tf.keras.callbacks.EarlyStopping('val_loss',
                                                 patience=patience,
                                                 restore_best_weights=True)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                      loss='categorical_crossentropy')

        # First fit: early stopping against a 10% validation split.
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=1000,
                  callbacks=[es_cb],
                  validation_split=0.1)
        # Second fit: ten more epochs on the full training data.
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=10)

        probs = model.predict(test_x)
        preds = np.argmax(probs, axis=1)
        auc = get_stat_dict(test_labels, probs, preds)["AUC"]

        # clear_session() discards the Keras/TF graph state between
        # candidates; the separate tf.reset_default_graph() call is not
        # available under TensorFlow 2 and is not needed alongside it.
        tf.keras.backend.clear_session()
        return auc

    # Stage 1: dropout x L2, architecture fixed at the starting values.
    for drop_rate in dropout_grid:
        for l2_strength in l2_grid:
            auc = fit_and_score(best_layer, best_nodes, drop_rate,
                                l2_strength, verbose=1)
            if auc > best_stat:
                best_stat = auc
                best_drop = drop_rate
                best_l2 = l2_strength

    # Capture the stage-1 winners so stage 2 uses a fixed regularization pair.
    chosen_drop, chosen_l2 = best_drop, best_l2

    # Stage 2: layer count x width, regularization fixed from stage 1.
    for n_layers in layer_grid:
        for n_nodes in nodes_grid:
            auc = fit_and_score(n_layers, n_nodes, chosen_drop, chosen_l2,
                                verbose=0)
            if auc > best_stat:
                best_stat = auc
                best_layer = n_layers
                best_nodes = n_nodes

    return best_layer, best_nodes, best_l2, best_drop