def train(train, test, config, seed=42, regularization=True):
    """Fit a logistic-regression classifier and score it on the test split.

    Args:
        train: ``(features, labels)`` pair used for fitting.
        test: ``(features, labels)`` pair used for evaluation.
        config: Object exposing ``get(section, option)``. The
            ``'Logistic Regression'`` section must provide
            ``MaxIterations`` and ``GridCV``.
        seed: Passed to the estimator as ``random_state`` so repeated runs
            with the same seed are reproducible.
        regularization: When true, an L1-penalised model is fitted and its
            ``C`` is chosen by a stratified grid search over
            ``np.logspace(-3, 3, 50)``. Otherwise a single model with the
            estimator's default penalty settings is fitted.

    Returns:
        Tuple ``(clf, test_stat_dict, weights)``: the fitted estimator, the
        result of ``get_stat_dict`` on the test split, and ``clf.coef_`` as
        an ndarray.
    """
    n_iter = int(config.get('Logistic Regression', 'MaxIterations'))
    num_cv = int(config.get('Logistic Regression', 'GridCV'))
    train_x, train_y = train
    test_x, test_y = test

    if regularization:
        grid = {"C": np.logspace(-3, 3, 50), "penalty": ["l1"]}
        search = GridSearchCV(
            LogisticRegression(solver="saga", max_iter=n_iter,
                               random_state=seed),
            grid,
            cv=StratifiedKFold(num_cv).split(train_x, train_y))
        search.fit(train_x, train_y)
        clf = search.best_estimator_
    else:
        clf = LogisticRegression(solver="saga", max_iter=n_iter,
                                 random_state=seed)
        clf.fit(train_x, train_y)

    # Compute the probabilities once and derive the predictions from them.
    test_probs = clf.predict_proba(test_x)
    # NOTE(review): predictions are column indices of predict_proba; they
    # equal the class labels only if labels are 0..k-1 -- confirm with caller.
    test_preds = np.argmax(test_probs, axis=1)
    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)
    weights = np.array(clf.coef_)
    return clf, test_stat_dict, weights
def test(self, test):
    """Evaluate ``self.model`` on a held-out split.

    Args:
        test: ``(features, labels)`` pair. The labels are reduced with
            ``argmax`` over axis 1 before scoring, so they are expected to
            be two-dimensional (presumably one-hot -- confirm with caller).

    Returns:
        Whatever ``get_stat_dict`` returns for the true class indices, the
        model's raw output and the arg-max predictions.
    """
    test_x, test_y = test
    probs = self.model.predict(test_x)
    preds = np.argmax(probs, axis=1)
    true_labels = np.argmax(test_y, axis=1)
    return get_stat_dict(true_labels, probs, preds)
def train(train, test, config, seed=42):
    """Fit a random-forest classifier and score it on the test split.

    Args:
        train: ``(features, labels)`` pair used for fitting.
        test: ``(features, labels)`` pair used for evaluation.
        config: Object exposing ``get(section, option)``. The ``'RF'``
            section must provide ``NumberTrees``.
        seed: Passed to the forest as ``random_state`` so repeated runs with
            the same seed are reproducible.

    Returns:
        Tuple ``(clf, test_stat_dict, feature_importance)``: the fitted
        forest, the result of ``get_stat_dict`` on the test split, and
        ``clf.feature_importances_``.
    """
    number_trees = int(config.get('RF', 'NumberTrees'))
    x, y = train
    test_x, test_y = test

    clf = RandomForestClassifier(n_estimators=number_trees, n_jobs=-1,
                                 random_state=seed)
    clf.fit(x, y)
    feature_importance = clf.feature_importances_

    test_probs = np.asarray(clf.predict_proba(test_x))
    test_pred = np.argmax(test_probs, axis=-1)
    test_stat_dict = get_stat_dict(test_y, test_probs, test_pred)
    return clf, test_stat_dict, feature_importance
def train(train, test, config, seed=42, gaussian=False):
    """Grid-search an SVM, score it on the test split and return its weights.

    With more than two classes the SVC is wrapped in ``OneVsRestClassifier``
    and fitted on binarized labels; otherwise a plain SVC is fitted on the
    labels as given. The grid always contains linear kernels and, when
    ``gaussian`` is truthy, RBF kernels as well.

    Args:
        train: ``(features, labels)`` pair used for fitting.
        test: ``(features, labels)`` pair used for evaluation.
        config: Object exposing ``get(section, option)``. The ``'SVM'``
            section must provide ``GridCV`` and ``MaxIterations``.
        seed: Passed to ``SVC`` as ``random_state``.
        gaussian: Whether RBF kernels are added to the search grid.

    Returns:
        Tuple ``(best_estimator, test_stat_dict, weights)``. ``weights`` is
        ``None`` when ``gaussian`` is truthy; otherwise it is the best
        estimator's ``coef_``, flattened for the two-class case.
    """
    num_cv = int(config.get('SVM', 'GridCV'))
    max_iter = int(config.get('SVM', 'MaxIterations'))
    train_x, train_y = train
    test_x, test_y = test
    classes = np.unique(train_y)
    multiclass = len(classes) > 2

    # Inside OneVsRestClassifier the SVC parameters are addressed through
    # the "estimator__" prefix; the bare SVC takes them unprefixed.
    prefix = 'estimator__' if multiclass else ''
    c_values = [1, 10, 100, 1000]
    grid = [{prefix + 'C': c_values, prefix + 'kernel': ['linear']}]
    if gaussian:
        # Keep the RBF sub-grid first, as in the original grid layout.
        grid.insert(0, {
            prefix + 'kernel': ['rbf'],
            prefix + 'gamma': [1e-3, 1e-4],
            prefix + 'C': c_values,
        })

    svc = SVC(probability=True, max_iter=max_iter, random_state=seed)
    if multiclass:
        estimator = OneVsRestClassifier(svc)
        fit_y = label_binarize(train_y, classes=classes)
    else:
        estimator = svc
        fit_y = train_y

    clf = GridSearchCV(estimator,
                       grid,
                       cv=StratifiedKFold(num_cv).split(train_x, train_y),
                       scoring="roc_auc",
                       n_jobs=-1)
    clf.fit(train_x, fit_y)

    # Compute the probabilities once and derive the predictions from them.
    test_probs = clf.predict_proba(test_x)
    test_preds = np.argmax(test_probs, axis=1)
    test_stat_dict = get_stat_dict(test_y, test_probs, test_preds)

    if gaussian:
        weights = None
    else:
        weights = np.array(clf.best_estimator_.coef_)
        if not multiclass:
            weights = weights.reshape(-1)
    return clf.best_estimator_, test_stat_dict, weights
# Evaluate the trained models on the external data set.
# With "Standard" normalization the external matrix is passed through
# `scaler` and clipped to [-3, 3].
if normalization == "Standard":
    test = np.clip(scaler.transform(log_external_sub), -3, 3)

# Translate every external label into its position within `label_set`.
external_labels = []
for lab in external_labels_df:
    external_labels.append(np.where(label_set == lab)[0])
external_labels = np.array(external_labels).reshape(-1)
# One-hot encode the label indices by selecting rows of an identity matrix.
external_labels_oh = np.eye(n_values)[external_labels]
external_data = (test, external_labels)
external_data_oh = (test, external_labels_oh)

if train_rf == "True":
    # Compute the probabilities once and derive the predictions from them.
    probs = rf_clf.predict_proba(test)
    pred = np.argmax(probs, axis=1)
    rf_stats = get_stat_dict(external_labels, probs, pred)
    # Use a single .loc[row, column] assignment: the chained form
    # .loc[row][column] may write into a temporary object instead of the
    # results frame.
    results_external_sub_df.loc["AUC", "RF"] = "{:.2f}".format(
        rf_stats["AUC"])
    results_external_sub_df.loc["MCC", "RF"] = "{:.2f}".format(
        rf_stats["MCC"])
    results_external_sub_df.loc["Precision", "RF"] = "{:.2f}".format(
        rf_stats["Precision"])
    results_external_sub_df.loc["Recall", "RF"] = "{:.2f}".format(
        rf_stats["Recall"])
    results_external_sub_df.loc["F1", "RF"] = "{:.2f}".format(
        rf_stats["F1"])

if train_svm == "True":
    probs = svm_clf.predict_proba(test)
    pred = np.argmax(probs, axis=1)
    svm_stats = get_stat_dict(external_labels, probs, pred)
def tune_mlpnn(train, test, config, train_weights=None):
    """Pick MLP hyper-parameters with two successive grid searches.

    The first search varies dropout and L2 strength with the architecture
    fixed at 2 layers of 128 nodes. The second varies layer count and layer
    width using the best dropout and L2 found so far. A candidate replaces
    the current best only if its AUC is strictly higher than the best AUC
    seen across both searches.

    NOTE(review): candidates are ranked by their AUC on the ``test`` pair
    passed in, so callers should supply a validation split here rather than
    the final hold-out data.

    Args:
        train: ``(features, labels)`` pair; ``labels.shape[1]`` is used as
            the number of output units.
        test: ``(features, labels)`` pair used to score each candidate.
        config: Object exposing ``get(section, option)``. The ``'MLPNN'``
            section must provide ``Patience``, ``BatchSize`` and
            ``LearningRate``.
        train_weights: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(best_layer, best_nodes, best_l2, best_drop)``.
    """
    train_x, train_y = train
    test_x, test_y = test
    num_class = train_y.shape[1]

    # These settings do not change between candidates, so read them once.
    patience = int(config.get('MLPNN', 'Patience'))
    batch_size = int(config.get('MLPNN', 'BatchSize'))
    learning_rate = float(config.get('MLPNN', 'LearningRate'))

    def fit_and_score(n_layers, n_nodes, l2_weight, drop_rate, verbose):
        """Train one candidate network and return its AUC on the test pair."""
        reg = tf.keras.regularizers.l2(l2_weight)
        model = tf.keras.Sequential()
        for i in range(n_layers):
            model.add(
                tf.keras.layers.Dense(n_nodes,
                                      activation='relu',
                                      kernel_regularizer=reg,
                                      bias_regularizer=reg,
                                      name="fc_" + str(i)))
            model.add(tf.keras.layers.Dropout(drop_rate))
        model.add(
            tf.keras.layers.Dense(num_class,
                                  activation='softmax',
                                  kernel_regularizer=reg,
                                  bias_regularizer=reg,
                                  name="output"))

        es_cb = tf.keras.callbacks.EarlyStopping('val_loss',
                                                 patience=patience,
                                                 restore_best_weights=True)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
                      loss='categorical_crossentropy')
        # Early-stopped fit with a 10% validation split, followed by ten
        # more epochs on the full training data.
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=1000,
                  callbacks=[es_cb],
                  validation_split=0.1)
        model.fit(train_x,
                  train_y,
                  batch_size=batch_size,
                  verbose=verbose,
                  epochs=10)

        probs = model.predict(test_x)
        preds = np.argmax(probs, axis=1)
        stat = get_stat_dict(np.argmax(test_y, axis=1), probs, preds)

        # Drop the graph/session state before the next candidate is built.
        # NOTE(review): tf.reset_default_graph looks like the TF 1.x API --
        # confirm the TensorFlow version this file targets.
        tf.reset_default_graph()
        tf.keras.backend.clear_session()
        return stat["AUC"]

    dropout = [0.1, 0.3, 0.5]
    l2_grid = [0.01, 0.001, 0.0001]
    num_layer = [1, 2]
    num_nodes = [32, 64, 128]

    best_l2 = 0.0001
    best_drop = 0.5
    best_layer = 2
    best_nodes = 128
    best_stat = 0

    # Stage 1: regularisation (dropout x L2) with the default architecture.
    for d in dropout:
        for l2_weight in l2_grid:
            auc = fit_and_score(best_layer, best_nodes, l2_weight, d,
                                verbose=1)
            if auc > best_stat:
                best_stat = auc
                best_drop = d
                best_l2 = l2_weight

    # Stage 2: architecture (layers x nodes) with the best regularisation.
    for n_layers in num_layer:
        for n_nodes in num_nodes:
            auc = fit_and_score(n_layers, n_nodes, best_l2, best_drop,
                                verbose=0)
            if auc > best_stat:
                best_stat = auc
                best_layer = n_layers
                best_nodes = n_nodes

    return best_layer, best_nodes, best_l2, best_drop