Example #1
def reports(classifier, train_data, train_labels):
    kf = KFold(n_splits=5)
    kf.get_n_splits(train_data)
    print(kf)
    scores = []
    for train_index, test_index in kf.split(train_data):
        #print("TRAIN:", len(train_index), "TEST:", len(test_index))
        X_train, X_test = train_data[train_index], train_data[test_index]
        y_train, y_test = train_labels[train_index], train_labels[test_index]
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
        scores.append(accuracy_score(predicted, y_test))
    scores = np.array(scores)
    print("Average Accuracy K Fold: ", scores.mean())
    train_data_len = len(train_data)
    chunksize = int(train_data_len * train_test_split_ratio)

    train_x = train_data[0:chunksize]
    train_y = train_labels[0:chunksize]

    test_x = train_data[chunksize:train_data_len]
    test_y = train_labels[chunksize:train_data_len]

    classifier.fit(train_x, train_y)
    predicted = classifier.predict(test_x)
    print("Test Data Results:")
    print("Test Accuracy: ", accuracy_score(predicted, test_y))
    X = classification_report(test_y, predicted, output_dict=True)
    #print (X.keys())
    print("Sensitivity: ", X['1']['recall'])
    print("Specificity: ", X['0']['recall'])
    print("MCC: ", mcc(test_y, predicted))
    print("")
def ablation(strategy):
    model = RandomForestClassifier(n_estimators=50, oob_score=True)
    assert strategy in ["random", "important"]

    x_train = np.array(x).copy()
    y_train = np.array(y).copy()
    mccs = []
    for feat in range(len(x[0])):

        _ = model.fit(x_train, y_train)

        if strategy == "random":
            x_train = np.delete(x_train,
                                np.random.randint(len(x_train[0])),
                                axis=1)
        elif strategy == "important":
            x_train = np.delete(x_train,
                                np.argmax(model.feature_importances_),
                                axis=1)
        else:
            continue

        mccs += [mcc(np.argmax(model.oob_decision_function_, axis=1), y)]

    return mccs
Example #3
def compute_metrics(preds: np.ndarray, labels: np.ndarray) -> Dict[str, float]:
    # noinspection PyUnresolvedReferences
    acc_score = (preds == labels).mean()
    mcc_score = mcc(labels, preds)
    tot_samp = preds.shape[0]
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    tn, fp, fn, tp = ((tn / tot_samp).round(2), (fp / tot_samp).round(2),
                      (fn / tot_samp).round(2), (tp / tot_samp).round(2))
    return {"acc": acc_score, "mcc": mcc_score, "tn": tn, "fp": fp, "fn": fn, "tp": tp}
def evaluate(pred_vals, true_vals, pred_prob):
    precision, recall, thresholds = metrics.precision_recall_curve(
        true_vals, pred_prob)
    return [
        mcc(true_vals, pred_vals),
        metrics.f1_score(true_vals, pred_vals),
        metrics.precision_score(true_vals, pred_vals),
        ac(true_vals, pred_vals),
        metrics.roc_auc_score(true_vals, pred_prob),
        metrics.auc(recall, precision)
    ]
Example #5
def calc_mcc(yval, yval_rk):
    # Convert one-hot targets and predicted score rows to class indices, then compute MCC.
    ycat = np.argmax(yval_rk, axis=1)
    yreal = np.argmax(yval, axis=1)
    #print(yreal)
    #print(ycat)
    #print('MCC: ', str(mcc(yreal, ycat)))
    return mcc(yreal, ycat)
Example #6
def reportStats(weight, current_iteration, X_train, y_train, X_test, y_test):

    y_train[y_train < 0] = 0
    y_test[y_test < 0] = 0

    ypred_is = predict_all(X_train, weight)
    ypred_oos = predict_all(X_test, weight)

    np_err_handling = np.seterr(invalid='ignore')

    is_acc = acc(y_train, ypred_is)
    is_mcc = mcc(y_train, ypred_is)
    is_f1 = f1(y_train, ypred_is)
    is_mse = mse(y_train, ypred_is)

    oos_acc = acc(y_test, ypred_oos)
    oos_mcc = mcc(y_test, ypred_oos)
    oos_f1 = f1(y_test, ypred_oos)
    oos_mse = mse(y_test, ypred_oos)

    is_tn, is_fp, is_fn, is_tp = confusion_matrix(y_train, ypred_is).ravel()
    oos_tn, oos_fp, oos_fn, oos_tp = confusion_matrix(y_test,
                                                      ypred_oos).ravel()
    is_auprc = auprc(y_train, ypred_is)
    oos_auprc = auprc(y_test, ypred_oos)

    np.seterr(**np_err_handling)

    print(
        f"Consensus {current_iteration}: IS acc {is_acc:0.5f}.  IS MCC {is_mcc:0.5f}.  IS F1 {is_f1:0.5f}.  IS MSE {is_mse:0.5f}.  OOS acc {oos_acc:0.5f}.  OOS MCC {oos_mcc:0.5f}.  OOS F1 {oos_f1:0.5f}.  OOS MSE {oos_mse:0.5f}."
    )
    print(
        f"Confusion {current_iteration}: IS TP: {is_tp}, IS FP: {is_fp}, IS TN: {is_tn}, IS FN: {is_fn}, IS AUPRC: {is_auprc:0.5f}.  OOS TP: {oos_tp}, OOS FP: {oos_fp}, OOS TN: {oos_tn}, OOS FN: {oos_fn}, OOS AUPRC: {oos_auprc:0.5f}."
    )

    return is_acc, is_mcc, is_f1, is_mse, is_auprc, oos_acc, oos_mcc, oos_f1, oos_mse, oos_auprc
def run_growth(x, y):

    model = RandomForestClassifier(n_estimators=50, n_jobs=10, oob_score=True)

    _x = x.copy()
    _y = y.copy()

    index = np.random.choice(np.where(_y == 0)[0])

    X_training = _x[index].copy()
    Y_training = np.array([_y[index]])

    _x = np.delete(_x, index, 0)
    _y = np.delete(_y, index)

    index = np.random.choice(np.where(_y == 1)[0])

    X_training = np.vstack((X_training, _x[index]))
    Y_training = np.append(Y_training, _y[index])

    _x = np.delete(_x, index, 0)
    _y = np.delete(_y, index)

    mccs = []

    for _ in range(len(_x)):

        _ = model.fit(X_training, Y_training)

        mccs += [
            mcc(np.argmax(model.oob_decision_function_, axis=1), Y_training)
        ]

        index = np.random.randint(len(_x))

        X_training = np.vstack((X_training, _x[index]))
        Y_training = np.append(Y_training, _y[index])

        _x = np.delete(_x, index, 0)
        _y = np.delete(_y, index)

        #print str(len(_x)) + "\t" + str(mccs2[-1])

    return mccs
def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true)  # number of positive
    numn = n - nump  # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    prev_proba = -1
    best_proba = -1
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_id = i
                best_proba = proba
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)

        return best_proba, best_mcc, y_pred
    else:
        return best_mcc
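
Note that eval_mcc above calls mcc(tp, tn, fp, fn) with raw confusion-matrix counts, so in that snippet mcc has to be a count-based helper rather than sklearn.metrics.matthews_corrcoef. A minimal sketch of such a helper, added here only so the example is self-contained (the name and the zero-denominator guard are assumptions):

import numpy as np

def mcc(tp, tn, fp, fn):
    # Matthews correlation coefficient computed directly from confusion counts.
    numerator = tp * tn - fp * fn
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # Guard against an empty class on either margin (denominator of zero).
    return 0.0 if denominator == 0 else numerator / denominator
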
Example #9
def confusion_matrix(y_true, y_pred, decoder):
    reverse_decoder_index = {
        value: key
        for key, value in decoder.word_index.items()
    }
    space_inx = decoder.word_index["-"]
    y = np.argmax(y_true, axis=-1)
    y_ = np.argmax(y_pred, axis=-1)
    mask1 = np.greater(y, 0)
    mask2 = np.not_equal(y, space_inx)
    mask = np.logical_and(mask1, mask2)

    nclass = len(reverse_decoder_index) + 1
    mat = np.zeros([nclass, nclass], dtype=int)
    ym = y[mask]
    y_m = y_[mask]

    for i in range(len(ym)):
        mat[ym[i], y_m[i]] += 1
    sum_class_true = np.sum(mat, axis=-1)
    sum_class_pred = np.sum(mat, axis=0)
    sum_all = np.sum(sum_class_true)
    acc = np.sum(np.diagonal(mat / sum_all))
    add_epsilon = lambda x: x + 1e-10 if x == 0 else x
    div1 = sum_class_true.reshape(sum_class_true.shape[0], 1)
    div1 = np.apply_along_axis(func1d=add_epsilon, axis=1, arr=div1)
    div2 = sum_class_pred.reshape(1, sum_class_true.shape[0])
    div2 = np.apply_along_axis(func1d=add_epsilon, axis=0, arr=div2)
    recall = np.diagonal(mat / div1)
    precision = np.diagonal(mat / div2)
    add_epsilon = lambda x: np.where(x == 0.0, x + 1e-10, x)
    freq = sum_class_true / sum_all
    div3 = np.apply_along_axis(add_epsilon, axis=0, arr=recall)
    div4 = np.apply_along_axis(add_epsilon, axis=0, arr=precision)

    div5 = (1 / div3 + 1 / div4)
    f_score = 2 / div5
    m = mcc(ym, y_m)
    return mat, recall, precision, f_score, acc, freq, m
Example #10
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())
    '''
    return mcc(y_true, y_pred)
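
The docstring above preserves the original Keras-backend implementation; for reference, a hedged NumPy equivalent of the same count-based formula (a sketch, not part of the original example):

import numpy as np

def matthews_correlation_numpy(y_true, y_pred, eps=1e-7):
    # Same TP/TN/FP/FN formulation as the Keras version kept in the docstring above.
    y_true = np.asarray(y_true, dtype=float)
    y_pred_pos = np.round(np.clip(np.asarray(y_pred, dtype=float), 0, 1))
    tp = np.sum(y_true * y_pred_pos)
    tn = np.sum((1 - y_true) * (1 - y_pred_pos))
    fp = np.sum((1 - y_true) * y_pred_pos)
    fn = np.sum(y_true * (1 - y_pred_pos))
    numerator = tp * tn - fp * fn
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # eps plays the role of K.epsilon() and avoids division by zero.
    return numerator / (denominator + eps)
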
Example #11
def matthews_corrcoef(preds, labels):
    """
    Matthews correlation coefficient

    .. note::
        The implementation from ``sklearn.metrics`` is used to compute the score.

    Parameters
    ----------
    preds : list or numpy.ndarray
        A list of predictions from a model
    labels : list or numpy.ndarray
        A list of ground truth labels with the same number of elements as
        ``preds``

    Returns
    -------
    mcc_score : float
        Matthews correlation coefficient of the model
    """
    preds = _numpyfy(preds)
    labels = _numpyfy(labels)
    return mcc(preds, labels)
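
The wrapper above relies on a _numpyfy helper that is not shown; a plausible minimal stand-in, assuming it only coerces list inputs to numpy arrays (an assumption, not the library's actual implementation):

import numpy as np

def _numpyfy(values):
    # Leave numpy arrays untouched; convert lists (or other array-likes) to arrays.
    return values if isinstance(values, np.ndarray) else np.asarray(values)
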
Example #12
def evaluate(logits, labels):
    all_targets = []
    all_probs_0 = []
    all_probs_1 = []
    all_probs_2 = []
    all_probs_3 = []

    for i in range(len(logits)):
        probs = torch.nn.Softmax(dim=0)(logits[i]).detach().cpu().numpy()
        all_probs_0.extend(probs[0].ravel())
        all_probs_1.extend(probs[1].ravel())
        all_probs_2.extend(probs[2].ravel())
        all_probs_3.extend(probs[3].ravel())

        target = labels[i].numpy()

        all_targets.append(target.ravel())

    all_probs_np = np.stack([all_probs_0, all_probs_1, all_probs_2, all_probs_3], axis=1)
    all_preds_np = np.argmax(all_probs_np, axis=1)
    all_targets_np = np.hstack(all_targets)

    return f1_score(all_targets_np, all_preds_np,average='weighted'), mcc(all_targets_np, all_preds_np)

# def evaluate(logits, labels, n_classes, ignore_index = -100, fast=True):
#
#     all_probs_0 = []
#     all_targets = []
#
#
#     if n_classes == 4:
#         all_probs_1 = []
#         all_probs_2 = []
#         all_probs_3 = []
#
#     act = torch.sigmoid if n_classes==1 else torch.nn.Softmax(dim=0)
#
#     for i in range(len(logits)):
#         # prediction = act(logits[i]).detach().cpu().numpy()[-1]  # this takes last channel in multi-class, ok for 2-class
#         # logits[i] is n_classes x h x w
#         prob = act(logits[i]).detach().cpu().numpy()  # prob is n_classes x h x w
#         target = labels[i].cpu().numpy()
#
#         if n_classes==1:
#             all_probs_0.extend(prob.ravel())
#         else:
#             all_probs_0.extend(prob[0].ravel())
#             all_probs_1.extend(prob[1].ravel())
#             all_probs_2.extend(prob[2].ravel())
#             all_probs_3.extend(prob[3].ravel())
#
#         all_targets.append(target.ravel())
#
#     if n_classes == 1: all_probs_np = np.hstack(all_probs_0)
#     else: all_probs_np = np.stack([all_probs_0, all_probs_1, all_probs_2, all_probs_3], axis=1)
#
#     all_targets_np = np.hstack(all_targets)
#
#     all_probs_np = all_probs_np[all_targets_np != ignore_index]
#     all_targets_np = all_targets_np[all_targets_np!=ignore_index]
#
#     if n_classes == 4:
#         all_preds_np = np.argmax(all_probs_np, axis=1)
#         return roc_auc_score(all_targets_np, all_probs_np, multi_class='ovo',average='weighted'), f1_score(all_targets_np, all_preds_np,average='weighted')
#     else:
#         all_preds_np = all_probs_np > 0.5
#         if fast==True:
#             return fast_auc(all_targets_np>0.5, all_probs_np), f1_score(all_targets_np>0.5, all_preds_np)
#         else:
#             # return roc_auc_score(all_targets_np, all_probs_np), f1_score(all_targets_np, all_preds_np)
Example #13
    def qualitativeValidation(self):
        ''' performs validation for qualitative models '''

        # Make a copy of the original matrices
        X = self.X.copy()
        Y = self.Y.copy()

        # Get predicted classes.
        Yp = self.estimator.predict(X)

        if len(Yp) != len(Y):
            raise Exception('Length of experimental and predicted Y '
                            'do not match')

        info = []

        # Get confusion matrix for predicted Y
        try:
            self.TNpred, self.FPpred,\
            self.FNpred, self.TPpred = confusion_matrix(Y, Yp,
                                                     labels=[0, 1]).ravel()
            self.sensitivityPred = (self.TPpred / (self.TPpred + self.FNpred))
            self.specificityPred = (self.TNpred / (self.TNpred + self.FPpred))
            self.mccp = mcc(Y, Yp)

            info.append(('TPpred', 'True positives', self.TPpred))
            info.append(('TNpred', 'True negatives', self.TNpred))
            info.append(('FPpred', 'False positives', self.FPpred))
            info.append(('FNpred', 'False negatives', self.FNpred))
            info.append(('SensitivityPed', 'Sensitivity in fitting',
                         self.sensitivityPred))
            info.append(('SpecificityPred', 'Specificity in fitting',
                         self.specificityPred))
            info.append(
                ('MCCpred', 'Matthews Correlation Coefficient', self.mccp))
            LOG.debug('Computed class prediction for estimator instances')
        except Exception as e:
            LOG.error(f'Error computing class prediction of Yexp '
                      f'with exception {e}')
            raise e

        # Get cross-validated Y
        try:
            y_pred = cross_val_predict(self.estimator,
                                       X,
                                       Y,
                                       cv=self.cv,
                                       n_jobs=-1)
        except Exception as e:
            LOG.error(f'Cross-validation failed with '
                      f'exception {e}')
            raise e
        # Get confusion matrix
        try:
            self.TN, self.FP, self.FN, self.TP = confusion_matrix(
                Y, y_pred, labels=[0, 1]).ravel()
        except Exception as e:
            LOG.error(f'Failed to compute confusion matrix with '
                      f'exception {e}')
            raise e
        try:
            self.sensitivity = (self.TP / (self.TP + self.FN))
        except Exception as e:
            LOG.error(f'Failed to compute sensitivity with exception {e}')
            self.sensitivity = '-'
        try:
            self.specificity = (self.TN / (self.TN + self.FP))
        except Exception as e:
            LOG.error(f'Failed to compute specificity with exception {e}')
            self.specificity = '-'
        try:
            # Compute Matthews Correlation Coefficient
            self.mcc = (((self.TP * self.TN) - (self.FP * self.FN)) / np.sqrt(
                (self.TP + self.FP) * (self.TP + self.FN) *
                (self.TN + self.FP) * (self.TN + self.FN)))
        except Exception as e:
            LOG.error(f'Failed to compute Matthews Correlation Coefficient '
                      f'with exception {e}')
            self.mcc = '-'

        info.append(('TP', 'True positives in cross-validation', self.TP))
        info.append(('TN', 'True negatives in cross-validation', self.TN))
        info.append(('FP', 'False positives in cross-validation', self.FP))
        info.append(('FN', 'False negatives in cross-validation', self.FN))

        info.append(('Sensitivity', 'Sensitivity in cross-validation',
                     self.sensitivity))
        info.append(('Specificity', 'Specificity in cross-validation',
                     self.specificity))
        info.append(
            ('MCC', 'Matthews Correlation Coefficient in cross-validation',
             self.mcc))
        info.append(('Y_adj', 'Adjusted Y values', Y))
        info.append(('Y_adj', 'Adjusted Y values', Yp))
        info.append(
            ('Y_pred', 'Predicted Y values after cross-validation', y_pred))
        LOG.debug('Qualitative cross-validation performed')

        results = {}
        results['quality'] = info
        results['Y_adj'] = Yp
        results['Y_pred'] = y_pred
        return True, results
Example #14
def random_forest_training(X,
                           y,
                           stratify_array,
                           experiment_folder_path,
                           train_test_splits=TRAIN_TEST_SPLIT_RUN,
                           cv_nsplits=CV_NSPLITS,
                           cv_repeats=CV_REPEATS):
    """"""

    for train_test_split_run in range(train_test_splits):
        mcc_scores = []
        acc_scores = []

        # Create the folder for the current experiment
        train_test_run_folder_path = os.path.join(
            experiment_folder_path, '{}'.format(train_test_split_run))
        os.makedirs(train_test_run_folder_path, exist_ok=True)

        feat_rankings_folder = os.path.join(train_test_run_folder_path,
                                            'features_importance')
        os.makedirs(feat_rankings_folder, exist_ok=True)

        X_tr, X_ts, y_tr, y_ts, S_tr, S_ts = splt(
            X,
            y,
            stratify_array,
            test_size=0.2,
            random_state=train_test_split_run,
            stratify=stratify_array)

        print('Experiment {} out of {} ...'.format(train_test_split_run + 1,
                                                   train_test_splits),
              end=' ')

        rskf_ = rskf(n_splits=cv_nsplits,
                     n_repeats=cv_repeats,
                     random_state=42)
        cv_exp_number = 1
        for train_index, val_index in rskf_.split(X_tr, S_tr):
            X_train, X_val = X_tr[train_index], X_tr[val_index]
            y_train, y_val = y_tr[train_index], y_tr[val_index]
            forest = rfc(n_estimators=1000, n_jobs=-1)
            forest.fit(X_train, y_train)
            y_pred_val = forest.predict(X_val)
            mc = mcc(y_val, y_pred_val)
            ac = acc(y_val, y_pred_val)
            mcc_scores.append(mc)
            acc_scores.append(ac)

            # Save Feature ranking
            np.savez(os.path.join(
                feat_rankings_folder,
                'feat_ranking_{:02d}.npz'.format(cv_exp_number)),
                     ranking=forest.feature_importances_)

            rf_pickle_filepath = os.path.join(
                train_test_run_folder_path,
                'forest_{:02d}.pkl'.format(cv_exp_number))
            with open(rf_pickle_filepath, 'wb') as pickle_file:
                pickle.dump(forest, pickle_file)
            cv_exp_number += 1

        # Re-train everything from scratch on the entire training set
        forest = rfc(n_estimators=1000, n_jobs=-1)
        forest.fit(X_tr, y_tr)
        y_ts_our = forest.predict(X_ts)

        mc = mcc(y_ts, y_ts_our)
        ac = acc(y_ts, y_ts_our)

        rf_pickle_filepath = os.path.join(
            train_test_run_folder_path,
            'forest_training.pkl')
        with open(rf_pickle_filepath, 'wb') as pickle_file:
            pickle.dump(forest, pickle_file)

        # Store the logs for this experiment

        log_file_path = os.path.join(train_test_run_folder_path, 'log.csv')
        mcc_ci_min, mcc_ci_max = bootstrap_ci(np.asarray(mcc_scores))
        acc_ci_min, acc_ci_max = bootstrap_ci(np.asarray(acc_scores))

        scores = pd.DataFrame(
            {
                'ACC': np.mean(acc_scores),
                'ACC_CI_MIN': acc_ci_min,
                'ACC_CI_MAX': acc_ci_max,
                'MCC': np.mean(mcc_scores),
                'MCC_CI_MIN': mcc_ci_min,
                'MCC_CI_MAX': mcc_ci_max,
                'ACC_TEST': ac,
                'MCC_TEST': mc
            },
            index=[0])
        scores.to_csv(log_file_path, sep=',')
        print('Done')
Example #15
    def optimize(self, X, Y, estimator, tune_parameters):
        ''' optimizes a model using a grid search over a range of values for diverse parameters'''

        print("Optimizing PLS-DA algorithm")
        latent_variables = tune_parameters["n_components"]
        mcc_final = 0
        estimator0 = ""
        list_latent = []
        for n_comp in latent_variables:
            mcc0 = 0
            estimator.set_params(**{"n_components": n_comp})
            y_pred = cross_val_predict(estimator, X, Y, cv=self.cv, n_jobs=1)
            estimator1 = ""
            threshold_1 = 0
            for threshold in range(0, 100, 5):
                threshold = threshold / 100
                y_pred2 = copy.copy(y_pred)
                y_pred2[y_pred2 < threshold] = 0
                y_pred2[y_pred2 >= threshold] = 1
                mcc1 = mcc(Y, y_pred2)

                if mcc1 >= mcc0:
                    mcc0 = mcc1
                    estimator1 = copy.copy(estimator)
                    estimator1.set_params(**{'threshold': threshold})
                    threshold_1 = (threshold)

            if mcc0 >= mcc_final:
                mcc_final = mcc0
                estimator0 = copy.copy(estimator1)
                self.estimator = estimator0

            list_latent.append([n_comp, threshold_1, mcc0])

        print("MCC per lantent variable at best cutoff")
        for el in list_latent:
            print("Number of latent variables: %s \nBest cutoff: %s \nMCC: %s\n" %
                  (el[0], el[1], el[2]))

        self.estimator = estimator0
        self.estimator.fit(X, Y)
        print(self.estimator.get_params())


#### Overriding of parent methods

    # def CF_quantitative_validation(self):
    #     ''' performs validation for conformal quantitative models '''

      

    # def CF_qualitative_validation(self):
    #     ''' performs validation for conformal qualitative models '''


    # def quantitativeValidation(self):
    #     ''' performs validation for quantitative models '''

    # def qualitativeValidation(self):
    #     ''' performs validation for qualitative models '''


    # def validate(self):
    #     ''' Validates the model and computes suitable model quality scoring values'''


    # def optimize(self, X, Y, estimator, tune_parameters):
    #     ''' optimizes a model using a grid search over a range of values for diverse parameters'''


    # def regularProject(self, Xb, results):
    #     ''' projects a collection of query objects in a regular model, for obtaining predictions '''


    # def conformalProject(self, Xb, results):
    #     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''


    # def project(self, Xb, results):
    #     ''' Uses the X matrix provided as argument to predict Y'''
def evaluate_partitions(keep_bin_edges, df_processed):
    """ This function evaluates a lightweight classifier according to the thresholds.
        Inputs are a list of bin-edges for the continuous target and the processed df.
    """
    # initialize the empty lists
    accs = []
    aucs = []
    mccs = []
    apcs = []

    accs_control = []
    aucs_control = []
    mccs_control = []
    apcs_control = []

    threshs = []
    bin_pct = []

    # starting data percentile
    pct = 0.0
    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10
    # sweep through all bin edges
    for bin_edge in keep_bin_edges:

        threshold = bin_edge
        # obtain the X,y matrices
        X, X_control, y = partition_data(df_processed, threshold)
        # starting data percentile
        pct += 1 / num_bins
        for trial in range(num_trials):
            # get the training, testing, and control data-sets
            x_train_idf, y_train, x_test_idf, y_test, x_control_idf = split_transform_data(
                X, X_control, y)
            # fit the classifier
            clf = ComplementNB(alpha=0.1,
                               class_prior=None,
                               fit_prior=True,
                               norm=False)
            clf.fit(x_train_idf, y_train)

            # evaluate on test and control sets
            accs.append(clf.score(x_test_idf, y_test))
            accs_control.append(clf.score(x_control_idf, y))

            y_pred = clf.predict(x_test_idf)
            y_pred_cont = clf.predict(x_control_idf)

            mccs.append(mcc(y_test, y_pred))
            mccs_control.append(mcc(y, y_pred_cont))

            y_proba = clf.predict_proba(x_test_idf)
            y_cont_proba = clf.predict_proba(x_control_idf)

            aucs.append(roc_auc_score(y_test, y_proba[:, 1]))
            aucs_control.append(roc_auc_score(y, y_cont_proba[:, 1]))

            apcs.append(apscore(y_test, y_proba[:, 1]))
            apcs_control.append(apscore(y, y_cont_proba[:, 1]))

            threshs.append(threshold)
            bin_pct.append(pct)

    # populate into a df for downstream analysis
    df_eval = pd.DataFrame()
    df_eval['data percentile'] = bin_pct  # data percentile
    df_eval['threshold'] = threshs  # bin edge
    df_eval['test accuracy'] = accs  # accuracy
    df_eval['test mcc'] = mccs  # matthews correlation coefficient
    df_eval['test auc'] = aucs  # roc-auc
    df_eval['test ap'] = apcs  # average precision
    df_eval['control accuracy'] = accs_control
    df_eval['control mcc'] = mccs_control
    df_eval['control auc'] = aucs_control
    df_eval['control ap'] = apcs_control

    return df_eval
Example #17
    def fit(self, epoch, train_loader, verbose=True):
        X_train = []
        y_train = []
        cluster_ids_train = []
        for batch_idx, (data, y) in enumerate(train_loader):
            batch_size = data.size()[0]
            data = data.view(batch_size, -1).to(self.device)

            # Collect training data and labels for the later classifier
            X_train.append(data.cpu().numpy())
            y_train.extend(y.numpy())

            # Get the latent features
            with torch.no_grad():
                latent_X = self.autoencoder(data, latent=True)
                latent_X = latent_X.cpu().numpy()

            if self.args.clustering == "cac":
                cluster_id = self.clustering.cluster(latent_X, y,
                                                     self.args.beta,
                                                     self.args.alpha)

            else:
                # [Step-1] Update the assignment results
                cluster_id = self.clustering.update_assign(latent_X, y)

                # [Step-2] Update cluster centers in batch Clustering
                elem_count = np.bincount(cluster_id,
                                         minlength=self.args.n_clusters)

                for k in range(self.args.n_clusters):
                    # avoid empty slicing
                    if elem_count[k] == 0:
                        continue
                    # updating the cluster center
                    self.clustering.update_cluster(latent_X[cluster_id == k],
                                                   k)

            # [Step-3] Update the network parameters
            loss, rec_loss, dist_loss = self._loss(data, cluster_id)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            #             if verbose and (batch_idx+1) % self.args.log_interval == 0:
            msg = 'Epoch: {:02d} | Batch: {:03d} | Loss: {:.3f} | Rec-' \
                  'Loss: {:.3f} | Dist-Loss: {:.3f}'
            print(
                msg.format(epoch, batch_idx + 1,
                           loss.detach().cpu().numpy(), rec_loss, dist_loss))

        X_train = np.vstack(X_train)
        self.eval()
        if self.args.clustering == "cac":
            with torch.no_grad():
                latent_X_train = self.autoencoder(torch.FloatTensor(
                    np.array(X_train)).to(self.args.device),
                                                  latent=True)
                latent_X_train = latent_X_train.to(self.args.device).numpy()

            cluster_ids_train = self.clustering.update_assign(latent_X_train)
            y_train = np.array(y_train)
            X_train = latent_X_train

            print("Training Base classifier")
            classifier = self.get_classifier(self.classifier)
            classifier.fit(X_train, y_train)
            self.base_classifier.append(classifier)
            print("Base Training F1:",
                  f1_score(y_train,
                           classifier.predict(X_train).ravel()))
            print("Base Training MCC:",
                  mcc(y_train,
                      classifier.predict(X_train).ravel()))
            print(
                "Base Training AUC:",
                roc_auc_score(y_train,
                              classifier.predict_proba(X_train)[:, 1]))

            print("Training CAC classifiers")

            self.cluster_classifiers.append([])
            y_pred = []
            y_true = []
            y_pred_proba = []
            for j in range(self.args.n_clusters):
                cluster_indices = np.where(cluster_ids_train == j)[0]
                X_cluster = X_train[cluster_indices]
                y_cluster = y_train[cluster_indices]
                y_true.extend(y_cluster)
                classifier = self.get_classifier(self.classifier)
                if np.unique(y_cluster).shape[0] > 1:
                    classifier.fit(X_cluster, y_cluster.ravel())
                    print("CAC Training F1:",
                          f1_score(y_cluster, classifier.predict(X_cluster)))
                    print("CAC Training MCC:",
                          mcc(y_cluster, classifier.predict(X_cluster)))
                    print(
                        "CAC Training AUC:",
                        roc_auc_score(
                            y_cluster,
                            classifier.predict_proba(X_cluster)[:, 1]))
                    y_pred.extend(classifier.predict(X_cluster))
                    y_pred_proba.extend(
                        classifier.predict_proba(X_cluster)[:, 1])
                else:
                    print("Fitting random classifier, Iteration:", j)
                    tmp = np.random.randint(2, size=y_cluster.shape[0])
                    y_pred.extend(tmp)
                    y_pred_proba.extend(tmp)
                    classifier.fit(X_cluster, tmp)
                self.cluster_classifiers[-1].append(classifier)
            print("Final CAC Training F1:", f1_score(y_true, y_pred))
            print("Final CAC Training MCC:", mcc(y_true, y_pred))
            print("Final CAC Training AUC:",
                  roc_auc_score(y_true, y_pred_proba))
Example #18
def matthews_corrcoef(preds, labels):
    preds = numpyfy(preds)
    labels = numpyfy(labels)
    return mcc(preds, labels)
Example #19
def param_search_lgb(
    params: dict,
    n_iter: int,
    X_train,
    y_train,
    cv=None,
    learner_n_jobs: int = -1,
    search_n_jobs: int = 1,
    X_test=None,
    y_test=None,
    device="cpu",
    cv_filename="cv_results.h5",
    params_filename="params.txt",
    **kwargs,
) -> dict:
    """Assissted param search for lightgbm classifier.

    Holds max_depth fixed, iterates through num_leaves, then performs a random
    search for each (max_depth, num_leaves) combo.

    This is useful because the relationship num_leaves < 2**max_depth should
    hold. If blindly using a random search, the pair selected may violate
    this condition.

    The method writes cv results into an HDF5 file, one key per
    (max_depth, num_leaves) combination tried.

    It also writes the best params into a text file.

    Parameters
    ----------
    params : dict
        Random search parameter dict. Must have {'max_depth', 'num_leaves'}.
    n_iter : int
        number of searches
    X_train : TYPE

    y_train : TYPE

    cv : None, optional
        cross-validation indices if given.
    learner_n_jobs : int, optional
        default -1, use all cpus for learner fitting
    search_n_jobs : int, optional
        default 1, use only 1 cpu for search. this is because by default the
        learner is allowed to use all CPUs already.
    X_test : None, optional
        test/valid data
    y_test : None, optional
        test/valid targets
    device : str, optional
        default 'cpu', can be either of {'cpu', 'gpu'}
    cv_filename : str, optional
        file to store cv results
    params_filename : str, optional
        file to store best params
    **kwargs
        kwargs passed to sklearn.RandomizedSearchCV

    Returns
    -------
    dict
    Keys:
    -----
    best_param:
        best parameters
    best_score:
        best achieved score
    best_learner:
        fitted best model
    """
    # some params
    max_depth = params.pop("max_depth")
    num_leaves = params.pop("num_leaves")
    assert max_depth is not None
    assert len(max_depth) == 1
    assert num_leaves is not None
    assert len(num_leaves) > 0

    max_depth = max_depth[0]
    out = dict()

    best_score = None
    best_params = None
    best_learner = None
    start = time.process_time()
    for n in num_leaves:
        print(f"max_depth = {max_depth}, num_leaves = {n}...")
        learner = lgb.LGBMClassifier(
            max_depth=max_depth,
            num_leaves=n,
            # boosting_type='gbdt',
            # objective='xentropy',
            # eval_metric='binary_logloss',
            # early_stopping_rounds=100,
            # verbose_eval=200,
            device=device,
            # verbosity=lgb_verbosity,
            n_jobs=learner_n_jobs,
        )

        rs = RandomizedSearchCV(
            learner,
            params,
            cv=cv,
            n_jobs=search_n_jobs,
            n_iter=n_iter,
            return_train_score=False,
            **kwargs,
        )
        # model_selection._search.format_results() sometimes has a bug and
        # returns nothing, causing ValueError when unpacking output.
        rs.fit(X_train, y_train, verbose=-1)

        if best_score is None:
            best_score = rs.best_score_
            best_learner = rs.best_estimator_

            best_params = rs.best_params_.copy()
            best_params["max_depth"] = max_depth
            best_params["num_leaves"] = n
        elif best_score < rs.best_score_:
            best_score = rs.best_score_
            best_learner = rs.best_estimator_

            best_params = rs.best_params_.copy()
            best_params["max_depth"] = max_depth
            best_params["num_leaves"] = n

        key = f"max_depth_{max_depth}_num_leaves_{n}"

        # store this search object for later use
        out[key] = rs

        # save cv scores
        if cv is not None:
            pd.DataFrame(rs.cv_results_).to_hdf(cv_filename, key=key, mode="a")

        # predict test set.
        if X_test is not None and y_test is not None:
            assert len(X_test) == len(y_test)
            y_pred = rs.predict(X_test)
            train_loss = rs.score(X_train, y_train)
            test_loss = rs.score(X_test, y_test)
            test_mcc = mcc(y_test, y_pred)

            msg = (f"{key}, Dev score: {train_loss:.3f}, Test score: " +
                   f"{test_loss:.3f}, Test MCC: {test_mcc:.3f}\n")
            print(msg)

            print(f"{key}, Save TSCV Best params = {best_params}")
            with open(params_filename, "a") as fp:
                fp.write(msg)
                fp.write(str(best_params))
                fp.write("\n")

    time_taken = time.process_time() - start
    print("Time taken (s): ", time_taken)

    # write final best param
    with open(params_filename, "a") as fp:
        # convert to python types for json writes
        # best_params = {k: np.asscalar(v) for k, v in best_params.items()}
        # fp.write(json.dumps(best_params))
        fp.write("Final result:\n")
        fp.write(str(best_params))
        fp.write("\n\n")

    out["best_params"] = best_params
    out["best_score"] = best_score
    out["best_learner"] = best_learner

    return out
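
A hedged usage sketch for param_search_lgb; the data and parameter grid below are hypothetical and only illustrate the expected argument shapes (one fixed max_depth, several num_leaves candidates, and a search space for the remaining parameters):

import numpy as np

# Hypothetical toy data: 200 samples, 10 features, binary labels.
X_train = np.random.rand(200, 10)
y_train = np.random.randint(0, 2, size=200)

search_params = {
    "max_depth": [6],                    # exactly one value, as the assertions require
    "num_leaves": [15, 31, 63],          # candidates satisfying num_leaves < 2**max_depth
    "learning_rate": [0.01, 0.05, 0.1],  # sampled by RandomizedSearchCV
    "n_estimators": [100, 200, 400],
}

# cv=None keeps RandomizedSearchCV's default folds and skips the HDF5 dump of cv_results_.
result = param_search_lgb(search_params, n_iter=5, X_train=X_train, y_train=y_train, cv=None)
print(result["best_params"], result["best_score"])
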
Example #20
X_t_2 = enc_t.transform(X_t_1)

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_t_1,
                                                            y,
                                                            random_state=0)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_t_2,
                                                            y,
                                                            random_state=0)

print('Accuracy on dataset converted from label to integer category:')

logistic_regression = LogisticRegression(random_state=0)
clf_lr = logistic_regression.fit(X_train_1, y_train_1)
acc_lr = clf_lr.score(X_test_1, y_test_1)
lr_pred = logistic_regression.predict(X_test_1)
lr_mcc = mcc(y_test_1, lr_pred)
print('logistic regression accuracy: {}'.format(acc_lr))
print('logistic regression MCC: {}'.format(lr_mcc))

naive_bayes = BernoulliNB()
clf_bnb = naive_bayes.fit(X_train_1, y_train_1)
acc_bnb = clf_bnb.score(X_test_1, y_test_1)
nb_pred = naive_bayes.predict(X_test_1)
nb_mcc = mcc(y_test_1, nb_pred)
print('naive bayes accuracy: {}'.format(acc_bnb))
print('naive bayes MCC: {}'.format(nb_mcc))

gradient_boosting = xgboost.XGBClassifier()
clf_xb = gradient_boosting.fit(X_train_1, y_train_1)
acc_xb = clf_xb.score(X_test_1, y_test_1)
gb_pred = gradient_boosting.predict(X_test_1)
Example #21
def eval_model(model_type="ord", look_at_test_set=False, dropout=0):
    #Use the same dropout and number of epochs across models (initial cross-
    #validations were used to select)
    epochs = 40
    testscore = None
    xtrain, xval, ytrain, yval, xtest, ytest = load_data(model_type)
    if model_type == "rf":
        if look_at_test_set == True:
            model = RandomForestClassifier(n_estimators=1000,
                                           n_jobs=3,
                                           min_samples_split=35,
                                           min_samples_leaf=5,
                                           oob_score=True)
            xtrain = np.vstack([xtrain, xval])
            ytrain = np.concatenate([ytrain, yval])
            model.fit(xtrain[:, 0:180], ytrain, sample_weight=xtrain[:, -1])
            trainpreds, valpreds, testpreds = model.predict(xtrain[:,0:180]), model.predict(xval[:,0:180]),\
                                          model.predict(xtest[:,0:180])
        else:
            model = RandomForestClassifier(n_estimators=1000,
                                           n_jobs=3,
                                           min_samples_split=35,
                                           min_samples_leaf=5,
                                           oob_score=True)
            model.fit(xtrain[:, 0:180], ytrain, sample_weight=xtrain[:, -1])
            trainpreds, valpreds, testpreds = model.predict(xtrain[:,0:180]), model.predict(xval[:,0:180]),\
                                          model.predict(xtest[:,0:180])
    elif model_type == "enrich":
        model = enrichment_nn(dropout=dropout, input_dim=180, l2=0.0000)
        model.trainmod(xtrain,
                       epochs=epochs,
                       minibatch=250,
                       lr=0.005,
                       use_weights=True)
        trainpreds, valpreds, testpreds = model.predict(xtrain[:,0:180])[1], model.predict(xval[:,0:180])[1],\
                                          model.predict(xtest[:,0:180])[1]
        trainscore = r2_score(ytrain, trainpreds)
        valscore = r2_score(yval, valpreds)
        if look_at_test_set == True:
            testscore = r2_score(ytest, testpreds)
    else:
        model_dict = {
            "ord": ord_nn,
            "nom": nominal_classifier,
            "bin": bin_class_nn
        }
        dropout_dict = {"ord": 0.3, "nom": 0.3, "bin": 0.4}
        model = model_dict[model_type](dropout=dropout_dict[model_type],
                                       input_dim=180,
                                       l2=0.0000)
        if model_type == "bin" and look_at_test_set == True:
            xtrain = np.vstack([xtrain, xval])
            ytrain = np.concatenate([ytrain, yval])
        model.trainmod(xtrain,
                       epochs=epochs,
                       minibatch=250,
                       lr=0.005,
                       use_weights=True)
        trainpreds, valpreds, testpreds = model.predict(xtrain[:,0:180])[1], model.predict(xval[:,0:180])[1],\
                                          model.predict(xtest[:,0:180])[1]
    if model_type != "enrich":
        trainscore = mcc(trainpreds, ytrain)
        valscore = mcc(valpreds, yval)
        if look_at_test_set == True:
            testscore = mcc(testpreds, ytest)
    print_model_eval_results(trainscore,
                             valscore,
                             testscore,
                             model_description=model_type)
Example #22
    def optimize(self, X, Y, estimator, tune_parameters):
        ''' optimizes a model using a grid search over a 
        range of values for diverse parameters'''

        LOG.info('Optimizing PLS-DA algorithm using local '
                 'implementation of gridsearch cv specially designed '
                 'for PLS discriminant analysis')
        # Max number of latent variables
        latent_variables = tune_parameters["n_components"]
        # Matthews correlation coefficient of best threshold
        mcc_final = 0
        estimator0 = ""
        # List to add the best threshold and Matthews correlation
        # coefficient for each number of latent variables
        list_latent = []
        try:
            for n_comp in latent_variables:
                mcc0 = 0
                estimator.set_params(**{"n_components": n_comp})
                y_pred = cross_val_predict(estimator,
                                           X,
                                           Y,
                                           cv=self.cv,
                                           n_jobs=1)
                estimator1 = ""
                threshold_1 = 0
                # Get optimum threshold
                for threshold in range(0, 100, 5):
                    threshold = threshold / 100
                    y_pred2 = copy.copy(y_pred)
                    y_pred2[y_pred2 < threshold] = 0
                    y_pred2[y_pred2 >= threshold] = 1
                    mcc1 = mcc(Y, y_pred2)
                    # Update threshold value with current best value
                    if mcc1 >= mcc0:
                        mcc0 = mcc1
                        estimator1 = copy.copy(estimator)
                        estimator1.set_params(**{'threshold': threshold})
                        threshold_1 = (threshold)
                # Assign class estimator the best current estimator
                if mcc0 >= mcc_final:
                    mcc_final = mcc0
                    estimator0 = copy.copy(estimator1)
                    self.estimator = estimator0

                list_latent.append([n_comp, threshold_1, mcc0])
        except Exception as e:
            LOG.error(f'Error optimizing PLS-DA with exception {e}')
            raise e

        LOG.debug('Number of latent variables, Best cutoff, and its Matthews '
                  'correlation coefficient')
        for lv in list_latent:
            LOG.debug(f'Number of latent variables: '
                      f'{lv[0]} \nBest cutoff: {lv[1]} \nMCC: {lv[2]}\n')

        self.estimator.fit(X, Y)
        LOG.info(f'Estimator best parameters: {self.estimator.get_params()}')


#### Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
    with open(save_dir_matFiles + 'name_loc_prob.pkl', 'wb') as fid:
        pickle.dump(name_loc_prob, fid)



    test_pred = np.around(test_pred)


    C_test = confusion_matrix(Y_test, test_pred)


    per_class_accuracy_test = np.diag(C_test.astype(np.float32)) / np.sum(C_test.astype(np.float32), axis=1)
    print(per_class_accuracy_test)

    print("testing mcc score:", mcc(Y_test, test_pred))
    print("testing F1 score:", f1(Y_test, test_pred))













Example #24
#%%    
    # train classifiers random forest classifier
    # label: GC, data: third hop attribute
    print("-------TRAINING CLASSIFIERS-------")
#    second_hop_attri = cPickle.load(open(save_dir_matFiles+'second_hop_attributes.pkl', 'rb'))
    
    clf1 = RandomForestClassifier(n_jobs=int(multiprocessing.cpu_count()/2), verbose=0, class_weight='balanced',n_estimators=n_estimators_rf)
    label_flatten = GC.reshape((-1))
    
    if classifier == 1: # use first hop attribute only
        one_hop_attri_flatten = one_hop_attri.reshape((-1, 12))
        
        clf1.fit(one_hop_attri_flatten, label_flatten)
        predicted_train_label = clf1.predict(one_hop_attri_flatten)
        mcc_score = mcc(label_flatten, predicted_train_label)
        
    elif classifier == 2: # use second hop attributes only
        sec_hop_flatten = second_hop_attri.reshape((-1, 50))
        
        clf1.fit(sec_hop_flatten, label_flatten)
        predicted_train_label = clf1.predict(sec_hop_flatten)
        mcc_score = mcc(label_flatten, predicted_train_label)
        
    elif classifier == 3: # use third hop attribute only
        
        third_hop_flatten = third_hop_attri.reshape((-1, 150))
        
        clf1.fit(third_hop_flatten, label_flatten)
        predicted_train_label = clf1.predict(third_hop_flatten)
        mcc_score = mcc(label_flatten, predicted_train_label)
Example #25
    def external_validation(self):
        ''' when experimental values are available for the predicted compounds,
        run external validation '''

        ext_val_results = []

        # Ye are the y values present in the input file
        Ye = np.asarray(self.conveyor.getVal("ymatrix"))

        # For qualitative models, make sure the Y is qualitative as well
        if not self.param.getVal("quantitative"):
            qy, message = utils.qualitative_Y(Ye)
            if not qy:
                self.conveyor.setWarning(
                    f'No qualitative activity suitable for external validation "{message}". Skipping.'
                )
                LOG.warning(
                    f'No qualitative activity suitable for external validation "{message}". Skipping.'
                )
                return

        # there are four variants of external validation, depending if the method
        # if conformal or non-conformal and the model is qualitative and quantitative

        if not self.param.getVal("conformal"):

            # non-conformal
            if not self.param.getVal("quantitative"):

                # non-conformal & qualitative
                Yp = np.asarray(self.conveyor.getVal("values"))

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                # the use of labels is compulsory to inform the confusion matrix that
                # it must return a 2x2 confusion matrix. Otherwise it will fail when
                # a single class is represented (all TP, for example)
                TN, FP, FN, TP = confusion_matrix(Ye, Yp, labels=[0,
                                                                  1]).ravel()

                # protect to avoid warnings in special cases (div by zero)
                MCC = mcc(Ye, Yp)

                if (TP + FN) > 0:
                    sensitivity = (TP / (TP + FN))
                else:
                    sensitivity = 0.0

                if (TN + FP) > 0:
                    specificity = (TN / (TN + FP))
                else:
                    specificity = 0.0

                ext_val_results.append(
                    ('TP', 'True positives in external-validation', float(TP)))
                ext_val_results.append(
                    ('TN', 'True negatives in external-validation', float(TN)))
                ext_val_results.append(
                    ('FP', 'False positives in external-validation',
                     float(FP)))
                ext_val_results.append(
                    ('FN', 'False negatives in external-validation',
                     float(FN)))
                ext_val_results.append(
                    ('Sensitivity', 'Sensitivity in external-validation',
                     float(sensitivity)))
                ext_val_results.append(
                    ('Specificity', 'Specificity in external-validation',
                     float(specificity)))
                ext_val_results.append(
                    ('MCC',
                     'Matthews Correlation Coefficient in external-validation',
                     float(MCC)))

            else:

                # non-conformal & quantitative
                Yp = np.asarray(self.conveyor.getVal("values"))

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                Ym = np.mean(Ye)
                nobj = len(Yp)

                SSY0_out = np.sum(np.square(Ym - Ye))
                SSY_out = np.sum(np.square(Ye - Yp))
                scoringP = mean_squared_error(Ye, Yp)
                SDEP = np.sqrt(SSY_out / (nobj))
                if SSY0_out == 0:
                    Q2 = 0.0
                else:
                    Q2 = 1.00 - (SSY_out / SSY0_out)

                ext_val_results.append(('scoringP', 'Scoring P', scoringP))
                ext_val_results.append(
                    ('Q2', 'Determination coefficient in cross-validation',
                     Q2))
                ext_val_results.append(
                    ('SDEP', 'Standard Deviation Error of the Predictions',
                     SDEP))

            self.conveyor.addVal(ext_val_results, 'external-validation',
                                 'external validation', 'method', 'single',
                                 'External validation results')

        else:
            # conformal external validation

            if not self.param.getVal("quantitative"):

                # conformal & qualitative
                Yp = np.concatenate(
                    (np.asarray(self.conveyor.getVal('c0')).reshape(-1, 1),
                     np.asarray(self.conveyor.getVal('c1')).reshape(-1, 1)),
                    axis=1)

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                c0_correct = 0
                c1_correct = 0
                not_predicted = 0
                c0_incorrect = 0
                c1_incorrect = 0

                Ye1 = []
                Yp1 = []
                for i in range(len(Ye)):
                    real = float(Ye[i])
                    predicted = Yp[i]
                    if predicted[0] != predicted[1]:
                        Ye1.append(real)
                        if predicted[0]:
                            Yp1.append(0)
                        else:
                            Yp1.append(1)

                        if real == 0 and predicted[0] == True:
                            c0_correct += 1
                        if real == 0 and predicted[1] == True:
                            c0_incorrect += 1
                        if real == 1 and predicted[1] == True:
                            c1_correct += 1
                        if real == 1 and predicted[0] == True:
                            c1_incorrect += 1
                    else:
                        not_predicted += 1
                MCC = mcc(Ye1, Yp1)
                TN = c0_correct
                FP = c0_incorrect
                TP = c1_correct
                FN = c1_incorrect
                coverage = float((len(Yp) - not_predicted) / len(Yp))

                try:
                    # Compute accuracy (% of correct predictions)
                    conformal_accuracy = (float(TN + TP) /
                                          float(FP + FN + TN + TP))
                except Exception as e:
                    LOG.error(f'Failed to compute conformal accuracy with'
                              f'exception {e}')
                    conformal_accuracy = '-'

                if (TP + FN) > 0:
                    sensitivity = (TP / (TP + FN))
                else:
                    sensitivity = 0.0
                if (TN + FP) > 0:
                    specificity = (TN / (TN + FP))
                else:
                    specificity = 0.0

                ext_val_results.append(
                    ('TP', 'True positives in external-validation', float(TP)))
                ext_val_results.append(
                    ('TN', 'True negatives in external-validation', float(TN)))
                ext_val_results.append(
                    ('FP', 'False positives in external-validation',
                     float(FP)))
                ext_val_results.append(
                    ('FN', 'False negatives in external-validation',
                     float(FN)))
                ext_val_results.append(
                    ('Sensitivity', 'Sensitivity in external-validation',
                     float(sensitivity)))
                ext_val_results.append(
                    ('Specificity', 'Specificity in external-validation',
                     float(specificity)))
                ext_val_results.append(
                    ('MCC',
                     'Matthews Correlation Coefficient in external-validation',
                     float(MCC)))
                ext_val_results.append(
                    ('Conformal_coverage',
                     'Conformal coverage in external-validation',
                     float(coverage)))
                ext_val_results.append(
                    ('Conformal_accuracy',
                     'Conformal accuracy in external-validation',
                     float(conformal_accuracy)))

                self.conveyor.addVal(ext_val_results, 'external-validation',
                                     'external validation', 'method', 'single',
                                     'External validation results')
            else:

                # conformal & quantitative
                Yp_lower = self.conveyor.getVal('lower_limit')
                Yp_upper = self.conveyor.getVal('upper_limit')

                mean_interval = np.mean(np.abs(Yp_lower) - np.abs(Yp_upper))
                inside_interval = (Yp_lower.reshape(-1, 1) <
                                   Ye) & (Yp_upper.reshape(-1, 1) > Ye)
                accuracy = len(inside_interval) / len(Ye)
                conformal_accuracy = float("{0:.2f}".format(accuracy))
                conformal_mean_interval = float(
                    "{0:.2f}".format(mean_interval))

                ext_val_results.append(
                    ('Conformal_mean_interval', 'Conformal mean interval',
                     conformal_mean_interval))
                ext_val_results.append(
                    ('Conformal_accuracy', 'Conformal accuracy',
                     conformal_accuracy))

                self.conveyor.addVal(ext_val_results, 'external-validation',
                                     'external validation', 'method', 'single',
                                     'External validation results')
Example #26
    print("Testing model...")
    model.eval()
    for i in tqdm(range(nb_data_test)):
        out = model(th.Tensor(signals[None, i, :, :])).detach().numpy()

        auc_c0.add(out[None, 0, 0], target[None, i, 0])
        auc_c1.add(out[None, 0, 1], target[None, i, 1])
        auc_c2.add(out[None, 0, 2], target[None, i, 2])

        res[i, 0] = 1 if out[0, 0] > 0.5 else -1
        res[i, 1] = 1 if out[0, 1] > 0.5 else -1
        res[i, 2] = 1 if out[0, 2] > 0.5 else -1

    target = np.where(target == 1, 1, -1)

    mcc_canal_0 = mcc(target[:, 0], res[:, 0])
    mcc_canal_1 = mcc(target[:, 1], res[:, 1])
    mcc_canal_2 = mcc(target[:, 2], res[:, 2])

    print("\nMCC")
    print("Canal 0 : %d" % (mcc_canal_0,))
    print("Canal 1 : %d" % (mcc_canal_1,))
    print("Canal 2 : %d" % (mcc_canal_2,))

    print("\nROC AUC")
    print("Canal 0 %f" % (auc_c0.value()[0]))
    print("Canal 1 %f" % (auc_c1.value()[0]))
    print("Canal 2 %f" % (auc_c2.value()[0]))


Example #27
            text_cols='min_toehold_sequence',
            label_cols='Toehold Rating',
            bs=128,
            backwards=True)
        # assign this data to the trained learner
        learn_cr.data = data_classify_testr
        # compute metrics
        preds, _, _ = learn_cr.get_preds(ordered=True, with_loss=True)
        roc_aucr = roc_auc_score(df_test['Toehold Rating'], preds[:, 1])
        acur, mccr = nuspeak.get_metrics(learn_cr, return_metrics=True)

        test_df['shufftok_class'] = test_df['shufftok_toehold'].apply(
            lambda x: pred_class(x, learn_cf))
        y_test_true = test_df['Toehold Rating']
        y_test_shufftok = test_df['shufftok_class']
        mcc_c1 = mcc(y_test_true, y_test_shufftok)
        c1_scores.append(
            accuracy_score(y_test_true, y_test_shufftok, normalize=True))
        mccs_c1.append(mcc_c1)

        test_df['shuffchar_class'] = test_df['scrambled_toehold'].apply(
            lambda x: pred_class(x, learn_cf))
        y_test_shuffchar = test_df['shuffchar_class']
        mcc_c2 = mcc(y_test_true, y_test_shuffchar)
        c2_scores.append(
            accuracy_score(y_test_true, y_test_shuffchar, normalize=True))
        mccs_c2.append(mcc_c2)

        scores.append((acuf + acur) / 2)
        aucs.append((roc_aucf + roc_aucr) / 2)
        mccs.append((mccf + mccr) / 2)
Example #28
#view the results
#Usage: python results.py resultfile

import pandas as pd
import sys
from sklearn.metrics import accuracy_score, roc_auc_score as roc, auc, classification_report, balanced_accuracy_score, matthews_corrcoef as mcc
df = pd.read_csv(
    sys.argv[1],
    header=None,
    sep=',',
)
y_true = df[2]
y_pred = df[3]
y_predprob = df[4]
#print(y_test)
print("Accuracy:", accuracy_score(y_true, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_true, y_pred))
#print(auc(y_true, y_predprob))
print(classification_report(y_true, y_pred))
print("MCC:", mcc(y_true, y_pred))
print("ROC_AUC:", roc(y_true, y_predprob))