Example #1
import numpy as np
from mlxtend.evaluate import bootstrap


def test_defaults():
    # Draw 100 samples from a normal distribution centered at 5.
    rng = np.random.RandomState(123)
    x = rng.normal(loc=5., size=100)
    # Bootstrap the sample mean with a fixed seed for reproducibility.
    original, std_err, ci_bounds = bootstrap(x, func=np.mean, seed=123)
    assert round(original, 2) == 5.03
    assert round(std_err, 2) == 0.11
    assert round(ci_bounds[0], 2) == 4.80
    assert round(ci_bounds[1], 2) == 5.26
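The call above generalizes to any statistic that reduces a sample to a single number; the num_rounds and ci arguments used in the next example control the number of resamples and the confidence level. A minimal sketch, assuming only the parameters that appear in these examples, bootstrapping the median of a skewed sample instead of the mean:

import numpy as np
from mlxtend.evaluate import bootstrap

rng = np.random.RandomState(0)
sample = rng.exponential(scale=2., size=200)

# 1000 resamples of the median with a 95% confidence interval and a fixed seed.
original, std_err, (ci_low, ci_high) = bootstrap(sample,
                                                 func=np.median,
                                                 num_rounds=1000,
                                                 ci=0.95,
                                                 seed=0)
print(original, std_err, ci_low, ci_high)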
Example #2
import os
import pickle
import numpy as np
import pandas as pd
from os.path import join


def classification_report(results_pickle, output_dir, categorical_encoder,
                          average_mechanism):
    """Generate a classification report summarizing the results of classification tasks."""
    from mlxtend.evaluate import bootstrap
    from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                                 f1_score, roc_curve, roc_auc_score,
                                 confusion_matrix)
    os.makedirs(output_dir, exist_ok=True)

    def extract_ys(Y):
        # First column holds the true labels; the rest are predicted class probabilities.
        return Y[:, 0], Y[:, 1:]

    def accuracy(Y):
        y_true, y_pred = extract_ys(Y)
        y_pred = np.argmax(y_pred, axis=1)
        return accuracy_score(y_true, y_pred)

    def recall(Y):
        y_true, y_pred = extract_ys(Y)
        y_pred = np.argmax(y_pred, axis=1)
        return recall_score(y_true, y_pred, average=average_mechanism)

    def precision(Y):
        y_true, y_pred = extract_ys(Y)
        y_pred = np.argmax(y_pred, axis=1)
        return precision_score(y_true, y_pred, average=average_mechanism)

    def f1(Y):
        y_true, y_pred = extract_ys(Y)
        y_pred = np.argmax(y_pred, axis=1)
        return f1_score(y_true, y_pred, average=average_mechanism)

    def auc(Y):
        # Weighted one-vs-rest ROC AUC; each class is weighted by the number of
        # samples that were correctly predicted as that class.
        y_true, y_pred = extract_ys(Y)
        y_pred_labels = np.argmax(y_pred, 1)
        supports = {
            i: sum((y_pred_labels[np.squeeze(y_true == i)] == i).astype(int))
            for i in np.unique(y_true)
        }
        final_auc_score = 0.
        for i in supports:
            final_auc_score += supports[i] * roc_auc_score(
                (y_true == i).astype(int),
                y_pred[:, int(i)],
                average='weighted')
        final_auc_score /= sum(list(supports.values()))
        return final_auc_score

    def to_y_probas(y_pred):
        # Softmax over the raw model outputs to obtain class probabilities.
        return np.exp(y_pred) / np.exp(y_pred).sum(1)[:, np.newaxis]

    with open(results_pickle, 'rb') as f:
        results_dict = pickle.load(f)
    # The categorical encoder is optional; fall back to integer class labels if missing.
    if os.path.exists(categorical_encoder):
        with open(categorical_encoder, 'rb') as f:
            categorical_encoder = pickle.load(f)
    else:
        categorical_encoder = None
    df_roc = []
    final_results = []
    for k in results_dict:
        y_true = results_dict[k]['y_true']
        y_true_labels = np.argmax(y_true, axis=1)[:, np.newaxis]
        classes = np.unique(y_true_labels)
        y_pred = to_y_probas(results_dict[k]['y_pred'])
        y_pred_labels = np.argmax(y_pred, 1).reshape((-1, 1))
        out_classes = classes.astype(int)
        # Map integer class indices back to their original labels when an encoder is available.
        class_labels = (out_classes if categorical_encoder is None
                        else categorical_encoder.inverse_transform(out_classes))
        pd.DataFrame(confusion_matrix(y_true_labels.astype(int).flatten(),
                                      y_pred_labels.astype(int).flatten(),
                                      labels=out_classes),
                     index=class_labels,
                     columns=class_labels).to_csv(
                         join(output_dir, '{}_confusion_mat.csv'.format(k)))
        Y = np.hstack((y_true_labels, y_pred))
        # Weight each class by the number of samples correctly predicted as that class.
        supports = {
            i: sum((y_pred_labels[np.squeeze(
                y_true_labels == i)] == i).astype(int))
            for i in classes
        }
        # Per-class ROC curves, averaged with the weights above on a common FPR grid.
        fpr = dict()
        tpr = dict()
        for i in supports:
            fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_pred[:, i])
        all_fpr = np.unique(np.concatenate([fpr[i] for i in supports]))
        mean_tpr = np.zeros_like(all_fpr)
        for i in supports:
            mean_tpr += supports[i] * np.interp(all_fpr, fpr[i], tpr[i])
        mean_tpr /= sum(list(supports.values()))
        tpr, fpr = mean_tpr, all_fpr
        df = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
        df['Legend'] = "{} Weighted ROC, AUC={}".format(k, round(auc(Y), 2))
        df_roc.append(df)
        fns = dict(accuracy=accuracy,
                   recall=recall,
                   precision=precision,
                   f1=f1,
                   auc=auc)
        # Bootstrap each metric over rows of the stacked [label | probabilities] matrix.
        for fn in fns:
            print(k, fn)
            original, std_err, ci_bounds = bootstrap(Y,
                                                     num_rounds=1000,
                                                     func=fns[fn],
                                                     ci=0.95,
                                                     seed=123)
            low, high = ci_bounds
            final_results.append([k, fn, original, std_err, low, high])
    final_results = pd.DataFrame(final_results,
                                 columns=[
                                     'DataSet', 'Metric', 'Score', 'Error',
                                     '95% CI Low', '95% CI High'
                                 ])
    df_roc = pd.concat(df_roc)
    final_results.to_csv(join(output_dir, 'final_classification_results.csv'))
    df_roc.to_csv(join(output_dir, 'Weighted_ROC.csv'))
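The key pattern in this example is that bootstrap() only resamples rows of its input array, so the true labels and the predicted probabilities are stacked column-wise into a single matrix Y and each metric function splits them back apart. A minimal, self-contained sketch of that pattern with synthetic data (the data and the accuracy metric here are illustrative, not taken from the pipeline above):

import numpy as np
from mlxtend.evaluate import bootstrap
from sklearn.metrics import accuracy_score

# Synthetic 3-class problem: random labels and random softmax probabilities.
rng = np.random.RandomState(42)
y_true_labels = rng.randint(0, 3, size=(200, 1))
logits = rng.normal(size=(200, 3))
y_probas = np.exp(logits) / np.exp(logits).sum(1)[:, np.newaxis]

# Stack labels and probabilities so each bootstrap resample keeps every label
# paired with its own prediction.
Y = np.hstack((y_true_labels, y_probas))

def accuracy(Y):
    y_true, y_pred = Y[:, 0], Y[:, 1:]
    return accuracy_score(y_true, np.argmax(y_pred, axis=1))

original, std_err, (low, high) = bootstrap(Y, num_rounds=1000,
                                           func=accuracy, ci=0.95, seed=123)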