plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d" %
                n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pair counts')

        ratios = projected_dists / dists
        print("Mean distance ratio: %0.2f (std %0.2f)"
              % (np.mean(ratios), np.std(ratios)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))

        plt.figure()
        plt.hist(ratios, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distance ratio: projected / original")
        plt.ylabel("Distribution of sample pairs")
        plt.title("Histogram of pairwise distance ratios for n_components=%d" %
                  n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        plt.clf()
        # TODO: compute the expected value of eps and add them to the previous plot
        # as vertical lines / region
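        # A possible approach (a sketch, not the repository's code): the JL
        # lemma guarantees squared-distance ratios within [1 - eps, 1 + eps]
        # once n_components >= 4 * ln(n_samples) / (eps**2 / 2 - eps**3 / 3),
        # so that bound can be inverted numerically for eps. scipy and the
        # parameter name `data` are assumptions here; the axvline calls would
        # have to run before the savefig above.
        from scipy.optimize import brentq
        n_samples = data.shape[0]
        f = lambda e: 4 * np.log(n_samples) / (e**2 / 2 - e**3 / 3) - n_components
        if f(0.999) < 0:  # a guaranteed eps < 1 exists for this n_components
            eps_bound = brentq(f, 1e-3, 0.999)
            print("JL bound for n_components=%d: eps <= %0.3f"
                  % (n_components, eps_bound))
            # e.g. plt.axvline(1 - eps_bound); plt.axvline(1 + eps_bound)
        else:
            print("n_components=%d gives no JL guarantee for any eps < 1"
                  % n_components)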
 
np.random.seed(0)

digits = load_digits()
X, y = digits.data, digits.target
johnson_lindenstrauss(X, 'digits')

X, y, data = getCreditCardData('./Data/ccdefault.xls', subset=0.2)
johnson_lindenstrauss(data, 'credit')
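
# The top of `johnson_lindenstrauss` is truncated in this snippet. In the
# scikit-learn example this code adapts, `dists`, `projected_dists`, and
# `density_param` come from a setup along these lines (a sketch: the helper
# name and the choice of SparseRandomProjection are assumptions; presumably
# density_param is something like {'density': True} for plt.hist):
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.random_projection import SparseRandomProjection

def pairwise_dists_sketch(data, n_components):
    """Squared pairwise distances before and after a random projection."""
    projected = SparseRandomProjection(n_components=n_components,
                                       random_state=0).fit_transform(data)
    dists = euclidean_distances(data, squared=True).ravel()
    keep = dists != 0  # drop the zero self-distances on the diagonal
    projected_dists = euclidean_distances(projected, squared=True).ravel()[keep]
    return dists[keep], projected_dists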
Example #2
    ax.set_title(
        'Dimensionality Reduced Data for NN (using {})'.format(reducer_name))

    # For each number of components, find the best classifier results
    results = pd.DataFrame(search.cv_results_)
    results.to_csv(
        path_or_buf='Figs/05_{}_cluster_nn.csv'.format(reducer_name))
    best_clfs = results.groupby(components_col).apply(
        lambda g: g.nlargest(1, 'mean_test_score'))

    best_clfs.plot(x=components_col,
                   y='mean_test_score',
                   yerr='std_test_score',
                   legend=False,
                   ax=ax)
    ax.set_ylabel('Classification accuracy (val)')
    ax.set_xlabel('n_components')

    plt.tight_layout()
    plt.savefig('Figs/05_{}_cluster_nn'.format(reducer_name))
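
# `search`, `components_col`, and `ax` are defined in the truncated top of
# cluster_nn. A sketch of the kind of setup that could produce them
# (hypothetical names and grid, not the repository's exact code):
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

def make_search_sketch(reducer):
    """Grid-search a (cluster-based reducer -> neural net) pipeline."""
    pipe = Pipeline([('reduce', reducer), ('nn', MLPClassifier(max_iter=500))])
    # The searched parameter name depends on the reducer that is passed in.
    param_grid = {'reduce__n_clusters': [2, 4, 8, 16, 32]}
    search = GridSearchCV(pipe, param_grid, cv=3)
    components_col = 'param_reduce__n_clusters'
    return search, components_col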


np.random.seed(0)
X, y, data = getCreditCardData('./Data/ccdefault.xls')

reducer = KMeans(random_state=0)
cluster_nn(X, y, reducer, 'kmeans')

reducer = custGMM(GaussianMixture(reg_covar=1.0, random_state=0))
cluster_nn(X, y, reducer, 'gaussian_mix')
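
# `custGMM` comes from a project helper module. A minimal, hypothetical
# sketch of such a wrapper, assuming it adapts GaussianMixture (which lacks
# a transform method) to the fit/transform API a Pipeline step requires:
from sklearn.base import BaseEstimator, TransformerMixin

class GMMTransformerSketch(BaseEstimator, TransformerMixin):
    """Expose a GaussianMixture as a pipeline transformer."""
    def __init__(self, gmm):
        self.gmm = gmm

    def fit(self, X, y=None):
        self.gmm.fit(X)
        return self

    def transform(self, X):
        # Per-component posterior probabilities as the reduced features.
        return self.gmm.predict_proba(X)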
Example #3
    plt.plot(range(1, n + 1), test_mean_acc1, 'r')
    plt.fill_between(range(1, n + 1),
                     test_mean_acc1 - 1 * test_std_acc1,
                     test_mean_acc1 + 1 * test_std_acc1,
                     alpha=0.10)
    plt.plot(range(1, n + 1), train_mean_acc1, 'm')
    plt.fill_between(range(1, n + 1),
                     train_mean_acc1 - 1 * train_std_acc1,
                     train_mean_acc1 + 1 * train_std_acc1,
                     alpha=0.10)
    plt.legend(('Test Accuracy - {}'.format(data_name),
                'Training Accuracy - {}'.format(data_name)))
    plt.ylabel('Accuracy')
    plt.xlabel('Decision Tree Depth')
    plt.tight_layout()
    plt.savefig('Figs/DT-depth-{}'.format(data_name))
    plt.clf()
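

# The accuracy arrays plotted above are computed earlier inside DT (truncated
# here). One way such depth curves are typically produced (a sketch with
# hypothetical names, not the repository's exact code):
def depth_curves_sketch(X_train, y_train, n, cv=5):
    """Mean/std cross-validated accuracy for tree depths 1..n."""
    from sklearn.model_selection import cross_validate
    from sklearn.tree import DecisionTreeClassifier
    test_mean, test_std, train_mean, train_std = [], [], [], []
    for depth in range(1, n + 1):
        scores = cross_validate(
            DecisionTreeClassifier(max_depth=depth, random_state=0),
            X_train, y_train, cv=cv, return_train_score=True)
        test_mean.append(scores['test_score'].mean())
        test_std.append(scores['test_score'].std())
        train_mean.append(scores['train_score'].mean())
        train_std.append(scores['train_score'].std())
    return (np.array(test_mean), np.array(test_std),
            np.array(train_mean), np.array(train_std))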


if __name__ == "__main__":
    np.random.seed(0)
    test_size = 0.2

    X_train1, X_test1, y_train1, y_test1 = getCreditCardData(
        path='./Data/ccdefault.xls', test_size=test_size)
    X_train2, X_test2, y_train2, y_test2 = getWineData(
        path='./Data/winequality-white.csv', test_size=test_size)

    DT(X_train1, X_test1, y_train1, y_test1, 'Credit Card Default', 0.8, 0.9)
    DT(X_train2, X_test2, y_train2, y_test2, 'Wine', 0.4, 1.01)