Exemplo n.º 1
0
def dr(X, y, savedir, ds):
    """Fit four dimensionality-reduction models on ``X`` and save diagnostics.

    Runs, in order: PCA, ICA (keeping the candidate with the highest average
    kurtosis), randomized projection (keeping the candidate with the lowest
    reconstruction error over repeated runs) and the ``A3.rfselect`` step.
    Plots and CSV files are written under ``savedir`` using ``ds`` as the
    file-name prefix. ICA and RP are swept over ``range(10, X.shape[1], 10)``
    components.

    Args:
        X: 2-D feature data; ``X.shape[1]`` bounds the component sweep.
        y: Targets, passed through to the ``A3`` helpers.
        savedir: Directory that receives the output files.
        ds: Dataset name; used in file names, and ``'musk'`` / ``'shoppers'``
            trigger a manual ``n_components`` override on the RP estimator.

    Returns:
        list: ``[pca, ica, rp, rf]``, the selected step of each pipeline.

    Raises:
        ValueError: If the component sweep is empty (``X.shape[1] <= 10``) or
            no ICA / RP candidate could be selected (e.g. all scores NaN).
    """
    # --- PCA ---
    pca_pipe = A3.pca(X, y)
    pca = pca_pipe.named_steps['pca']
    # Saves whatever figure is current after A3.pca -- presumably drawn by
    # that helper; verify against A3.
    plt.savefig('{}/{}-pca.png'.format(savedir, ds))
    np.savetxt('{}/{}-pca-ev.csv'.format(savedir, ds), pca.explained_variance_)
    np.savetxt('{}/{}-pca-ev-ratio.csv'.format(savedir, ds),
               pca.explained_variance_ratio_)
    plt.close('all')
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('n_components')
    # NOTE(review): the plotted values are cumulative ratios, presumably in
    # [0, 1] rather than percent -- confirm before changing the label.
    plt.ylabel('explained variance (%)')
    plt.savefig('{}/{}-pca-ev.png'.format(savedir, ds))
    plt.close('all')
    reconstruction_error = A3.recon_error(pca, X)
    logging.info('PCA reconstruction error: {}'.format(reconstruction_error))

    # Candidate component counts shared by the ICA and RP sweeps.
    component_counts = range(10, X.shape[1], 10)
    if not component_counts:
        # Without this guard nothing is ever selected below and the function
        # fails later with an opaque AttributeError on None.
        raise ValueError(
            'dr() needs more than 10 features to sweep n_components; '
            'got {}'.format(X.shape[1]))

    # --- ICA: keep the candidate with the highest average kurtosis ---
    ica = None
    max_kurtosis = -np.inf
    kurt_per_comp = []
    for i in component_counts:
        ica_pipe = A3.ica(X, y, i)
        kurt = A3.avg_kurtosis(ica_pipe.transform(X))
        kurt_per_comp.append(kurt)
        logging.info('ICA {} average kurtosis: {}'.format(i, kurt))
        if kurt > max_kurtosis:
            ica = ica_pipe.named_steps['fastica']
            max_kurtosis = kurt
    if ica is None:
        raise ValueError(
            'no ICA candidate produced a comparable average kurtosis')
    logging.info('ICA max kurtosis {} with {} components'.format(
        max_kurtosis, ica.components_.shape[0]))
    # Draw on a fresh figure so nothing left open by the helpers above ends
    # up in the saved plot, and close it so it does not linger.
    plt.figure()
    plt.plot(component_counts, kurt_per_comp)
    plt.xlabel('n_components')
    plt.ylabel('mean kurtosis')
    plt.savefig('{}/{}-ica-kurtosis.png'.format(savedir, ds))
    plt.close('all')

    # --- RP: repeat the sweep because each projection is random ---
    logging.info('Starting randomized projection...')
    n_runs = 10
    rp_errors = []  # best (lowest) error of each run
    rp = None
    best_rp_err = np.inf
    reconstruction_errors = []  # one row of per-candidate errors per run
    for rp_run in range(n_runs):
        logging.info('RP iteration {}'.format(rp_run))
        best_run = np.inf
        run_errors = []
        for i in component_counts:
            rp_pipe = A3.random_projection(X, y, i)
            err = A3.recon_error(
                rp_pipe.named_steps['gaussianrandomprojection'], X)
            run_errors.append(err)
            logging.info('RP {} components reconstruction error: {}'.format(
                i, err))
            if err < best_rp_err:
                rp = rp_pipe.named_steps['gaussianrandomprojection']
                best_rp_err = err
            if err < best_run:
                best_run = err
        reconstruction_errors.append(run_errors)
        rp_errors.append(best_run)
    pd.DataFrame(reconstruction_errors,
                 columns=component_counts).to_csv(
                     '{}/{}-rp-reconstruction.csv'.format(savedir, ds))
    if rp is None:
        raise ValueError(
            'no RP candidate produced a comparable reconstruction error')
    # Manually set random projection.
    # NOTE(review): set_params only changes the constructor parameter; the
    # estimator is presumably not refit here, so its fitted state (and the
    # n_components_ logged below) still reflects the selected fit -- confirm
    # that callers refit before relying on the override.
    if ds == 'musk':
        rp.set_params(n_components=50)
    elif ds == 'shoppers':
        rp.set_params(n_components=30)

    plt.figure()
    plt.plot(range(n_runs), rp_errors)
    plt.xlabel('iteration')
    plt.ylabel('reconstruction error')
    plt.savefig('{}/{}-rp-reconstruction.png'.format(savedir, ds))
    plt.close('all')
    logging.info('RP best n_components: {}'.format(rp.n_components_))

    # TODO: fourth dimension reduction
    rf_pipe = A3.rfselect(X, y)
    rf = rf_pipe.named_steps['randomforest']

    return [pca, ica, rp, rf]