示例#1
0
def ICA():
    """Run ``main`` on two ICA-reduced feature matrices.

    Rebinds the module-level ``range_n_clusters`` to 5, 13, ..., 61 and then:

    1. loads the ``'37-'`` files and passes the features of the *second*
       dataset returned by ``get_data`` to ``main``;
    2. loads the ``'45-'`` files and passes the features of the *first*
       dataset returned by ``get_data`` to ``main``.

    NOTE(review): the second/first datasets are presumably the cancer and
    reviews data, and the prefixes presumably encode the number of ICA
    components -- confirm against ``get_data``.
    """
    global range_n_clusters
    range_n_clusters = list(range(5, 62, 8))

    # Second dataset, '37-' prefix.
    _, second_dataset = get_data('{}/ICA'.format(OUTPUT), '37-')
    second_features, _ = second_dataset
    main(second_features)

    # First dataset, '45-' prefix.
    first_dataset, _ = get_data('{}/ICA'.format(OUTPUT), '45-')
    first_features, _ = first_dataset
    main(first_features)
示例#2
0
def comparePCA(p, ds):
    """Collect ``find_best`` results over several PCA outputs and plot them.

    Points the module-level ``BASE`` at the PCA output directory, then for
    every prefix in ``p`` loads the data, takes the features of the first
    dataset returned by ``get_data`` and hands them to ``find_best`` together
    with a shared accumulator list.  The accumulated values are converted to
    a NumPy array and passed to ``plot_gmms`` along with ``ds``.

    Args:
        p: iterable of file-name prefixes forwarded to ``get_data``.
        ds: forwarded unchanged to ``plot_gmms`` (presumably a dataset
            label -- confirm against ``plot_gmms``).
    """
    global BASE
    BASE = '{}/PCA'.format(OUTPUT)

    collected = []
    for prefix in p:
        first_dataset, _ = get_data(BASE, prefix)
        features, _ = first_dataset
        # find_best is expected to record its result in `collected`.
        find_best(features, collected)

    plot_gmms(np.array(collected), ds)
示例#3
0
def compare(dim_red_alg, p1, p2):
    """Collect ``find_best`` results for both datasets and plot them.

    Two passes are made over the output directory of ``dim_red_alg``:

    * prefixes in ``p1`` -> features of the second dataset returned by
      ``get_data`` (plotted under the title "Cancer");
    * prefixes in ``p2`` -> features of the first dataset returned by
      ``get_data`` (plotted under the title "Reviews").

    The module-level ``BASE`` is rebound to the algorithm's output directory
    at the start of each pass.  The "Reviews" results are also printed.

    Args:
        dim_red_alg: name of the sub-directory of ``OUTPUT`` to read from.
        p1: iterable of prefixes used for the "Cancer" pass.
        p2: iterable of prefixes used for the "Reviews" pass.
    """
    global BASE
    r_bests, c_bests = [], []

    # (prefixes, index of the dataset within get_data's pair, accumulator)
    passes = (
        (p1, 1, c_bests),
        (p2, 0, r_bests),
    )
    for prefixes, dataset_index, collected in passes:
        BASE = '{}/{}'.format(OUTPUT, dim_red_alg)
        for prefix in prefixes:
            first_dataset, second_dataset = get_data(BASE, prefix)
            features, _ = (first_dataset, second_dataset)[dataset_index]
            find_best(features, collected)

    r_bests = np.array(r_bests)
    c_bests = np.array(c_bests)
    print(r_bests)

    plot_gmms(r_bests, "Reviews")
    plot_gmms(c_bests, "Cancer")
import numpy as np

# Script: for each candidate number of ICA components, fit FastICA on the
# first dataset's features and record the mean absolute kurtosis of the
# resulting components; the results are written to a CSV file.

# Seed NumPy's global RNG.
np.random.seed(0)

# Put the parent directory of this file on the import path; the `helpers`
# imports below presumably rely on it.
dir_path = dirname(realpath(__file__))
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import run_dim_alg, get_data
from helpers.constants import ICA_DIMS

# Both datasets use the same list of candidate component counts.
r_dims = c_dims = ICA_DIMS

# Output directory for ICA results and input directory of the baseline data.
OUT = '{}/../../OUTPUT/ICA'.format(dir_path)
BASE = '{}/../../OUTPUT/BASE'.format(dir_path)

# get_data returns two (features, labels) pairs.
# NOTE(review): presumably r = reviews and c = cancer -- confirm.
r, c = get_data(BASE)
r_X, r_y = r
c_X, c_y = c

# A single estimator is reused; only n_components changes per iteration.
ica = FastICA(random_state=5)
kurt = {}
for dim in r_dims:
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(r_X)
    tmp = pd.DataFrame(tmp)
    # Kurtosis of every column (one column per extracted component).
    tmp = tmp.kurt(axis=0)
    # Summarise this setting as the mean absolute kurtosis over components.
    kurt[dim] = tmp.abs().mean()

# Series indexed by component count, saved under the ICA output directory.
kurt = pd.Series(kurt)
kurt.to_csv('{}/reviews kurtosis.csv'.format(OUT))
示例#5
0
# Put the parent directory of this file on the import path; the `helpers`
# imports below presumably rely on it.  `dir_path` is defined earlier in the
# file, outside this excerpt.
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import get_data
from helpers.scoring import metrics
from helpers.constants import ICA_DIMS, SVD_DIMS_R as SVD_DIMS, RP_DIMS, PCA_DIMS
from helpers.figures import bar_plot
from helpers.clustering import r_clusters

# Per-algorithm directories holding the cluster-augmented datasets, plus the
# baseline input directory and the NN output directory.
PCA = '{}/../OUTPUT/PCA/{}'.format(dir_path, 'datasets-w-cluster')
ICA = '{}/../OUTPUT/ICA/{}'.format(dir_path, 'datasets-w-cluster')
RP = '{}/../OUTPUT/RP/{}'.format(dir_path, 'datasets-w-cluster')
SVD = '{}/../OUTPUT/SVD/{}'.format(dir_path, 'datasets-w-cluster')
BASE = '{}/../OUTPUT/BASE'.format(dir_path)
OUT = '{}/../OUTPUT/NN'.format(dir_path)

# Baseline data for dataset 'r', loaded with an empty prefix.
data_BASE = get_data(BASE, prefix='', ds='r')
# These three lists are intentionally left empty; the loaders that used to
# fill them are kept as trailing comments.
data_ICA = []#[get_data(ICA, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(ICA_DIMS, r_clusters)]
data_RP = []#[get_data(RP, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(RP_DIMS, r_clusters)]
data_SVD = []#[get_data(SVD, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(SVD_DIMS, r_clusters)]

# Each list below holds one (data, label) tuple per combination of a
# dimension d (from the algorithm's *_DIMS) and a cluster count c (from
# r_clusters).  The file suffix is '<c>-<d>-datasets.<km|gmm>.hdf' and the
# label is '<c>-<d>-<km|gmm>'.
# NOTE(review): `product` is not imported in this excerpt -- presumably
# itertools.product; confirm.

# PCA: k-means ('km') and Gaussian-mixture ('gmm') variants.
data_PCA_KM = [(get_data(PCA, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)), '%s-%s-km' % (c, d)) for d, c in product(PCA_DIMS, r_clusters)]
data_PCA_GMM = [(get_data(PCA, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)), '%s-%s-gmm' % (c, d)) for d, c in product(PCA_DIMS, r_clusters)]

# ICA variants.
data_ICA_KM = [(get_data(ICA, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)), '%s-%s-km' % (c, d)) for d, c in product(ICA_DIMS, r_clusters)]
data_ICA_GMM = [(get_data(ICA, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)), '%s-%s-gmm' % (c, d)) for d, c in product(ICA_DIMS, r_clusters)]

# SVD variants.
data_SVD_KM = [(get_data(SVD, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)), '%s-%s-km' % (c, d)) for d, c in product(SVD_DIMS, r_clusters)]
data_SVD_GMM = [(get_data(SVD, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)), '%s-%s-gmm' % (c, d)) for d, c in product(SVD_DIMS, r_clusters)]

# RP variants.
data_RP_KM = [(get_data(RP, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)), '%s-%s-km' % (c, d)) for d, c in product(RP_DIMS, r_clusters)]
data_RP_GMM = [(get_data(RP, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)), '%s-%s-gmm' % (c, d)) for d, c in product(RP_DIMS, r_clusters)]
# Put the parent directory of this file on the import path; the `helpers`
# imports below presumably rely on it.
dir_path = dirname(realpath(__file__))
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import get_data
from helpers.scoring import metrics
from helpers.constants import ICA_DIMS, SVD_DIMS_R as SVD_DIMS, RP_DIMS, PCA_DIMS
from helpers.figures import bar_plot

# Per-algorithm output directories, the baseline input directory and the NN
# output directory.
PCA = '{}/../OUTPUT/PCA'.format(dir_path)
ICA = '{}/../OUTPUT/ICA'.format(dir_path)
RP = '{}/../OUTPUT/RP'.format(dir_path)
SVD = '{}/../OUTPUT/SVD'.format(dir_path)
BASE = '{}/../OUTPUT/BASE'.format(dir_path)
OUT = '{}/../OUTPUT/NN'.format(dir_path)

# Baseline data for dataset 'r' (empty prefix), followed by one get_data
# result per entry of each algorithm's *_DIMS, selected with the prefix
# '<dim>-'.
# NOTE(review): PCA formats its prefix with '%s-' while the others use
# '%d-'; presumably PCA_DIMS may contain non-integer values -- confirm.
data_BASE = get_data(BASE, '', 'r')
data_ICA = [get_data(ICA, '%d-' % i, 'r') for i in ICA_DIMS]
data_RP = [get_data(RP, '%d-' % i, 'r') for i in RP_DIMS]
data_SVD = [get_data(SVD, '%d-' % i, 'r') for i in SVD_DIMS]
data_PCA = [get_data(PCA, '%s-' % i, 'r') for i in PCA_DIMS]

# Seed NumPy's global RNG.
np.random.seed(0)


def get_schwifty(X, y, dataset, dims, classifier):
    """Time one fit-and-predict cycle of ``classifier`` and print the result.

    ``X`` and ``y`` are split with ``train_test_split(..., test_size=0.2)``;
    the classifier is fitted on the training part and asked to predict the
    held-out part.  The time spent in ``fit`` plus ``predict`` is printed
    together with ``dataset`` and ``dims``.  Nothing is returned.

    Args:
        X: feature matrix.
        y: targets matching ``X``.
        dataset: dataset name, used only in the printed message.
        dims: dimensionality, used only in the printed message.
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.

    NOTE(review): if ``clock`` is ``time.clock``, it was removed in
    Python 3.8 -- confirm which timer is imported.
    """
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=0.2)

    started_at = clock()
    classifier.fit(X_train, y_train)
    # The predictions are discarded; the call is made for timing purposes.
    classifier.predict(X_test)
    elapsed = clock() - started_at

    print('%ss for %s at %s dimensions' % (elapsed, dataset, dims))
示例#7
0
 def runitr(p):
     """Run ``main`` on the 'r' dataset's SVD output selected by ``p``.

     ``p`` is formatted with ``%d`` to build the file prefix, so it must be
     a number.
     """
     svd_dir = '{}/SVD'.format(OUTPUT)
     features, _ = get_data(svd_dir, '%d-' % p, 'r')
     print('Reviews at %s dimensions' % p)
     main(features, 'Reviews', ' (%s dims)' % p)
示例#8
0
 def runitc(p):
     """Run ``main`` on the 'c' dataset's SVD output selected by ``p``."""
     svd_dir = '{}/SVD'.format(OUTPUT)
     features, _ = get_data(svd_dir, '%s-' % p, 'c')
     print('Cancer at %s dimensions' % p)
     main(features, 'Cancer', ' (%s dims)' % p)
示例#9
0
 def runit(p):
     """Run ``main`` on both datasets of the PCA output selected by ``p``.

     Both (features, labels) pairs returned by ``get_data`` are unpacked
     before ``main`` is called, first with the first dataset's features and
     then with the second's.
     """
     pca_dir = '{}/PCA'.format(OUTPUT)
     (first_features, _), (second_features, _) = get_data(pca_dir, '%s-' % p)
     for features in (first_features, second_features):
         main(features)
示例#10
0
 def runitr(p):
     """Run ``main`` on the first dataset of the RP output selected by ``p``.

     The first dataset returned by ``get_data`` is reported as "Reviews".
     """
     rp_dir = '{}/RP'.format(OUTPUT)
     first_dataset, _ = get_data(rp_dir, '%s-' % p)
     features, _ = first_dataset
     print('Reviews at %s dimensions' % p)
     main(features, 'Reviews', ' (%s dims)' % p)
示例#11
0
 def runitc(p):
     """Run ``main`` on the second dataset of the RP output selected by ``p``.

     The second dataset returned by ``get_data`` is reported as "Cancer".
     """
     rp_dir = '{}/RP'.format(OUTPUT)
     _, second_dataset = get_data(rp_dir, '%s-' % p)
     features, _ = second_dataset
     print('Cancer at %s dimensions' % p)
     main(features, 'Cancer', ' (%s dims)' % p)
示例#12
0
from sklearn import mixture

from helpers.dim_reduction import get_data

# Echo the module docstring when the script starts.
print(__doc__)

# Resolve directories relative to this file.  The optional first
# command-line argument names the sub-directory of OUTPUT to use for OUT;
# it defaults to 'BASE'.
dir_path = dirname(realpath(__file__))
output_dir = sys.argv[1] if len(sys.argv) >= 2 else 'BASE'
OUTPUT = '{}/../OUTPUT'.format(dir_path)
OUT = '{}/{}'.format(OUTPUT, output_dir)
BASE = '{}/BASE'.format(OUTPUT)

# Load the two baseline datasets as (features, labels) pairs.
# NOTE(review): presumably r = reviews and c = cancer -- confirm against
# get_data.
_r, _c = get_data(BASE)
r_X, r_y = _r
c_X, c_y = _c

# Candidate numbers of mixture components for each dataset.
r_components = [8, 13, 21, 34, 55, 89, 104, 119, 134, 159]
c_components = [8, 10, 14, 18, 25, 35, 45, 55, 65, 75]
# Read as a global by find_best(); it must be set to an iterable of
# component counts before find_best() is called.
n_components_range = None
def find_best(X, bic): #pylint: disable
    cv_types = ['spherical', 'tied', 'diag', 'full']
    lowest_bic = np.infty
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          covariance_type=cv_type)
            gmm.fit(X)