def ICA():
    """Run ``main`` on two ICA-reduced feature sets.

    Rebinds the module-level ``range_n_clusters`` to 5, 13, ..., 61 (step 8)
    and then calls ``main`` twice:

    * with the features of the *second* dataset loaded under prefix ``'37-'``;
    * with the features of the *first* dataset loaded under prefix ``'45-'``.

    ``get_data`` is expected to return a pair of datasets, each of which
    unpacks to ``(features, other)``.
    NOTE(review): the local names suggest the pair is (reviews, cancer) --
    confirm against ``get_data``.
    """
    global range_n_clusters
    range_n_clusters = list(range(5, 62, 8))

    _, second = get_data('{}/ICA'.format(OUTPUT), '37-')
    c_X, _ = second
    main(c_X)

    first, _ = get_data('{}/ICA'.format(OUTPUT), '45-')
    r_X, _ = first
    main(r_X)
def comparePCA(p, ds):
    """Collect ``find_best`` results for several PCA outputs and plot them.

    Args:
        p: iterable of prefixes passed one by one to ``get_data``.
        ds: label forwarded unchanged to ``plot_gmms``.

    Side effects:
        Rebinds the module-level ``BASE`` to ``'<OUTPUT>/PCA'``.

    For every prefix, the features of the first dataset returned by
    ``get_data`` are handed to ``find_best`` together with a shared list that
    ``find_best`` is expected to fill; the list is converted to a NumPy array
    before plotting.
    """
    global BASE
    BASE = '{}/PCA'.format(OUTPUT)

    bests = []
    for prefix in p:
        first, _ = get_data(BASE, prefix)
        features, _ = first
        find_best(features, bests)

    plot_gmms(np.array(bests), ds)
def compare(dim_red_alg, p1, p2):
    """Collect ``find_best`` results for both datasets and plot them.

    Args:
        dim_red_alg: sub-directory of ``OUTPUT`` holding the reduced data.
        p1: iterable of prefixes used for the second ("Cancer") dataset.
        p2: iterable of prefixes used for the first ("Reviews") dataset.

    Side effects:
        Rebinds the module-level ``BASE`` to ``'<OUTPUT>/<dim_red_alg>'``,
        prints the Reviews result array, and calls ``plot_gmms`` once per
        dataset.
    """
    global BASE
    reviews_scores = []
    cancer_scores = []

    # Pass 1: second dataset of each pair, driven by p1.
    BASE = '{}/{}'.format(OUTPUT, dim_red_alg)
    for prefix in p1:
        _, cancer = get_data(BASE, prefix)
        features, _ = cancer
        find_best(features, cancer_scores)

    # Pass 2: first dataset of each pair, driven by p2.
    BASE = '{}/{}'.format(OUTPUT, dim_red_alg)
    for prefix in p2:
        reviews, _ = get_data(BASE, prefix)
        features, _ = reviews
        find_best(features, reviews_scores)

    reviews_arr = np.array(reviews_scores)
    cancer_arr = np.array(cancer_scores)

    print(reviews_arr)
    plot_gmms(reviews_arr, "Reviews")
    plot_gmms(cancer_arr, "Cancer")
import numpy as np

np.random.seed(0)

dir_path = dirname(realpath(__file__))
# The parent directory must be on sys.path before the ``helpers`` imports
# below can resolve.
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import run_dim_alg, get_data
from helpers.constants import ICA_DIMS

# Both datasets sweep the same list of component counts.
r_dims = c_dims = ICA_DIMS

OUT = '{}/../../OUTPUT/ICA'.format(dir_path)
BASE = '{}/../../OUTPUT/BASE'.format(dir_path)

# get_data(BASE) yields two datasets, each an (X, y) pair.
r, c = get_data(BASE)
r_X, r_y = r
c_X, c_y = c

ica = FastICA(random_state=5)

# Map each component count to the mean absolute kurtosis of the ICA
# components fitted on r_X at that count.
kurt = {}
for dim in r_dims:
    ica.set_params(n_components=dim)
    tmp = pd.DataFrame(ica.fit_transform(r_X)).kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv('{}/reviews kurtosis.csv'.format(OUT))
# The parent directory must be on sys.path before the ``helpers`` imports
# below can resolve.
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import get_data
from helpers.scoring import metrics
from helpers.constants import ICA_DIMS, SVD_DIMS_R as SVD_DIMS, RP_DIMS, PCA_DIMS
from helpers.figures import bar_plot
from helpers.clustering import r_clusters

# Per-algorithm directories holding the cluster-augmented datasets.
PCA = '{}/../OUTPUT/PCA/{}'.format(dir_path, 'datasets-w-cluster')
ICA = '{}/../OUTPUT/ICA/{}'.format(dir_path, 'datasets-w-cluster')
RP = '{}/../OUTPUT/RP/{}'.format(dir_path, 'datasets-w-cluster')
SVD = '{}/../OUTPUT/SVD/{}'.format(dir_path, 'datasets-w-cluster')
BASE = '{}/../OUTPUT/BASE'.format(dir_path)
OUT = '{}/../OUTPUT/NN'.format(dir_path)

data_BASE = get_data(BASE, prefix='', ds='r')

# Disabled loaders, kept for reference; the names stay bound to empty lists.
data_ICA = []  # [get_data(ICA, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(ICA_DIMS, r_clusters)]
data_RP = []  # [get_data(RP, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(RP_DIMS, r_clusters)]
data_SVD = []  # [get_data(SVD, ds='r', suffix='%s-%s-datasets.km.hdf') for d, c in product(SVD_DIMS, r_clusters)]

# Each list below holds (dataset, label) tuples, one per (d, c) combination
# drawn from the algorithm's dimension list and r_clusters. Both the file
# suffix and the label are formatted with c first, then d.
data_PCA_KM = [
    (
        get_data(PCA, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)),
        '%s-%s-km' % (c, d),
    )
    for d, c in product(PCA_DIMS, r_clusters)
]
data_PCA_GMM = [
    (
        get_data(PCA, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)),
        '%s-%s-gmm' % (c, d),
    )
    for d, c in product(PCA_DIMS, r_clusters)
]

data_ICA_KM = [
    (
        get_data(ICA, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)),
        '%s-%s-km' % (c, d),
    )
    for d, c in product(ICA_DIMS, r_clusters)
]
data_ICA_GMM = [
    (
        get_data(ICA, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)),
        '%s-%s-gmm' % (c, d),
    )
    for d, c in product(ICA_DIMS, r_clusters)
]

data_SVD_KM = [
    (
        get_data(SVD, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)),
        '%s-%s-km' % (c, d),
    )
    for d, c in product(SVD_DIMS, r_clusters)
]
data_SVD_GMM = [
    (
        get_data(SVD, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)),
        '%s-%s-gmm' % (c, d),
    )
    for d, c in product(SVD_DIMS, r_clusters)
]

data_RP_KM = [
    (
        get_data(RP, ds='r', suffix='%s-%s-datasets.km.hdf' % (c, d)),
        '%s-%s-km' % (c, d),
    )
    for d, c in product(RP_DIMS, r_clusters)
]
data_RP_GMM = [
    (
        get_data(RP, ds='r', suffix='%s-%s-datasets.gmm.hdf' % (c, d)),
        '%s-%s-gmm' % (c, d),
    )
    for d, c in product(RP_DIMS, r_clusters)
]
dir_path = dirname(realpath(__file__))
# The parent directory must be on sys.path before the ``helpers`` imports
# below can resolve.
sys.path.insert(0, '{}/..'.format(dir_path))

from helpers.dim_reduction import get_data
from helpers.scoring import metrics
from helpers.constants import ICA_DIMS, SVD_DIMS_R as SVD_DIMS, RP_DIMS, PCA_DIMS
from helpers.figures import bar_plot

# Per-algorithm output directories.
PCA = '{}/../OUTPUT/PCA'.format(dir_path)
ICA = '{}/../OUTPUT/ICA'.format(dir_path)
RP = '{}/../OUTPUT/RP'.format(dir_path)
SVD = '{}/../OUTPUT/SVD'.format(dir_path)
BASE = '{}/../OUTPUT/BASE'.format(dir_path)
OUT = '{}/../OUTPUT/NN'.format(dir_path)

# One dataset per configured dimension; the file prefix is "<dim>-".
# ICA/RP/SVD format the dimension with %d, PCA with %s.
data_BASE = get_data(BASE, '', 'r')
data_ICA = [get_data(ICA, '%d-' % i, 'r') for i in ICA_DIMS]
data_RP = [get_data(RP, '%d-' % i, 'r') for i in RP_DIMS]
data_SVD = [get_data(SVD, '%d-' % i, 'r') for i in SVD_DIMS]
data_PCA = [get_data(PCA, '%s-' % i, 'r') for i in PCA_DIMS]

np.random.seed(0)


def get_schwifty(X, y, dataset, dims, classifier):
    """Time a fit + predict cycle of ``classifier`` and print the result.

    Args:
        X: feature matrix handed to ``train_test_split``.
        y: targets handed to ``train_test_split``.
        dataset: dataset name, used only in the printed message.
        dims: dimension count, used only in the printed message.
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.

    The data is split with ``test_size=0.2``; the timed region covers
    fitting on the training part and predicting on the test part.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    started = clock()
    classifier.fit(X_train, y_train)
    # The predictions are not used in the visible code, but the call is
    # part of what is being timed.
    pred = classifier.predict(X_test)
    elapsed = clock() - started

    print('%ss for %s at %s dimensions' % (elapsed, dataset, dims))
def runitr(p):
    """Run ``main`` on the Reviews features reduced by SVD to ``p`` dims.

    Args:
        p: dimension count; formatted with ``%d`` to build the ``'<p>-'``
            file prefix, so it must be numeric.

    Loads the data from ``'<OUTPUT>/SVD'`` with dataset selector ``'r'``,
    prints a progress line, and forwards the features to ``main`` together
    with the dataset label and a ``' (<p> dims)'`` suffix.
    """
    svd_dir = '{}/SVD'.format(OUTPUT)
    prefix = '%d-' % p

    r_X, _ = get_data(svd_dir, prefix, 'r')

    print('Reviews at %s dimensions' % p)
    main(r_X, 'Reviews', ' (%s dims)' % p)
def runitc(p):
    """Run ``main`` on the Cancer features reduced by SVD to ``p`` dims.

    Args:
        p: dimension count; formatted with ``%s`` to build the ``'<p>-'``
            file prefix.

    Loads the data from ``'<OUTPUT>/SVD'`` with dataset selector ``'c'``,
    prints a progress line, and forwards the features to ``main`` together
    with the dataset label and a ``' (<p> dims)'`` suffix.
    """
    svd_dir = '{}/SVD'.format(OUTPUT)
    prefix = '%s-' % p

    c_X, _ = get_data(svd_dir, prefix, 'c')

    print('Cancer at %s dimensions' % p)
    main(c_X, 'Cancer', ' (%s dims)' % p)
def runit(p):
    """Run ``main`` on both PCA-reduced feature sets for parameter ``p``.

    Args:
        p: value used to build the ``'<p>-'`` file prefix.

    ``get_data`` is expected to return a pair of datasets, each of which
    unpacks to ``(features, other)``. Both are unpacked before ``main`` is
    called, first on the first dataset's features and then on the second's.
    """
    first, second = get_data('{}/PCA'.format(OUTPUT), '%s-' % p)

    r_X, _ = first
    c_X, _ = second

    main(r_X)
    main(c_X)
def runitr(p):
    """Run ``main`` on the Reviews features reduced by RP to ``p`` dims.

    Args:
        p: dimension count used to build the ``'<p>-'`` file prefix.

    Loads both datasets from ``'<OUTPUT>/RP'``, keeps the features of the
    first one, prints a progress line, and forwards the features to ``main``
    together with the dataset label and a ``' (<p> dims)'`` suffix.
    """
    rp_dir = '{}/RP'.format(OUTPUT)

    first, _ = get_data(rp_dir, '%s-' % p)
    r_X, _ = first

    print('Reviews at %s dimensions' % p)
    main(r_X, 'Reviews', ' (%s dims)' % p)
def runitc(p):
    """Run ``main`` on the Cancer features reduced by RP to ``p`` dims.

    Args:
        p: dimension count used to build the ``'<p>-'`` file prefix.

    Loads both datasets from ``'<OUTPUT>/RP'``, keeps the features of the
    second one, prints a progress line, and forwards the features to ``main``
    together with the dataset label and a ``' (<p> dims)'`` suffix.
    """
    rp_dir = '{}/RP'.format(OUTPUT)

    _, second = get_data(rp_dir, '%s-' % p)
    c_X, _ = second

    print('Cancer at %s dimensions' % p)
    main(c_X, 'Cancer', ' (%s dims)' % p)
from sklearn import mixture from helpers.dim_reduction import get_data print(__doc__) dir_path = dirname(realpath(__file__)) output_dir = sys.argv[1] if len(sys.argv) >= 2 else 'BASE' OUTPUT = '{}/../OUTPUT'.format(dir_path) OUT = '{}/{}'.format(OUTPUT, output_dir) BASE = '{}/BASE'.format(OUTPUT) # Generating the sample data from make_blobs # This particular setting has one distinct cluster and 3 clusters placed close # together. _r, _c = get_data(BASE) r_X, r_y = _r c_X, c_y = _c r_components = [8, 13, 21, 34, 55, 89, 104, 119, 134, 159] c_components = [8, 10, 14, 18, 25, 35, 45, 55, 65, 75] n_components_range = None def find_best(X, bic): #pylint: disable cv_types = ['spherical', 'tied', 'diag', 'full'] lowest_bic = np.infty for cv_type in cv_types: for n_components in n_components_range: # Fit a Gaussian mixture with EM gmm = mixture.GaussianMixture(n_components=n_components, covariance_type=cv_type) gmm.fit(X)