def compare_cluster_runtime(data, n_clusters, n_components):
    """Collect timings for clustering before and after dimensionality reduction.

    KMeans and GMM are first fitted on ``data[0]``.  The four reduction
    helpers (PCA, random projections, ICA, truncated SVD) are then called in
    turn, and the second value each one returns is recorded as its timing.
    Finally KMeans and GMM are fitted again on the reduced data.

    NOTE(review): every reduction helper rebinds the same variable, so the
    final two clustering runs only ever see the output of
    ``trunk_svd.transform``.  Confirm this is intended.

    Args:
        data: Indexable container; ``data[0]`` is handed to the clusterers
            and the whole object is handed to each reduction helper.
        n_clusters: Passed to ``KMeans`` as ``n_clusters`` and to
            ``mixture.GMM`` as ``n_components``.
        n_components: Forwarded to every reduction helper.

    Returns:
        A ``pd.Series`` named ``'time'`` holding the eight timings, indexed
        by a descriptive label for each step.
    """
    records = []

    # Clustering on the untouched features.
    started = time()
    features = data[0]
    KMeans(n_clusters=n_clusters).fit_transform(features)
    records.append(('original Kmeans clustering', time() - started))

    started = time()
    mixture.GMM(n_components=n_clusters).fit(features)
    records.append(('original GMM clustering', time() - started))

    # Each helper returns (reduced data, timing); only the last reduced
    # result survives the loop.
    reducers = (
        ('PCA', pca_eval),
        ('RCA', rand_projections),
        ('ICA', ica_eval),
        ('LSA', trunk_svd),
    )
    reduced = None
    for label, reducer in reducers:
        reduced, reported = reducer.transform(data, n_components=n_components)
        records.append((label, reported))

    # Clustering on the reduced representation.
    started = time()
    KMeans(n_clusters=n_clusters).fit_transform(reduced)
    records.append(('reduced Kmeans clustering', time() - started))

    started = time()
    mixture.GMM(n_components=n_clusters).fit(reduced)
    records.append(('reduced GMM clustering', time() - started))

    labels = [label for label, _ in records]
    seconds = [value for _, value in records]
    return pd.Series(seconds, index=labels, name='time')
def gmm_transform(higgs_data, n_clusters, n_components):
    """Reduce the data with PCA, then attach a GMM cluster assignment column.

    Args:
        higgs_data: Indexable container passed to ``pca_eval.transform``;
            items ``[1]`` and ``[2]`` are returned as weights and labels.
        n_clusters: Number of mixture components for ``mixture.GMM``.
        n_components: Forwarded to ``pca_eval.transform``.

    Returns:
        A ``(data, elapsed)`` tuple.  ``data`` is a dict with keys
        ``'features'``, ``'weights'`` and ``'labels'``; ``elapsed`` is the
        wall-clock time spent on the PCA transform plus GMM fit/predict.
    """
    began = time()
    # The timing reported by the PCA helper is ignored; the total is
    # measured here instead.
    reduced, _ = pca_eval.transform(higgs_data, n_components=n_components)
    fitted = mixture.GMM(n_components=n_clusters).fit(reduced)
    assignments = fitted.predict(reduced)
    elapsed = time() - began

    # Key spelling is kept as-is because it is part of the returned data.
    reduced['cluster_assighment'] = assignments
    result = {
        'features': reduced,
        'weights': higgs_data[1],
        'labels': higgs_data[2],
    }
    return result, elapsed
def run_higg_dimensionality_reduction(higgs_data, n_components):
    """Apply all four dimensionality-reduction helpers to the same data.

    Args:
        higgs_data: Input handed unchanged to every helper's ``transform``.
        n_components: Forwarded to every helper as ``n_components``.

    Returns:
        A tuple ``(transformed, transformation_time)``.  ``transformed`` maps
        ``'PCA'``, ``'RCA'``, ``'ICA'`` and ``'LSA'`` to the first value
        returned by the corresponding helper; ``transformation_time`` is a
        ``pd.Series`` of the second values, indexed by the same labels.
    """
    reducers = [
        ('PCA', pca_eval),
        ('RCA', rand_projections),
        ('ICA', ica_eval),
        ('LSA', trunk_svd),
    ]

    transformed = {}
    labels = []
    timings = []
    for label, reducer in reducers:
        output, seconds = reducer.transform(higgs_data, n_components=n_components)
        transformed[label] = output
        labels.append(label)
        timings.append(seconds)

    transformation_time = pd.Series(timings, index=labels, name='transformation_time')
    return transformed, transformation_time
def kmeans_transform(higgs_data, n_clusters, n_components, display=False):
    """Reduce the data with PCA and append the KMeans-transformed columns.

    Args:
        higgs_data: Indexable container passed to ``pca_eval.transform``;
            items ``[1]`` and ``[2]`` are returned as weights and labels.
        n_clusters: Number of clusters for ``KMeans``.
        n_components: Forwarded to ``pca_eval.transform``.
        display: When true and ``n_clusters == 2``, draw a scatter plot of
            the two KMeans output columns, coloured by label ('s' vs 'b').

    Returns:
        A ``(data, elapsed)`` tuple.  ``data`` is a dict with keys
        ``'features'``, ``'weights'`` and ``'labels'``; ``elapsed`` is the
        wall-clock time spent on the PCA transform plus ``fit_transform``.
    """
    began = time()
    # The timing reported by the PCA helper is ignored; the total is
    # measured here instead.
    reduced, _ = pca_eval.transform(higgs_data, n_components=n_components)
    cluster_features = KMeans(n_clusters=n_clusters).fit_transform(reduced)
    elapsed = time() - began

    # One new column per KMeans output column.
    for position, column in enumerate(cluster_features.T):
        reduced['new_feature' + str(position)] = column

    result = {
        'features': reduced,
        'weights': higgs_data[1],
        'labels': higgs_data[2],
    }

    if not (display and n_clusters == 2):
        return result, elapsed

    # Plotting is only supported for the two-column case.
    column_names = ['new_feature_' + str(k) for k in range(n_clusters)]
    frame = pd.DataFrame.from_records(cluster_features, columns=column_names)
    frame['label'] = higgs_data[2].values

    signal_rows = frame[frame.label == 's']
    axes = signal_rows.plot(x='new_feature_0', y='new_feature_1',
                            kind='scatter', color='darkgreen', label='signal')
    background_rows = frame[frame.label == 'b']
    background_rows.plot(x='new_feature_0', y='new_feature_1',
                         kind='scatter', color='darkred', ax=axes,
                         label='background')
    return result, elapsed
# Example no. 5
# 0
def run_higg_dimensionality_reduction(higgs_data, n_components):
    """Run PCA, random projections, ICA and truncated SVD on one dataset.

    Args:
        higgs_data: Input handed unchanged to every helper's ``transform``.
        n_components: Forwarded to every helper as ``n_components``.

    Returns:
        A tuple of (dict of transformed outputs keyed by ``'PCA'``, ``'RCA'``,
        ``'ICA'``, ``'LSA'``; ``pd.Series`` named ``'transformation_time'``
        holding the second value returned by each helper, same labels).
    """

    def _apply(reducer):
        # Every helper exposes the same transform(data, n_components=...) call.
        return reducer.transform(higgs_data, n_components=n_components)

    pca_out, pca_secs = _apply(pca_eval)
    rca_out, rca_secs = _apply(rand_projections)
    ica_out, ica_secs = _apply(ica_eval)
    lsa_out, lsa_secs = _apply(trunk_svd)

    outputs = {'PCA': pca_out, 'RCA': rca_out, 'ICA': ica_out, 'LSA': lsa_out}
    timings = pd.Series([pca_secs, rca_secs, ica_secs, lsa_secs],
                        index=['PCA', 'RCA', 'ICA', 'LSA'],
                        name='transformation_time')
    return outputs, timings