Example #1
# Load the data
data = pd.read_csv("three_class_data.csv", header=0)

x = data[["x", "y"]]

# Give each clustering method a name
cluster_names = ['KMeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'AgglomerativeClustering', 'Birch', 'DBSCAN']

# Set the corresponding parameters for each clustering method
cluster_estimators = [
    cluster.KMeans(n_clusters=3),
    cluster.MiniBatchKMeans(n_clusters=3),
    cluster.AffinityPropagation(),
    cluster.MeanShift(),
    cluster.SpectralClustering(n_clusters=3),
    cluster.AgglomerativeClustering(n_clusters=3),
    cluster.Birch(n_clusters=3),
    cluster.DBSCAN()
]

# Prepare for drawing the subplots
plot_num = 1

# Run each clustering method in turn
for name, model in zip(cluster_names, cluster_estimators):

    tic = time.time()

    # Fit the model
    model.fit(x)
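    # --- Hedged continuation sketch; the original snippet is cut off here ---
    # A typical loop body would pull out the labels, record the elapsed time
    # and draw one subplot per method. This assumes `import matplotlib.pyplot
    # as plt` at the top of the script (time is already used above).
    if hasattr(model, 'labels_'):
        y_pred = model.labels_
    else:
        y_pred = model.predict(x)
    toc = time.time()

    plt.subplot(2, 4, plot_num)
    plt.title("{} ({:.2f}s)".format(name, toc - tic))
    plt.scatter(data["x"], data["y"], c=y_pred, s=10)
    plt.xticks(())
    plt.yticks(())
    plot_num += 1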
Example #2
    connectivity = kneighbors_graph(X,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')
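    # --- Hedged continuation sketch; the original snippet is cut off here ---
    # In scikit-learn's clustering comparison demo this block is usually
    # followed by pairing each estimator with a name and fitting it on X.
    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
        ('AgglomerativeClustering', average_linkage),
        ('DBSCAN', dbscan),
        ('OPTICS', optics),
        ('Birch', birch),
        ('GaussianMixture', gmm),
    )
    for name, algorithm in clustering_algorithms:
        algorithm.fit(X)
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)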
Example #3
def run_config(fname, data, index, algo):
    #print('Launching', fname, file=sys.stderr)
    today = datetime.datetime.now()
    print(today.strftime("%Y-%m-%d %H.%M.%S") ) # 2017-04-05-00.18.00
    with open(fname, 'a') as result:
        try:
            X = []
            with open(data, 'r') as f:
                content = f.readlines()
                for x in content:
                    row = x.split()
                    res = []
                    for i in row:
                        res.append(float(i))
                    X.append(res)

            #n_clusters = 15
            X1 = StandardScaler().fit_transform(np.array(X))
            #connectivity = kneighbors_graph(X1, n_neighbors=2, include_self=False)

            res = cluster.SpectralClustering(eigen_solver="arpack", affinity="nearest_neighbors").fit(X1)
            labels = res.labels_
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

            cl = clusterization(X1, labels, n_clusters, index)
            m = cl.init_measure()

            for run_num in range(0, 3):
                #iterable CVI computation
                strategy = algo(deepcopy(cl), m)
                new_measure_iter, iters, t = strategy.run()

                # print number of run

                result.write("Run {}\n".format((run_num + 1)))

                #print('Launching iterable computation: ', fname, file=sys.stderr)
                #result.write('Launching iterable computation: ' + fname)
                result.write("Measure improvement   {}\n".format(abs(m - new_measure_iter)))
                result.write("from                  {}\n".format(m))
                result.write("to                    {}\n".format(new_measure_iter))
                result.write("Iterations performed  {}\n".format(iters))
                result.write("Time spent            {}\n".format(t))

                # full CVI computation with time limit
                strategy = algo(deepcopy(cl), m)
                new_measure_full, iters, t = strategy.run_full()
                #print('Launching full without CVI limit: ', fname, file=sys.stderr)
                #result.write('Launching full without CVI limit: ' + fname)
                result.write("Measure improvement   {}\n".format(abs(m - new_measure_full)))
                result.write("from                  {}\n".format(m))
                result.write("to                    {}\n".format(new_measure_full))
                result.write("Iterations performed  {}\n".format(iters))
                result.write("Time spent            {}\n".format(t))

                # full CVI computation with measure limit on CVI value
                # obtained from iterable computation launch
                strategy = algo(deepcopy(cl), m)
                new_measure_full_CVI_limit, iters, t = strategy.run_full_CVI_limit(new_measure_iter)
                #print('Launching full with CVI limit: ', fname, file=sys.stderr)
                #result.write('Launching full with CVI limit: ' + fname)
                result.write("Measure improvement   {}\n".format(abs(m - new_measure_full_CVI_limit)))
                result.write("from                  {}\n".format(m))
                result.write("to                    {}\n".format(new_measure_full_CVI_limit))
                result.write("Iterations performed  {}\n".format(iters))
                result.write("Time spent            {}\n\n".format(t))
        except:
            traceback.print_exc(file=result)
def test_clustering(data, lons, lats, N_CLUSTERS):
    pred_dict = {}
    np.random.seed(0)

    n_samples = 1500

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    plt.figure(figsize=(20, 15))
    plt.subplots_adjust(left=.001,
                        right=.999,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1
    for i_dataset, dataset in enumerate([data]):
        X = dataset
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X, n_neighbors=10)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # Compute distances
        #distances = np.exp(-euclidean_distances(X))
        distances = euclidean_distances(X)

        # create clustering estimators
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=N_CLUSTERS)
        ward = cluster.AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=N_CLUSTERS,
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=.2)
        affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                           preference=-200)

        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=N_CLUSTERS,
            connectivity=connectivity)

        for name, algorithm in [('MiniBatchKMeans', two_means),
                                ('AffinityPropagation', affinity_propagation),
                                ('MeanShift', ms),
                                ('SpectralClustering', spectral),
                                ('Ward', ward),
                                ('AgglomerativeClustering', average_linkage),
                                ('DBSCAN', dbscan)]:
            # predict cluster memberships
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(np.int)
            else:
                y_pred = algorithm.predict(X)

            # plot
            plt.subplot(4, 7, plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)
            plt.scatter(lons, lats, color=colors[y_pred].tolist(), s=10)

            if hasattr(algorithm, 'cluster_centers_'):
                try:
                    centers = algorithm.cluster_centers_
                    center_colors = colors[:len(centers)]
                    plt.scatter(centers[:, 0],
                                centers[:, 1],
                                s=100,
                                c=center_colors)
                except:
                    continue
            plt.xlim(min(lons) - 1, max(lons) + 1)
            plt.ylim(min(lats) - 1, max(lats) + 1)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99,
                     .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1
            pred_dict[name] = [lons, lats, colors[y_pred].tolist()]

    plt.show()
    return pred_dict
Example #5
def cluster_matrices(submatrices_dict, k, method='kmeans', how='full'):
    """
    clusters the submatrices per chromosome


    Parameters
    ----------
    submatrices_dict : dict
        key: chrom name, value: list of submatrices
    k : int
        number of clusters
    method : str
        either 'kmeans', 'hierarchical' or 'spectral'
    how : str
        how to cluster. Options are 'full', 'center' and 'diagonal'. More info in the argparse options

    Returns
    -------
    indices_dict : dict
        key: chrom_name, value: list of lists, with one list per cluster holding the ids of the
        submatrices that belong to that cluster
    """
    clustered_dict = {}
    for chrom in submatrices_dict:
        log.info("Length of entry: {}".format(len(submatrices_dict[chrom])))
        if len(submatrices_dict[chrom]) < k:
            log.info("number of the submatrices on chromosome {} is less than {}. Clustering is skipped.".format(chrom, k))
            k = 1
        submat_vectors = []
        shape = submatrices_dict[chrom][0].shape
        center_bin = (shape[0] + 1) // 2
        for submatrix in submatrices_dict[chrom]:
            if how == 'diagonal':
                # take from each matrix the diagonal
                submat_vectors.append(submatrix.diagonal())
            elif how == 'center':
                # take the mean of a  smaller submatrix of 3 x 3 centered on the submatrix
                submat_vectors.append(
                    submatrix[center_bin - 2:center_bin + 1, center_bin - 2:center_bin + 1].reshape((1, 9)).mean())
            else:
                # Transform the list of submatrices into an array of shape
                # (num_submatrices, submatrix.shape[0] * submatrix.shape[1]).
                # In other words, each submatrix is converted into a row of the matrix
                submat_vectors.append(submatrix.reshape((1, shape[0] * shape[1])))

        matrix = np.vstack(submat_vectors)
        if how == 'diagonal':
            assert matrix.shape == (len(submatrices_dict[chrom]), shape[0])
        elif how == 'center':
            assert matrix.shape == (len(submatrices_dict[chrom]), 1)
        else:
            assert matrix.shape == (len(submatrices_dict[chrom]), shape[0] * shape[1])

        # remove outliers
        out_ind = get_outlier_indices(matrix, max_deviation=2)
        if out_ind is not None and len(np.flatnonzero(out_ind)) > 0:
            log.info("Outliers detected in chrom: {}. Number of outliers: {}".
                     format(chrom, len(np.flatnonzero(out_ind))))

            # keep in matrix all indices that are not outliers
            matrix = matrix[np.logical_not(out_ind), :]

        if np.any(np.isnan(matrix)):
            # replace NaNs with 0, otherwise kmeans produces weird behaviour
            log.warning("For clustering nan values have to be replaced by zeros.")
            matrix[np.isnan(matrix)] = 0

        if method == 'kmeans':
            clustering = skclust.KMeans(n_clusters=k, random_state=0).fit(matrix)
            cluster_labels = clustering.labels_
        if method == 'hierarchical':
            clustering = skclust.AgglomerativeClustering(n_clusters=k).fit(matrix)
            cluster_labels = clustering.labels_
        if method == 'spectral':
            clustering = skclust.SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0).fit(matrix)
            cluster_labels = clustering.labels_

        # sort clusters
        clustered_dict[chrom] = []
        for cluster in range(k):
            cluster_ids = np.flatnonzero(cluster_labels == cluster)
            clustered_dict[chrom].append(cluster_ids)

    return clustered_dict
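# --- Hedged usage sketch, not part of the original module ---
# Shows how cluster_matrices might be called on synthetic data; the function
# itself still relies on the module's own imports (numpy as np,
# sklearn.cluster as skclust, log, get_outlier_indices).
import numpy as np

submatrices = {'chr1': [np.random.rand(11, 11) for _ in range(20)]}
clustered = cluster_matrices(submatrices, k=3, method='kmeans', how='full')
# clustered['chr1'] is then a list of 3 arrays with the submatrix ids per cluster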
print("\nDATASET NORMALIZADO:\n")
print(X)

# clustering techniques; the parameters of each technique STILL NEED TO BE STUDIED AND OPTIMIZED
two_means = cluster.MiniBatchKMeans(n_clusters=2,
                                    init='random',
                                    n_init=10,
                                    max_iter=300,
                                    tol=1e-04,
                                    random_state=0)

bandwidth = cluster.estimate_bandwidth(X, quantile=0.95)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

spectral = cluster.SpectralClustering(n_clusters=8,
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors",
                                      random_state=0)

connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
ward = cluster.AgglomerativeClustering(n_clusters=3,
                                       linkage='ward',
                                       connectivity=connectivity)
average_linkage = cluster.AgglomerativeClustering(linkage="average",
                                                  affinity="cityblock",
                                                  n_clusters=3,
                                                  connectivity=connectivity)

dbscan = cluster.DBSCAN(eps=5)
birch = cluster.Birch(n_clusters=3, threshold=0.7)
gmm = mixture.GaussianMixture(n_components=2,
Example #7
def do():
    ai = AI()
    ai.load()
    # ai.learn()
    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }
    bandwidth = cluster.estimate_bandwidth(ai.x, quantile=params['quantile'])
    connectivity = kneighbors_graph(ai.x,
                                    n_neighbors=params['n_neighbors'],
                                    include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')
    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms), ('SpectralClustering',
                                                 spectral), ('Ward', ward),
                             ('AgglomerativeClustering',
                              average_linkage), ('DBSCAN', dbscan),
                             ('Birch', birch), ('GaussianMixture', gmm))

    for name, algorithm in clustering_algorithms:
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding" +
                " may not work as expected.",
                category=UserWarning)
            try:
                algorithm.fit(ai.x)
            except Exception as e:
                continue

        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(numpy.int)
        else:
            y_pred = algorithm.predict(ai.x)
        if max(y_pred) > 3:
            continue
        known_groups = {}
        for i, group in enumerate(ai.y):
            group = int(group)
            if group not in known_groups:
                known_groups[group] = []
            known_groups[group].append(i)
        guessed_groups = {}
        for i, group in enumerate(y_pred):
            if group not in guessed_groups:
                guessed_groups[group] = []
            guessed_groups[group].append(i)
        for k in known_groups:
            for g in guessed_groups:
                print(
                    k, g,
                    len(set(known_groups[k]).intersection(guessed_groups[g])))
Example #8
    kchosen = 15  # based on visual inspection of distortion.png and spectrum.png
    eigvecs_k, codebook, distortion = results[kchosen]
    print('chose k=', kchosen, ' distortion ', distortion)
    membership, _ = scipy.cluster.vq.vq(eigvecs_k, codebook)
    for i in range(kchosen):
        print('cluster', i)
        print(','.join(names[membership == i]))


names, X = load_data()
ga_ind = np.flatnonzero(names == 'GA')
me_ind = np.flatnonzero(names == 'ME')
X[ga_ind, me_ind] = 0.  #hackityhackhackhack
X[me_ind, ga_ind] = 0.
spectral = cluster.SpectralClustering(n_clusters=16,
                                      eigen_solver='arpack',
                                      affinity="precomputed")
spectral.fit(X)
spectral.labels_
print('')
for label in np.unique(spectral.labels_):
    clust = names[spectral.labels_ == label]
    for cl in clust:
        if cl == clust[-1]:
            print(cl, end=' ')
        else:
            print(cl + ",", end=' ')
    print('')

if __name__ == "__main__":
    spectral_cluster(names, X)
Example #9
# In[75]:

print("Explained variance: ", explained_variance_score(y, predictions))

# In[76]:

print("R2 score: ", r2_score(y, predictions))

# ###  Clustering

# In[77]:

from sklearn import cluster

spectral = cluster.SpectralClustering(n_clusters=4,
                                      eigen_solver='arpack',
                                      affinity='nearest_neighbors')

# In[78]:

spectral.fit(boston.data)

# In[79]:

boston_df['category'] = spectral.labels_
boston_df['price'] = boston.target
house_clusters = boston_df.groupby('category').mean().sort_values('price')
house_clusters.index = ['low', 'mid_low', 'mid_high', 'high']
house_clusters[['price', 'CRIM', 'RM', 'AGE', 'DIS']]
Example #10
def main(csps_data):
    global db_time, feature_set, global_args
    # n_clusters = cluster_options['n_clusters']
    # id measure response  year organisation group  score
    # print(csps_data.head())

    # First get the data frame into the right shape
    if (feature_set == 'demographics'):
        csps_data = csps_data.set_index(['organisation', 'org', 'year'])
    else:
        csps_data = pd.pivot_table(
            csps_data,
            values='score',
            index=['organisation', 'year', 'headcount', 'org', 'par'],
            columns=['measure'],
            aggfunc=np.sum)
        #csps_data = pd.pivot_table(csps_data, values='score', index=['organisation', 'year', 'org', 'par'], columns=['measure', 'headcount'], aggfunc=np.sum)

    #print(feature_set)
    #print(csps_data.head())

    # The EEI is needed later but must be excluded from the questions and demographics features, so split the EEI column out and drop it

    eei = csps_data['EEI']
    #print(eei.tolist())

    if (feature_set != 'zzzzthemes'):
        csps_data = csps_data.drop('EEI', 1)

    #print( '*' * 44 )
    #print(csps_data.head())

    # The data should always be a 2D array, shape (n_samples, n_features)
    # print(csps_data.head())

    # To get the boolean mask where values are nan
    # cpvnm: CSPS data, pivoted, null mask
    # csps_data = pd.isnull(csps_data)
    # print(csps_data.head())
    '''
    if (feature_set == 'themes'):
        
        dist_test1 = csps_data['EEI'].tolist()
        dist_test2 = csps_data['MW'].tolist()
#         print(dist_test.head())
#         dist_test.reset_index(True)
#         print(dist_test.head())
        print(dist_test1)
        print(dist_test2)
        zz = zip(dist_test1, dist_test2)
        
        print(map(list, zz))
        
        from sklearn.metrics.pairwise import euclidean_distances
        X_pairs = [[0, 1], [1, 1]]
        # distance between rows of X
        print(euclidean_distances(dist_test1, dist_test2))
#         print(euclidean_distances(X_pairs, X_pairs))
        # array([[ 0.,  1.], [ 1.,  0.]])
        # get distance to origin
#         print(euclidean_distances(X_pairs, [[0, 0]]))
        # array([[ 1.        ], [ 1.41421356]])
    '''

    #print(csps_data.columns)

    # Filling missing data: CSPS data, pivoted, no-null
    # csps_data = csps_data.fillna(value=0)
    # print(csps_data.head())

    #'KMeans', 'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift', 'SpectralClustering', 'AgglomerativeClustering', 'DBSCAN', 'Birch'

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(csps_data)

    start_cluster_time = timer()
    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    if (algorithm == 'KMeans'):
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])

    elif (algorithm == 'MiniBatchKMeans'):
        clustered = cluster.MiniBatchKMeans(
            n_clusters=cluster_options['n_clusters'])

    elif (algorithm == 'AffinityPropagation'):
        clustered = cluster.AffinityPropagation()

    elif (algorithm == 'MeanShift'):
        bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
        clustered = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    elif (algorithm == 'SpectralClustering'):
        clustered = cluster.SpectralClustering(
            n_clusters=cluster_options['n_clusters'],
            eigen_solver='arpack',
            affinity="nearest_neighbors")

    elif (algorithm == 'AffinityPropagation'):
        clustered = cluster.AffinityPropagation(damping=.9, preference=-200)

    elif (algorithm == 'AgglomerativeClustering'):
        clustered = cluster.AgglomerativeClustering(
            linkage='ward',
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)

    elif (algorithm == 'AC_average_linkage'):
        clustered = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=cluster_options['n_clusters'],
            connectivity=connectivity)

    elif (algorithm == 'DBSCAN'):
        clustered = cluster.DBSCAN(eps=.5, algorithm='auto', leaf_size=40)

    elif (algorithm == 'Birch'):
        clustered = cluster.Birch(n_clusters=cluster_options['n_clusters'])

    else:
        clustered = cluster.KMeans(n_clusters=cluster_options['n_clusters'])

    clustered.fit(X)

    if (algorithm == 'MeanShift' or algorithm == 'DBSCAN'):
        silhouette_score = -1
    else:
        silhouette_score = metrics.silhouette_score(X,
                                                    clustered.labels_,
                                                    metric='euclidean')


#     neigh = NearestNeighbors(2, 0.4)
#     neigh.fit(X)
#     NearestNeighbors(algorithm='auto', leaf_size=30)
#     nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
#     rng = neigh.radius_neighbors([X[1]])
#     print('NearestNeighbors')
#     print(X.shape[1])
#     print(np.asarray(rng[0][0]))

    end_cluster_time = timer()
    # this works, but isn't useful any more
    # csps_data['cluster_id'] = clustered.labels_
    if (feature_set != 'demographics'):
        #         org_year = zip(*csps_data.index.values)  #['organisation', 'year', 'org', 'par']
        #         orgs = pd.Series(org_year[0])
        #         years = pd.Series(org_year[1])
        #         org_acronym = pd.Series(org_year[2])
        #         par_acronym = pd.Series(org_year[3])
        #         clusters = pd.Series(clustered.labels_.tolist())
        org_year = list(zip(*csps_data.index.values))  # ['organisation', 'year', 'headcount', 'org', 'par']
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[1])
        headcount = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[3])
        par_acronym = pd.Series(org_year[4])
        clusters = pd.Series(clustered.labels_.tolist())
    else:
        org_year = list(zip(*csps_data.index.values))  # ['organisation', 'org', 'year']
        orgs = pd.Series(org_year[0])
        years = pd.Series(org_year[2])
        org_acronym = pd.Series(org_year[1])
        clusters = pd.Series(clustered.labels_.tolist())

    org_year.append(clustered.labels_.tolist())

    #1 - organisation
    df = pd.DataFrame(orgs)  #, 'organisation'

    #2 - year
    df['year'] = years

    #3 - headcount
    if (feature_set != 'demographics'):
        df['headcount'] = headcount
    else:
        df['headcount'] = np.array([0] * len(df))  #csps_data['headcount']

    #4 - cluster id
    df['cluster'] = clusters

    #5 - acronym
    df['org'] = org_acronym

    #6 - parent
    if (feature_set != 'demographics'):
        df['parent'] = par_acronym
    else:
        df['parent'] = np.array(['x'] * len(df))

    #7 - EEI
    df['EEI'] = eei.tolist()
    #     if (feature_set != 'themes'):
    #         df['EEI'] = eei.tolist()
    #     else:
    #         df['EEI'] = csps_data['EEI']

    category_labels = ['EEI', 'headcount', 'year']

    # descriptive statistics for each cluster
    #df[df.A > 0]
    #df.groupby('cluster')
    #cluster_info = df.groupby(['cluster']).get_group(1)
    #grouped = df(['EEI', 'headcount', 'cluster']).groupby('cluster')
    grouped = df.groupby('cluster')
    cluster_info = grouped.describe().fillna('missing')
    #     for name, group in grouped:
    #         print(name)
    #         print(group)

    #df = df.sort_values(by='cluster')

    # use describe to show quick summary statistics of the data
    #df.describe();
    end_time = timer()
    cluster_time = (end_cluster_time - start_cluster_time)
    total_time = (end_time - start_time)

    if (algorithm == 'AffinityPropagation'):
        other_output = json.dumps([{
            'silhouette_score': silhouette_score,
            'db_time': db_time,
            'cluster_time': cluster_time,
            'total_time': total_time,
            'feature_set': feature_set,
            'cluster_info': cluster_info.values.tolist(),
            'category_labels': category_labels
        },
                                   clustered.cluster_centers_indices_.tolist()
                                   ])

    output = json.dumps([{
        'silhouette_score': silhouette_score,
        'db_time': db_time,
        'cluster_time': cluster_time,
        'total_time': total_time,
        'feature_set': feature_set,
        'cluster_info': cluster_info.values.tolist(),
        'category_labels': category_labels
    },
                         df.values.tolist()])
    #     output = json.dumps(other_output)

print(output)
Example #11
def learn_Clustering(args):
    model = cluster.SpectralClustering(n_clusters=3)
    model.fit(args["data_in"].data.numpy())
    print("nClusters view " + str(args["id_in"]) + " : " +
          str(len(set(model.labels_))))
    return model
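# --- Hedged usage sketch, not part of the original snippet ---
# learn_Clustering expects args["data_in"] to behave like a torch tensor
# (it calls .data.numpy()), so a minimal call could look like this.
import torch

model = learn_Clustering({"data_in": torch.rand(100, 8), "id_in": 0})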
Example #12
# encode categorical data and assign X to matrix of floats
X = pd.get_dummies(X, prefix=['cg', 'vr', 'or']).astype(float)

# Methods which don't need K as an input
# 1. Affinity propagation
aprop = skc.AffinityPropagation().fit_predict(X)
# 4. DBSCAN
dbscan = skc.DBSCAN().fit_predict(X)

# Define number of clusters for those methods that need it
K = 3
# 0. Kmeans
kmeans = skc.KMeans(n_clusters=K).fit_predict(X)

# 2. Spectral clustering
spclus = skc.SpectralClustering(n_clusters=K).fit_predict(X)
# 3. Agglomerative clustering
aggclus = skc.AgglomerativeClustering(n_clusters=K).fit_predict(X)

# Create data frame
cols = ['Kmeans', 'Spec', 'Agglo']
clusters = pd.DataFrame(np.vstack((kmeans, spclus, aggclus)).T, columns=cols)

#clusters.Aff_Prop.unique().plot()
#len(np.unique(aprop))


# Functions to use in the jupyter notebook
def describe_no_K_needed():
    print(20 * '==')
    print('Affinity Propagation found ' + str(len(np.unique(aprop))) +
Example #13
#k-means
from sklearn.cluster import KMeans
clf_kmeans = KMeans(n_clusters=5)
kmeans_cluster = clf_kmeans.fit_predict(all_df)

# hierarchical clustering
from sklearn import cluster
clf_hc = cluster.AgglomerativeClustering(n_clusters=4)
hc_cluster = clf_hc.fit_predict(all_df)

#DBSCAN
clf_dbscan = cluster.DBSCAN(eps=0.4)
db_cluster = clf_dbscan.fit_predict(all_df)

# spectral clustering
clf_sc = cluster.SpectralClustering(n_clusters=4,n_neighbors=20)
sc_cluster = clf_sc.fit_predict(all_df)

#test
test = pd.read_csv('test.csv')
test = test[['0','1']].values

kmeans_predict = list()
hc_predict = list()
db_predict = list()
sc_predict = list()
for i in range(400):
        if kmeans_cluster[(test[i][0])] == kmeans_cluster[(test[i][1])]:
                kmeans_predict.append(1)
        elif kmeans_cluster[(test[i][0])] != kmeans_cluster[(test[i][1])]:
                kmeans_predict.append(0)
Example #14
    model = GaussianMixture(n_components=nclust,init_params='kmeans')
    model.fit(X)
    clust_labels3 = model.predict(X)
    return (clust_labels3)
#y_pred = doGMM(data2,4)
def MeanShift(x,y):
    ms=cluster.MeanShift(x)
    ms_result=ms.fit_predict(y)
    return(ms_result)
#y_pred=MeanShift(0.1,data2)
def MiniKmeans(x, y):
    mb= cluster.MiniBatchKMeans(x)
    mb_result=mb.fit_predict(y)
    return(mb_result)
#y_pred = MiniKmeans(4,data)
spectral = cluster.SpectralClustering(n_clusters=4)
#y_pred= spectral.fit_predict(data2)
def Dbscan(x, y):
    db=cluster.DBSCAN(eps=x)
    db_result=db.fit_predict(y)
    return(db_result)
#y_pred = Dbscan(0.3,data2)
def Affinity(x, y,z):
    ap=cluster.AffinityPropagation(damping=x, preference=y)
    ap_result=ap.fit_predict(z)
    return(ap_result)
#y_pred = Affinity(0.9,-200,data2)
#Birch Clustering
def Bir(x, y):
    bi=cluster.Birch(n_clusters=x)
    bi_result=bi.fit_predict(y)
Example #15
sys.path.append(path)

import numpy as np
from sklearn import cluster, metrics
from common_utils import *
from clustering_utils import *
from classification_utils import *

scoring = 's_score'

X, y = generate_synthetic_data_2d_clusters(n_samples=300,
                                           n_centers=4,
                                           cluster_std=0.60)
plot_data_2d(X)

spectral_estimator = cluster.SpectralClustering(affinity='nearest_neighbors',
                                                assign_labels='kmeans')
spectral_grid = {'n_clusters': list(range(3, 7))}
grid_search_plot_models_2d_clustering(spectral_estimator, spectral_grid, X)
grid_search_plot_one_parameter_curves_clustering(spectral_estimator,
                                                 spectral_grid,
                                                 X,
                                                 scoring=scoring)
spectral_final_model = grid_search_best_model_clustering(spectral_estimator,
                                                         spectral_grid,
                                                         X,
                                                         scoring=scoring)
plot_model_2d_clustering(spectral_final_model, X)

X, y = generate_synthetic_data_3d_clusters(n_samples=300,
                                           n_centers=5,
                                           cluster_std=0.60)
Example #16
def clustering(Xsvd,
               cells,
               dataset,
               suffix,
               labels=None,
               tlabels=None,
               method='knn',
               istsne=True,
               name='',
               batch_labels=None,
               seed=42):
    tsne = TSNE(n_jobs=24).fit_transform(Xsvd)

    for n_components in [15]:
        if method == 'gmm':
            clf = mixture.GaussianMixture(n_components=n_components).fit(tsne)  # fit on the t-SNE embedding; `mat` is not defined yet at this point
            labels_pred = clf.predict(tsne)
        elif method == 'knn':
            labels_pred = KMeans(n_components,
                                 n_init=200).fit_predict(tsne)  # n_jobs>1 ?
        elif method == 'dbscan':
            labels_pred = DBSCAN(eps=0.3, min_samples=10).fit(tsne).labels_
        elif method == 'spectral':
            spectral = cluster.SpectralClustering(n_clusters=n_components,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            labels_pred = spectral.fit_predict(tsne)
        elif method == 'louvain':
            from scipy.spatial import distance

            for louvain in [30]:
                print('****', louvain)
                mat = kneighbors_graph(Xsvd,
                                       louvain,
                                       mode='distance',
                                       include_self=True).todense()

                G = nx.from_numpy_matrix(mat)
                partition = community.best_partition(G, random_state=seed)

                labels_pred = []
                for i in range(mat.shape[0]):
                    labels_pred.append(partition[i])

                labels_pred = np.array(labels_pred)
                print('louvain', louvain, tsne[:5], len(labels),
                      len(labels_pred))
                #print(np.unique(labels_pred))

                if labels is not None:
                    nmi_score = NMI(labels, labels_pred)
                    ari_score = ARI(labels, labels_pred)
                    print(
                        n_components, method,
                        "Clustering Scores:\nNMI: %.4f\nARI: %.4f\n" %
                        (nmi_score, ari_score))

    if istsne:
        n_components = len(np.unique(labels_pred))
        vis_x = tsne[:, 0]
        vis_y = tsne[:, 1]
        colors = [
            'blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink',
            'yellow', 'black', 'teal', 'plum', 'tan', 'bisque', 'beige',
            'slategray', 'brown', 'darkred', 'salmon', 'coral', 'olive',
            'lightpink', 'teal', 'darkcyan', 'BlueViolet', 'CornflowerBlue',
            'DarkKhaki', 'DarkTurquoise'
        ]

        show_tsne(tsne,
                  labels,
                  'result/%s/%s-%s-LSI-true.png' % (dataset, name, suffix),
                  tlabels=tlabels)
        show_tsne(tsne, labels_pred,
                  'result/%s/%s-%s-LSI-pred.png' % (dataset, name, suffix))

        with open('result/%s-LSI-cluster_result.csv' % (dataset), 'w') as f:
            f.write('cell,predicted label,tsne-1,tsne-2\n')
            for cell, pred, t in zip(cells, labels_pred, tsne):
                f.write('%s,%d,%f,%f\n' % (cell, pred, t[0], t[1]))

    if batch_labels is not None:
        show_tsne(
            tsne, batch_labels, 'result/%s/%s-GMVAE-%s-%s-batch.png' %
            (dataset, dataset, suffix, name))
Example #17
import matplotlib.colors as colors

# In[8]:

n_samples = 500
varied = pd.DataFrame(
    datasets.make_blobs(n_samples=n_samples,
                        centers=4,
                        cluster_std=[1.0, 2.5, 1, 1],
                        random_state=5)[0])

# In[9]:

kmeans = cluster.KMeans(n_clusters=4)
ward = cluster.AgglomerativeClustering(n_clusters=4)
spectral = cluster.SpectralClustering()
dbscan = cluster.DBSCAN()
affinity_propagation = cluster.AffinityPropagation()
birch = cluster.Birch(n_clusters=4)
gmm = mixture.GaussianMixture(n_components=4)

# In[10]:

algo = (('kmeans', kmeans), ('Agnes-ward', ward), ('spectral', spectral),
        ('dbscan', dbscan), ('affinity_propagation',
                             affinity_propagation), ('birch', birch))

# In[11]:

label = 'kmeans'
Example #18
#        sig1 = data_V[i,0:150]
#        sig2 = data_V[j,0:150]
#        sigp1 = data_V[i,150:]
#        sigp2 = data_V[j,150:]
#        cc1[i,j] = max(np.correlate(sig1,sig2)/(sum(sig1**2)*sum(sig2**2))**0.5)
#        cc1[j,i] = cc1[i,j]
#        cc2[i,j] = max(np.correlate(sigp1,sigp2)/(sum(sigp1**2)*sum(sigp2**2))**0.5)
#        cc2[j,i] = cc2[i,j]
#

#dis = np.zeros((l,l))
#dis = 1-cc2
#np.save('disM',dis)

from sklearn import cluster
db = cluster.SpectralClustering(n_clusters=10, affinity='precomputed').fit(dis)
labels = db.labels_
print(max(labels))

for j in range(max(labels) + 1):
    ind = np.where(labels == j)[0]
    print(len(ind))
    stack_signal = np.median(data_V[ind, :], axis=0)
    plt.figure()
    for ii in range(len(ind)):
        plt.plot(data_V[ind[ii], 0:150], c=[0, 0, 0.6, 0.1], linewidth=0.5)
        plt.plot(data_V[ind[ii], 150:], c=[0.6, 0, 0, 0.1], linewidth=0.5)

    plt.plot(stack_signal[0:150], 'b')
    plt.plot(data_V[ind[0], 0:150], 'b--')
    plt.plot(stack_signal[150:], 'r')
Example #19
# Load and Store both data and groundtruth of Zachary's Karate Club
G = nx.karate_club_graph()
groundTruth = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Transform our graph data into matrix form
edgeMat = graphToEdgeMatrix(G)

# Positions the nodes using Fruchterman-Reingold force-directed algorithm
# Too technical to discuss right now, just go with it
pos = nx.spring_layout(G)
drawCommunities(G, listToDict(groundTruth), pos)

# -----------------------------------------

# Spectral Clustering Model
spectral = cluster.SpectralClustering(n_clusters=kClusters, affinity="precomputed", n_init=200)
spectral.fit(edgeMat)

# Transform our data to list form and store them in results list
results.append(list(spectral.labels_))

# -----------------------------------------

# Agglomerative Clustering Model
agglomerative = cluster.AgglomerativeClustering(n_clusters=kClusters, linkage="ward")
agglomerative.fit(edgeMat)

# Transform our data to list form and store them in results list
results.append(list(agglomerative.labels_))

# -----------------------------------------
Example #20
y_true = [
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1
]
edge_mat = graph_to_edge_matrix(G)
k_clusters = 2
results = []
algorithms = {}

algorithms['kmeans'] = cluster.KMeans(n_clusters=k_clusters, n_init=200)

algorithms['agglom'] = cluster.AgglomerativeClustering(n_clusters=k_clusters,
                                                       linkage="ward")

algorithms['spectral'] = cluster.SpectralClustering(n_clusters=k_clusters,
                                                    affinity="precomputed",
                                                    n_init=200)

algorithms['affinity'] = cluster.AffinityPropagation(damping=0.6)

for model in algorithms.values():
    model.fit(edge_mat)
    results.append(list(model.labels_))

kmeans = cluster.KMeans(n_clusters=k_clusters, n_init=200)
kmeans.fit(graph_to_edge_matrix(G))

# Transform our data to list form and store them in results list
results.append(list(kmeans.labels_))

draw_communities(G, list(kmeans.labels_), pos)
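# --- Hedged evaluation sketch, not part of the original snippet ---
# Since y_true holds the ground-truth communities, each stored labelling can
# be scored against it, e.g. with the adjusted Rand index.
from sklearn import metrics

for algo_name, labels in zip(algorithms.keys(), results):
    print(algo_name, metrics.adjusted_rand_score(y_true, labels))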
Example #21
  def _cluster(self, acts, method='KM', param_dict=None):
    """Runs unsupervised clustering algorithm on concept actiavtations.

    Args:
      acts: activation vectors of datapoints points in the bottleneck layer.
        E.g. (number of clusters,) for Kmeans
      method: clustering method. We have:
        'KM': Kmeans Clustering
        'AP': Affinity Propagation
        'SC': Spectral Clustering
        'MS': Mean Shift clustering
        'DB': DBSCAN clustering method
      param_dict: Contains superpixl method's parameters. If an empty dict is
                 given, default parameters are used.

    Returns:
      asg: The cluster assignment label of each data points
      cost: The clustering cost of each data point
      centers: The cluster centers. For methods like Affinity Propagetion
      where they do not return a cluster center or a clustering cost, it
      calculates the medoid as the center  and returns distance to center as
      each data points clustering cost.

    Raises:
      ValueError: if the clustering method is invalid.
    """
    if param_dict is None:
      param_dict = {}
    centers = None
    if method == 'KM':
      n_clusters = param_dict.pop('n_clusters', 25)
      km = cluster.KMeans(n_clusters)
      d = km.fit(acts)
      centers = km.cluster_centers_
      d = np.linalg.norm(
          np.expand_dims(acts, 1) - np.expand_dims(centers, 0), ord=2, axis=-1)
      asg, cost = np.argmin(d, -1), np.min(d, -1)
    elif method == 'AP':
      damping = param_dict.pop('damping', 0.5)
      ca = cluster.AffinityPropagation(damping)
      ca.fit(acts)
      centers = ca.cluster_centers_
      d = np.linalg.norm(
          np.expand_dims(acts, 1) - np.expand_dims(centers, 0), ord=2, axis=-1)
      asg, cost = np.argmin(d, -1), np.min(d, -1)
    elif method == 'MS':
      ms = cluster.MeanShift(n_jobs=self.num_workers)
      asg = ms.fit_predict(acts)
    elif method == 'SC':
      n_clusters = param_dict.pop('n_clusters', 25)
      sc = cluster.SpectralClustering(
          n_clusters=n_clusters, n_jobs=self.num_workers)
      asg = sc.fit_predict(acts)
    elif method == 'DB':
      eps = param_dict.pop('eps', 0.5)
      min_samples = param_dict.pop('min_samples', 20)
      sc = cluster.DBSCAN(eps, min_samples, n_jobs=self.num_workers)
      asg = sc.fit_predict(acts)
    else:
      raise ValueError('Invalid Clustering Method!')
    if centers is None:  ## If clustering returned no cluster centers, use medoids
      centers = np.zeros((asg.max() + 1, acts.shape[1]))
      cost = np.zeros(len(acts))
      for cluster_label in range(asg.max() + 1):
        cluster_idxs = np.where(asg == cluster_label)[0]
        cluster_points = acts[cluster_idxs]
        pw_distances = metrics.euclidean_distances(cluster_points)
        centers[cluster_label] = cluster_points[np.argmin(
            np.sum(pw_distances, -1))]
        cost[cluster_idxs] = np.linalg.norm(
            acts[cluster_idxs] - np.expand_dims(centers[cluster_label], 0),
            ord=2,
            axis=-1)
    return asg, cost, centers
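# --- Hedged standalone sketch, not part of the original class ---
# Re-states the medoid fallback described in the docstring: for methods that
# expose no cluster_centers_, the point with the smallest summed pairwise
# distance within each cluster is used as that cluster's center.
import numpy as np
from sklearn import cluster, metrics

acts = np.random.rand(200, 16)
asg = cluster.SpectralClustering(n_clusters=5).fit_predict(acts)
centers = np.zeros((asg.max() + 1, acts.shape[1]))
for cluster_label in range(asg.max() + 1):
    cluster_points = acts[asg == cluster_label]
    pw_distances = metrics.euclidean_distances(cluster_points)
    centers[cluster_label] = cluster_points[np.argmin(np.sum(pw_distances, -1))]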
Example #22
def cluster_business(businesses):
    NClusters = 50
    np.random.seed(0)

    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples,
                                          factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    plt.figure(1)
    plt.subplots_adjust(left=.001,
                        right=.999,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1
    X = np.ndarray(shape=(0, 2))
    count = 0
    for b in businesses:
        X = vstack([X, [b.longitude, b.latitude]])
        # if(count>1000):
        # 	break
        count += 1
    # print type(X)
    # print X

    k_means = cluster.MiniBatchKMeans(n_clusters=NClusters)
    dbscan = cluster.DBSCAN(eps=.03)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")

    for name, algorithm in [
        ('MiniBatchKMeans', k_means),
            # ('DBSCAN', dbscan),
            # ('SpectralClustering', spectral)
            # ('AffinityPropagation', affinity_propagation),
    ]:
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)

        # plot
        ax = plt.subplot(1, 2, plot_num)
        plt.title(name, size=16)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1
    plt.show()

    clusters = []
    for index in range(NClusters):
        clusters.append(Cluster([]))
    for index in range(len(businesses)):
        businesses[index].cluster_id = y_pred[index]
        clusters[y_pred[index]].businesses.append(businesses[index])
    return clusters
Example #23
def spectral(feat, n_clusters, **kwargs):
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          assign_labels="discretize",
                                          affinity="nearest_neighbors",
                                          random_state=0).fit(feat)
    return spectral.labels_
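# --- Hedged usage sketch, not part of the original snippet ---
# The helper takes a feature matrix and the number of clusters and returns
# the label array; any extra keyword arguments are currently ignored.
import numpy as np

feat = np.random.rand(100, 32)
labels = spectral(feat, n_clusters=5)  # array of 100 labels in {0, ..., 4}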
Example #24
def cluster_annotation(long_turns,
                       embeddings,
                       speakers,
                       algorithm='SpectralClustering'):
    X = []
    for segment in long_turns:
        # "strict" only keeps embedding strictly included in segment
        x = embeddings.crop(segment, mode='strict')
        # average speech turn embedding
        X.append(np.mean(x, axis=0))

    X = np.vstack(X)

    # impute missing values in the embeddings (mean imputation)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = imp.fit_transform(X)

    if (X.shape[1] == 0):
        return Annotation(), [], []

    no_clusters = int(speakers)

    if no_clusters == 0:
        range_n_clusters = list(range(2, 10))

        silhouette_dict = {}

        for n_clusters in range_n_clusters:
            clusterer = cluster.SpectralClustering(n_clusters=n_clusters)
            cluster_labels = clusterer.fit_predict(X)
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)
            silhouette_dict[n_clusters] = silhouette_avg

        if (all(value == 0 for value in silhouette_dict.values())):
            no_clusters = 2
        else:
            max_val = 0
            max_index = 0
            for clusters in silhouette_dict:
                if (silhouette_dict[clusters] > max_val):
                    max_val = silhouette_dict[clusters]
                    max_index = clusters

            no_clusters = max_index

    c = select_cluster_algorithm(algorithm, no_clusters)
    labels = c.fit_predict(X)

    labeled_data = []
    for i, turn in enumerate(long_turns):
        labeled_data.append([labels[i], turn])

    annotation = Annotation()
    for i in labeled_data:
        label = int(i[0])
        segment = i[1]
        annotation[segment] = label

    return annotation
    # remove duplicate entities detected
    entity_text_array = np.unique(entity_text_array)

    # Construct TfidVectorizer
    vect = TfidfVectorizer(sublinear_tf=True,
                           max_df=0.5,
                           analyzer='word',
                           stop_words='english',
                           vocabulary=entity_text_array)
    corpus_tf_idf = vect.fit_transform(corpus)

    # change n_clusters to equal the number of clusters desired
    n_clusters = 7
    n_components = n_clusters
    #spectral clustering
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors",
                                          n_neighbors=17)
    spectral.fit(corpus_tf_idf)

    if hasattr(spectral, 'labels_'):
        cluster_assignments = spectral.labels_.astype(np.int)
    for i in range(0, 40):  #len(cluster_assignments))
        # removed topic cluster here because the site I used (yahoo)
        # didn't have very good topics by default
        print('Document number : {}'.format(i))
        print('Cluster Assignment : {}'.format(cluster_assignments[i]))
        print('Document title : {}'.format(titles_array[i]))
        print('------------------------')
Example #26
evoked = epochs['1'].average()
EVOKED = evoked

EPOCHS, EVOKED

# %%
CHANNELS = EPOCHS.info['chs']
TIMES = EPOCHS.times

# %%
x = EVOKED.data
x_embedded = TSNE(n_components=2).fit_transform(x)

n_clusters = N_CLUSTERS
spectral = CLUSTER.SpectralClustering(n_clusters=n_clusters,
                                      eigen_solver='arpack',
                                      affinity='nearest_neighbors')
labels = spectral.fit_predict(x)

fig, ax = plt.subplots()
for label in np.unique(labels):
    ax.scatter(x_embedded[labels == label, 0], x_embedded[labels == label, 1])
FIGURES.append(fig)

times = np.array([0.2, 0.3, 0.4, 0.5, 0.6])

evoked_labels = EVOKED.copy()
evoked_labels.data = evoked_labels.data * 0

for label in np.unique(labels):
    print(label)
Example #27
from pandas.core.frame import DataFrame
import numpy as np
from sklearn import preprocessing

# Read the data in first
data = pd.read_csv('./data.csv').values[:,1:]
test_data = pd.read_csv('./test.csv').values[:,1:]

# Normalize the values
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data = scaler.fit_transform(data)
data = pd.DataFrame(data)

# Cluster with four different methods and record the labels
labels = []
labels.append(cluster.SpectralClustering(n_clusters = 6 ,random_state = 1, affinity='rbf', gamma = 0.3, n_init=100).fit(data).labels_)
labels.append(cluster.AgglomerativeClustering(n_clusters = 5, linkage = 'ward', compute_full_tree= True).fit(data).labels_)
labels.append(cluster.KMeans(n_clusters = 3, init = 'k-means++', n_init = 100, max_iter = 30000, tol = 1e-4, random_state=1, precompute_distances=True).fit(data).labels_)
labels.append(cluster.MiniBatchKMeans(n_clusters = 6, n_init = 100).fit(data).labels_)

# Finally, vote: a pair counts as the same class if at least two methods agree
ans = np.zeros((len(test_data), len(labels)))
for i in range(len(test_data)):
    for j in range(len(labels)):
        if(labels[j][test_data[i][0]] == labels[j][test_data[i][1]]):
            ans[i][j] = 1
ans = np.array(ans)
ans = np.sum(ans, axis = 1)
ans = (ans) > 1

# Write out the results
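# --- Hedged completion sketch; the original output step is not shown ---
# One plausible form, assuming a simple index/label CSV is expected:
pd.DataFrame({'id': range(len(ans)), 'same_cluster': ans.astype(int)}).to_csv('result.csv', index=False)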
for element in context_name_ent:
   avg_ent = []
   list_avg = element + named_entity[i][3:].split(" ")
   for item in list_avg:
      try:
         avg_ent.append(model[item])
      except:
         pass
   if avg_ent != []:
      avg_ent = np.array(avg_ent)
      avg_ent = np.mean(avg_ent, axis = 0)
      entity_embedding.append(avg_ent)
      for element in list_avg:
         thefile.write("%s\t" % element.encode('utf-8'))
      thefile.write("%s\n" % named_entity[i][:3].encode('utf-8')) 
   i +=1

entity_embedding =np.array(entity_embedding)
print(len(named_entity_f))
# Spectral Clustering
pickle.dump(named_entity_f, open("true_labels.p", "wb"))
print("starting spectral")
spectral = cluster.SpectralClustering(n_clusters=10, eigen_solver='arpack', n_init=1)#, affinity="nearest_neighbors"
spectral.fit(entity_embedding)
print(spectral.labels_)

pickle.dump(spectral.labels_, open("predicted_labels.p", "wb"))
#labels = pickle.load(open("labels.p", "rb"))


Example #29
        i = 0
        for line in f:
            similarityMatrix[i] = line.split(",")[:-1]
            i += 1

    # Make the matrix symmetric
    print("Making matrix symmetric")
    for i in range(len(ciks)):
        similarityMatrix[i][i] = 1
        for j in range(i):
            similarityMatrix[i][j] = similarityMatrix[j][i]

    # TODO Clustering
    print("Clustering")
    mat = np.matrix(similarityMatrix).astype(np.float64)

    #eigen_values, eigen_vectors = np.linalg.eigh(mat)
    #result = cluster.KMeans(n_clusters=200, init='k-means++').fit_predict(eigen_vectors[:, 2:4])

    #result = cluster.DBSCAN().fit_predict(mat)
    result = cluster.SpectralClustering(300).fit_predict(mat)

    print(result)
    with open(join(inpath, "ClusterResult.txt"), 'w') as f:
        for cik in ciks:
            f.write("%s," % (cik))
        f.write("\n")
        for i in range(len(result)):
            f.write("%s," % (result[i]))
        f.write("\n")
Example #30
def spectral(X):
    return cluster.SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver='arpack',
        affinity="nearest_neighbors").fit_predict(X)