Example #1
    def neuron_spectral_cluster_direct(neuron):
        """
        Perform spectral clustering over the neurons in the penultimate (conv5_3) layer
        :param neuron:
            [numpy.ndarray] The activation of neurons in the penultimate (conv5_3) layer, shape is (512, 1, 1)
        :return:
            [list] each element is the cluster index of the corresponding neuron; length is 512
        """
        # compute the cosine similarity matrix by
        #           cosine = <A, B> / (|A|*|B|)
        # ui = neuron.squeeze(2)
        # print(ui.shape)
        # uj = torch.t(ui)
        # cosine_similarity_matrix = torch.matmul(ui, uj) / (torch.norm(ui) * torch.norm(uj))  # shape -> (512, 512)

        ui = np.squeeze(neuron, axis=2)  # shape -> (512, 1)
        uj = np.transpose(ui)  # shape -> (1, 512)
        #cosine_similarity_matrix = np.matmul(ui, uj) / (np.linalg.norm(ui) * np.linalg.norm(uj))  # shape -> (512, 512)
        cosine_similarity_matrix = np.matmul(ui, uj) / np.outer(
            np.linalg.norm(ui, 2, axis=1),
            np.linalg.norm(uj, 2, axis=0))  # shape -> (512, 512)

        cosine_similarity_matrix = np.exp(cosine_similarity_matrix)
        # Perform spectral clustering on the similarity matrix
        sc = SpectralClustering(n_clusters=2,
                                affinity='precomputed',
                                n_init=100)
        sc.fit(cosine_similarity_matrix)
        #cluster_index = list(sc.labels_)
        cluster_index = sc.labels_
        # print(cluster_index)
        assert len(cluster_index) == 512, 'error'
        return cluster_index
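A quick smoke test of the routine above, treating it as a free function and feeding it a random stand-in for a real conv5_3 activation (a hedged sketch; the imports are the ones the snippet itself relies on):

import numpy as np
from sklearn.cluster import SpectralClustering

fake_neuron = np.random.rand(512, 1, 1).astype(np.float32)  # hypothetical activation
cluster_index = neuron_spectral_cluster_direct(fake_neuron)
print(len(cluster_index), np.unique(cluster_index))  # 512 labels in {0, 1}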
Example #2
class spectralClustering(BaseEstimator, ClusterMixin, TransformerMixin):
    def __init__(self, n_clusters=2, gamma=1, n_neighbors=10):
        self.k = n_clusters
        self.gamma = gamma
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):

        self.cluster = SpectralClustering(n_clusters=self.k)
        self.cluster.fit(X)

        return self

    def predict(self, X):

        return self.cluster.fit_predict(X)

    def get_params(self, deep=True):

        return {
            "n_clusters": self.k,
            "gamma": self.gamma,
            "n_neighbors": self.n_neighbors
        }

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
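A minimal sanity check of the wrapper, assuming the scikit-learn imports (BaseEstimator, ClusterMixin, TransformerMixin, SpectralClustering) from the original module are in scope. Note that gamma and n_neighbors are stored but never forwarded to SpectralClustering, and that predict() calls fit_predict and therefore re-clusters from scratch:

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=2, random_state=0)
model = spectralClustering(n_clusters=2).fit(X)
print(np.bincount(model.predict(X)))  # two cluster sizes summing to 100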
Example #3
def psc_distance_matrix(distance='spearman', clustering='spectral'):
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)

    bact_label = {0: [], 1: []}

    for i in range(0, df.shape[1]):
        bact_label[clustering.labels_[i]].append(df.columns[i])

    df0 = df[bact_label[0]]
    df1 = df[bact_label[1]]

    pca_and_conf_matrix_per_group(df0)
    pca_and_conf_matrix_per_group(df1)
Example #4
def spectral_cluster_combined(data: np.ndarray, epi_data: np.ndarray,
                              num_clusters: int):
    #dist = squareform(pdist(data,'correlation'))*squareform(pdist(epi_data,'canberra'))
    dist = squareform(pdist(epi_data, 'canberra'))
    spec = SpectralClustering(n_clusters=num_clusters, affinity="precomputed")
    spec.fit(dist)
    return binarize_vector(spec.labels_, num_clusters)
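One caveat on this example: affinity='precomputed' expects a similarity matrix, while pdist returns distances, so large Canberra distances end up treated as strong affinity. A hedged sketch of the usual fix (not part of the original) passes the distances through an RBF transform first:

import numpy as np
from scipy.spatial.distance import pdist, squareform

def canberra_affinity(epi_data: np.ndarray) -> np.ndarray:
    # Convert Canberra distances into similarities in (0, 1].
    dist = squareform(pdist(epi_data, 'canberra'))
    sigma = dist.std() or 1.0  # bandwidth heuristic; guard against zero
    return np.exp(-dist ** 2 / (2.0 * sigma ** 2))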
Example #5
def call_spectral(num_cluster, mode_, data, update_flag):
    X = StandardScaler().fit_transform(data)
    spectral = SpectralClustering(n_clusters=num_cluster, eigen_solver='arpack', 
                                                        affinity='precomputed')
    connectivity = kneighbors_graph(X, n_neighbors=10)
    connectivity = 0.5 * (connectivity + connectivity.T)
    spectral.fit(connectivity)
    labels = spectral.labels_

    if update_flag:
        return labels


    label_dict = {}
    label_dict_count = 0
    for label in labels:
       label_dict[str(label_dict_count)] = float(label)
       label_dict_count = label_dict_count + 1
    print(label_dict)

    unique_dict = {}
    unique_dict_count = 0
    for uniq in np.unique(labels):
       print(uniq)
       unique_dict[str(unique_dict_count)] = float(uniq)
       unique_dict_count = unique_dict_count + 1
    print(unique_dict)

    return label_dict, unique_dict
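A hypothetical invocation with random data; mode_ is accepted but unused, and StandardScaler, kneighbors_graph and SpectralClustering come from the module-level imports the snippet assumes:

import numpy as np

labels = call_spectral(num_cluster=3, mode_=None,
                       data=np.random.rand(60, 4), update_flag=True)
print(np.unique(labels))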
Example #6
def SpectralClusteringAlgorithm(X, k):
    # Parameter n_clusters: integer, optional
    #The dimension of the projection subspace.
    sc = SpectralClustering(n_clusters=k)
    sc.fit(X)
    y_pred = sc.labels_
    return y_pred
Example #7
def e2cp_fit(similarity_matrix, ML, CL, n_clusters):
    """
    apply constraint-propagation clustering e2cp on a given matrix.

    :param similarity_matrix: similarity matrix or affinity matrix of the dataset
    :param ML: must-link constraint set in the format [[xx, yy], [yy, zz], ...]
    :param CL: cannot-link constraint set
    :param n_clusters: #clusters
    :return:
    """
    N = similarity_matrix.shape[0]

    nbrs = NearestNeighbors(n_neighbors=_k_E2CP + 1,
                            algorithm='brute').fit(similarity_matrix)
    distances, indices = nbrs.kneighbors()
    W = np.zeros(similarity_matrix.shape)

    ind1 = (np.arange(N).reshape((-1, 1)) * np.ones(
        (1, _k_E2CP))).reshape(-1).astype('int')
    ind2 = indices[:, 1:].reshape(-1).astype('int')
    W[ind1, ind2] = similarity_matrix[ind1, ind2] / (
        np.sqrt(similarity_matrix[ind1, ind1]) *
        np.sqrt(similarity_matrix[ind2, ind2]))

    W = (W + W.transpose()) / 2
    Dsqrt = np.diag(np.sum(W, axis=1)**-0.5)
    Lbar = np.dot(np.dot(Dsqrt, W), Dsqrt)

    Z = np.zeros(similarity_matrix.shape)
    Z[ML[:, 0], ML[:, 1]] = 1
    Z[CL[:, 0], CL[:, 1]] = -1

    # iterative approach
    # Fv = np.zeros(Z.shape)
    # for i in range(50):
    #     Fv = self.alpha * np.dot(Lbar, Fv) + (1 - self.alpha) * Z
    #
    # Fh = np.zeros(Z.shape)
    # for i in range(50):
    #     Fh = self.alpha * np.dot(Fh, Lbar) + (1 - self.alpha) * Fv
    #
    # Fbar = Fh / np.max(np.abs(Fh.reshape(-1)))

    # approximation of Fbar instead of the propagation iteration.
    temp = (1 - _alpha) * np.linalg.inv(np.eye(Lbar.shape[0]) - _alpha * Lbar)
    Fbar = np.dot(np.dot(temp, Z), temp.conj().T)

    Fbar = Fbar / np.max(np.abs(Fbar.reshape(-1)))

    # recover
    Wbar = np.zeros(similarity_matrix.shape)
    mlInd = Fbar >= 0
    Wbar[mlInd] = 1 - (1 - Fbar[mlInd]) * (1 - W[mlInd])
    clInd = Fbar < 0
    Wbar[clInd] = (1 + Fbar[clInd]) * W[clInd]

    specClus = SpectralClustering(n_clusters=n_clusters,
                                  affinity='precomputed')
    specClus.fit(Wbar)
    return specClus.labels_
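A sketch of how e2cp_fit might be driven. _k_E2CP and _alpha are module-level constants in the source, so plausible values are assigned here, and the constraint sets are passed as integer arrays because the function indexes them with ML[:, 0]:

import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

_k_E2CP, _alpha = 15, 0.8            # constants from the source module; values assumed
X = np.random.rand(50, 4)
S = rbf_kernel(X)                    # similarity matrix of the dataset
ML = np.array([[0, 1], [2, 3]])      # must-link pairs
CL = np.array([[0, 10], [5, 20]])    # cannot-link pairs
print(e2cp_fit(S, ML, CL, n_clusters=2))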
Example #8
def spectral_clustering(G, graph_name, num_clusters):
    #Find a way to figure out clusters number automatically
    subgraphs = []
    write_directory = os.path.join(Constants.SPECTRAL_PATH,graph_name)
    if not os.path.exists(write_directory):
        os.makedirs(write_directory)
    nodeList = list(G.nodes())
    matrix_data = nx.to_numpy_matrix(G, nodelist = nodeList)
    spectral = SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="rbf")   
    spectral.fit(matrix_data)
    label = spectral.labels_
    clusters = {}
    
    for nodeIndex, nodeLabel in enumerate(label):
        if nodeLabel not in clusters:
            clusters[nodeLabel] = []
        clusters[nodeLabel].append(nodeList[nodeIndex])
        
    #countNodes is used to test whether we have all the nodes in the clusters 
   
    for clusterIndex, subGraphNodes in enumerate(clusters.keys()):
        subgraph = G.subgraph(clusters[subGraphNodes])
        subgraphs.append(subgraph)
        nx.write_gexf(subgraph, os.path.join(write_directory,graph_name+str(clusterIndex)+"_I"+Constants.GEXF_FORMAT))
        #countNodes = countNodes + len(clusters[subGraphNodes])
    return subgraphs
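A possible call for the function above, assuming a Constants module that provides SPECTRAL_PATH and GEXF_FORMAT as in the source; note that num_clusters is accepted but the cluster count is hardcoded to 2 inside:

import networkx as nx

G = nx.karate_club_graph()
subgraphs = spectral_clustering(G, 'karate', num_clusters=2)
print([sg.number_of_nodes() for sg in subgraphs])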
Example #9
class SP2CcommunityClassifier():
    def __init__(self,graph):
        self.G=graph
        self.A=to_numpy_matrix(graph)
        self.k=np.sum(self.A,axis=1)
        self.m=np.sum(self.k)/2
        self.B=self.A-np.dot(self.k,self.k.transpose())/(2*self.m)
        self.sc = SpectralClustering(2, affinity='precomputed')
        self.Q=0
        self.category={node:[] for node in graph.nodes}
        self.s=None
        self.G_positive=None
        self.G_negative=None
        self.done=False

    def fit(self):
      self.sc.fit(self.A)
      #self.sc.labels_
      rows=list(zip(self.sc.labels_,list(self.G.nodes)))
      d = defaultdict(list)
      for k, v in rows: 
        d[k].append(v)
      partitions=list(d.values())
      ll=[]
      for i in self.sc.labels_:
        label=[2*int(h)-1 for h in list(bin(i)[2:])]
        ll.append(label)
      self.category=dict(zip(list(self.G.nodes),ll))
      self.s=np.array([self.category[node][0] for node in self.G.nodes])
      nodes=np.array(self.G.nodes)
      self.G_positive=self.G.subgraph(nodes[self.s==1])
      self.G_negative=self.G.subgraph(nodes[self.s==-1])
      self.Q=np.einsum("i,ij,j",self.s,self.B,self.s)/(4*self.m)
      if self.Q<0:
        self.done=True
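A short demonstration on a built-in graph (to_numpy_matrix, SpectralClustering and defaultdict are assumed imported as in the original module); Q is the modularity of the two-way split:

import networkx as nx

G = nx.karate_club_graph()
clf = SP2CcommunityClassifier(G)
clf.fit()
print(clf.Q, clf.G_positive.number_of_nodes(), clf.G_negative.number_of_nodes())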
Example #10
def spectral_cluster():  # spectral clustering

    adj_mat, unidata = get_data()
    cluster_num = 2
    #sc = SpectralClustering( cluster_num , affinity='precomputed', n_init=3000, assign_labels='discretize')
    sc = SpectralClustering(cluster_num,
                            affinity='precomputed',
                            n_init=3000,
                            assign_labels='discretize')
    sc.fit(adj_mat)
    # Compare ground-truth and clustering-results
    print('spectral clustering')
    #print(sc.labels_)  # print the labels
    print('label count', len(sc.labels_))

    class_array = [[] for _ in range(cluster_num)]
    class_length = np.zeros(cluster_num)
    for ci in range(cluster_num):
        for scj in range((len(sc.labels_))):
            if sc.labels_[scj] == ci:
                filenumber = scj
                class_array[ci].append(filenumber)
                class_length[ci] = class_length[ci] + 1
    for ci in range(cluster_num):
        print('cluster id = ', ci, 'cluster size =', class_length[ci])
        print('member indices = ', class_array[ci])
        print('-----------------------------------')
    # Copy each image into its cluster's directory
    for i in range(len(sc.labels_)):
        srcfile = './13D归一化图像/' + str(i + 1) + '.jpeg'
        dstfile = './13D分类/' + str(sc.labels_[i]) + '/' + str(i + 1) + '.jpeg'
        mycopyfile(srcfile, dstfile)
Example #11
def prepare_spectral_clustering_features(X, n_clusters):
    '''
    Inputs:
        X: data matrix or dataframe. Each data instance is expected to be a row
            in the matrix or dataframe
        n_cluster: number of clusters
    Outputs:
        return: returns a one-hot vector encoding of the clusters. For example, if 
        there are 6 data points, belonging to clusters [0,0,1,1,2,2], then the return
        array will be
        [1,0,0]
        [1,0,0]
        [0,1,0]
        [0,1,0]
        [0,0,1]
        [0,0,1]
    '''
    cluster_model = SpectralClustering(n_clusters=n_clusters,
                                       n_init = 10,
                                       assign_labels="discretize",
                                       random_state=0)
    cluster_model.fit(X)
    labels_vec = cluster_model.labels_
    labels_vec = np.reshape(labels_vec,(len(labels_vec),1),'F')
    enc = OneHotEncoder(handle_unknown='error')
    enc.fit(labels_vec)
    one_hot_vec = enc.transform(labels_vec)
    return one_hot_vec
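A small check of the one-hot encoding described in the docstring; the return value is a scipy sparse matrix, so .toarray() makes it printable:

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, random_state=0)
one_hot = prepare_spectral_clustering_features(X, n_clusters=3)
print(one_hot.shape)            # (60, 3)
print(one_hot.toarray()[:5])    # one row per point, one column per cluster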
Example #12
def sklearn_test2():
    read_path = 'F:\\result2019-2\\result0812\\datasets\\Wine\\'
    data_reader = np.loadtxt(read_path + 'data.csv',
                             dtype=str,
                             delimiter=',')
    label_reader = np.loadtxt(read_path + 'label.csv',
                              dtype=str,
                              delimiter=',')
    X = data_reader[:, :].astype(float)
    label_true = label_reader.astype(int)

    X = PreProcess.normalize(X)
    (n, dim) = X.shape
    k = 3
    delta = 1.0

    sc = SpectralClustering(n_clusters=k)
    sc.fit(X)
    label = sc.labels_

    pca = PCA.PCA(X, 2)
    Y = pca.fit_transform()

    colors = ['c', 'm', 'y', 'b', 'r', 'g']
    shapes = ['s', 'o', '^', 'p', '+', '*']
    for i in range(0, n):
        plt.scatter(Y[i, 0],
                    Y[i, 1],
                    c=colors[int(label[i])],
                    marker=shapes[int(label_true[i])])

    plt.show()
Example #13
def SepectralClustering(data, actualLabels):
    pca = PCA(n_components=2).fit(data)
    pca_2d = pca.transform(data)
    spectral = SpectralClustering(n_clusters=10,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    t0 = time()
    spectral.fit(pca_2d)
    print('% 9s' % 'init'
          '    time   homo   compl  v-meas     ARI     AMI  silhouette')
    print('% 9s   %.2fs   %.3f   %.3f   %.3f   %.3f   %.3f   %.3f' %
          ('Spectral', (time() - t0),
           metrics.homogeneity_score(actualLabels, spectral.labels_),
           metrics.completeness_score(actualLabels, spectral.labels_),
           metrics.v_measure_score(actualLabels, spectral.labels_),
           metrics.adjusted_rand_score(actualLabels, spectral.labels_),
           metrics.adjusted_mutual_info_score(actualLabels, spectral.labels_),
           metrics.silhouette_score(
               data, spectral.labels_, metric='euclidean', sample_size=10000)))

    print(spectral.labels_)
    print(len(np.unique(spectral.labels_)))
    colors = np.random.rand(15)
    scatter = plt.scatter(pca_2d[:, 0],
                          pca_2d[:, 1],
                          c=spectral.labels_,
                          marker='*')

    plt.colorbar(scatter)

    plt.title('Spectral Clustering')
    plt.show()
Example #14
def run_SpectralClustering(args):
    [propagated_profile_pca, n_clusters] = args[:2]
    cluster = SpectralClustering(affinity='nearest_neighbors', n_clusters=n_clusters, n_init=1000, gamma=0.5, 
                                 n_neighbors=170, assign_labels='discretize')
    cluster.fit(propagated_profile_pca)
#    print "Calinski-Harabasz Score with n_clusters=", n_clusters,"score:", metrics.calinski_harabaz_score(propagated_profile_pca, cluster.labels_) 
    return cluster.labels_
Example #15
def fast_app_spe_cluster(data, label, k, n_cluster):
    #k-means get the representative points(centers points)
    start_time = time.perf_counter()
    k_means = KMeans(n_clusters=k)
    k_means.fit(data)
    y_centers = k_means.cluster_centers_
    # get the correspondence table
    x_to_centers_table = list()
    m = len(data)
    for i in range(m):
        min_distance = np.inf
        min_index = None
        for j in range(k):
            i_j_dis = np.sum((data[i, :] - y_centers[j, :]) ** 2)
            if min_distance > i_j_dis:
                min_index = j
                min_distance = i_j_dis
        x_to_centers_table.append(min_index)
    # spectral cluster
    spe_cluster = SpectralClustering(n_clusters=n_cluster)
    spe_cluster.fit(y_centers)
    spe_label = spe_cluster.labels_
    # get m-way cluster membership
    x_label = list()
    for i in range(m):
        x_label.append(spe_label[x_to_centers_table[i]])
    spend_time = time.perf_counter() - start_time
    print("spend time is %f seconds" % spend_time)
    return x_label
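The routine follows the KASP recipe: run k-means to get representative centers, spectrally cluster the centers, then give every point its center's cluster label. A hypothetical run on synthetic blobs (the label argument is accepted but unused):

import numpy as np
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=2000, centers=4, random_state=0)
x_label = fast_app_spe_cluster(X, y, k=100, n_cluster=4)
print(len(x_label))  # 2000 point-level labels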
Example #16
def main(cm_file, perm_file, steps, labels_file, limit_classes=None):
    """Run optimization and generate output."""
    # Load confusion matrix
    with open(cm_file) as f:
        cm = json.load(f)
        cm = np.array(cm)

    # Load labels
    if os.path.isfile(labels_file):
        with open(labels_file, "r") as f:
            labels = json.load(f)
    else:
        labels = list(range(len(cm)))

    n_clusters = 14  # hyperparameter
    spectral = SpectralClustering(n_clusters=n_clusters,
                                  eigen_solver='arpack',
                                  affinity="nearest_neighbors")
    spectral.fit(cm)
    if hasattr(spectral, 'labels_'):
        y_pred = spectral.labels_.astype(int)
    else:
        y_pred = spectral.predict(cm)
    sscore = silhouette_score(cm, y_pred)
    print("silhouette_score={} with {} clusters"
          .format(sscore, n_clusters))
    grouping = [[] for _ in range(n_clusters)]
    for label, y in zip(labels, y_pred):
        grouping[y].append(label)
    for group in grouping:
        print("  {}: {}".format(len(group), group))
Example #17
def run():

    #generate synthetic data
    x, y, w, beta = sd(n=1000, p=100, k=1, sp_beta=0.8, sp_alpha=0.8)
    tr_x, tst_x, tr_y, tst_y = cv.train_test_split(x, y, test_size=0.2)

    #Train the fhim model.
    fhim = FHIM(lbd_beta=100, lbd_alpha=100)
    fhim.fit(tr_x, tst_x, tr_y, tst_y, KK=1, debug=True)

    #print np.min(fhim.a)
    #fhim.a[fhim.a < 0] = 0
    ww = np.dot(fhim.a, fhim.a.T)
    w_pos = (w - np.min(w)) / (np.max(w) - np.min(w))

    #cluster w
    sc = SpectralClustering(affinity='precomputed')
    sc.fit(w_pos)
    #wc = np.array()
    #wc = np.vstack([w[sc.labels_ == i, sc.labels_ == i] for i in np.unique(sc.labels_)])

    wc = np.zeros(w.shape)
    count = 0
    for i in np.unique(sc.labels_):
        wc[count:count + np.sum(sc.labels_ == i), :] = w[sc.labels_ == i, :]
        count += np.sum(sc.labels_ == i)
    count = 0
    for i in np.unique(sc.labels_):
        wc[:, count:count + np.sum(sc.labels_ == i)] = w[:, sc.labels_ == i]
        count += np.sum(sc.labels_ == i)

    wwc = np.zeros(w.shape)
    count = 0
    for i in np.unique(sc.labels_):
        wwc[count:count + np.sum(sc.labels_ == i), :] = ww[sc.labels_ == i, :]
        count += np.sum(sc.labels_ == i)
    count = 0
    for i in np.unique(sc.labels_):
        wwc[:, count:count + np.sum(sc.labels_ == i)] = ww[:, sc.labels_ == i]
        count += np.sum(sc.labels_ == i)

    cmap = mcolors.ListedColormap([(0, 0, 1), (0, 1, 0), (1, 0, 0)])

    plt.set_cmap('bwr')
    #plt.subplot(121)
    plt.title("Groundtruth Interaction Effects", fontsize=20)
    plt.grid(True)
    plt.imshow(w, vmin=-5, vmax=5)  #, cmap=cmap)
    plt.colorbar()
    plt.show()

    #plt.subplot(221)
    plt.title("Learnt Interaction Effects", fontsize=20)
    plt.imshow(ww, vmin=-5, vmax=5)  #, cmap=cmap)
    plt.grid(True)
    #plt.colormap()
    plt.colorbar()
    plt.show()

    return
Example #18
    def spectralClustering(self, similarity_measure_list, n_clusters=2):
        sim_dict = {}
        edge_set = set()
        for (file1, file2, val) in similarity_measure_list:
            sim_dict[(file1, file2)] = val
            sim_dict[(file2, file1)] = val
            edge_set.add(file1)
            edge_set.add(file2)

        edge_list = list(edge_set)
        affinity_matrix = []
        for edge_id_x in range(len(edge_list)):
            temp = []
            for edge_id_y in range(len(edge_list)):
                try:
                    temp.append(sim_dict[(edge_list[edge_id_x],
                                          edge_list[edge_id_y])])
                except KeyError:
                    temp.append(0)
            affinity_matrix.append(temp)
        affinity_matrix = np.array(affinity_matrix)

        sc = SpectralClustering(n_clusters, affinity='precomputed', n_init=100)
        sc.fit(affinity_matrix)
        labels = sc.labels_
        n_cluster = len(set(labels))
        cluster_set = []
        for x in range(n_cluster):
            cluster_set.append([])

        for x in range(len(labels)):
            cluster_set[labels[x]].append(edge_list[x])

        #self.cluster_set = cluster_set
        return cluster_set
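The expected input is a list of (file1, file2, similarity) triples. A hypothetical call, with host standing in for an instance of whatever class owns this method:

sims = [('a.txt', 'b.txt', 0.9), ('a.txt', 'c.txt', 0.1),
        ('b.txt', 'c.txt', 0.2), ('c.txt', 'd.txt', 0.8)]
clusters = host.spectralClustering(sims, n_clusters=2)
print(clusters)  # file names grouped by cluster label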
Example #19
    def detect_regions(self, users):
        '''
        Performs Spectral clustering on geo_coordinates
        :param users: geo-coordinates of all customers' locations.
        :return: dict of clusters: datapoints
        '''
        self.logger.debug("Clustering settlements")

        affinity_matrix = self.get_affinity_matrix(users, k=100)

        nb_clusters, eigenvalues = self.eigen_decomposition(affinity_matrix,
                                                            topK=50)

        K = nb_clusters * 1  # Adjustment factor
        self.logger.debug("Optimal K for Region Clustering " + str(K))

        region_clustering = SpectralClustering(n_clusters=K,
                                               random_state=0,
                                               affinity='precomputed')
        region_clustering.fit(affinity_matrix)

        # Explicitly deleting the affinity matrix due to mem leak issues
        del affinity_matrix

        self.regions = self.format_regions(region_clustering.labels_, users)

        return self.regions
Example #20
	def cluster(self, k):
		sc = SpectralClustering(k, affinity='precomputed', n_init=100)
		sc.fit(self.adj_mat)
		
		print('spectral clustering')
		print(sc.labels_)
		print(len(sc.labels_))
		if not self.is_large_network:
			for cluster_id in range(0, k):
				cluster_nodes = []
				for idx in range(0, len(sc.labels_)):
					if sc.labels_[idx] == cluster_id:
						cluster_nodes.append(self.node_id_to_source_map[self.index_to_node[idx]])
				print(str(cluster_id) + ":")
				print("\t" + str(cluster_nodes))

	    # get ground truth for node (only node ids with labels)
		self.gt = []
		labeled_node_indices = []
		for idx in range(0, len(self.index_to_node)):
			node_source = self.node_id_to_source_map[self.index_to_node[idx]]
			node_source_trust_score = self.get_trust_score_for_source(node_source)
			if node_source_trust_score is not None:
				labeled_node_indices.append(idx)
				print(node_source_trust_score)
				rounded_trust_score = round(node_source_trust_score, 1)
				self.gt.append(10 * rounded_trust_score)

		labeled_sc_labels = []
		for idx in labeled_node_indices:
			labeled_sc_labels.append(sc.labels_[idx])

		print "AMI metrics:{}".format(metrics.adjusted_mutual_info_score(self.gt, labeled_sc_labels))
Example #21
def main():
    '''Finds related artists to an input artist and constructs a clustered graph around them'''
    # Read command line args
    parser = argparse.ArgumentParser(
        description="Builds a graph of related artists colored by genre")
    parser.add_argument("artist",
                        help="The artist to construct the graph around")
    parser.add_argument("num_artists",
                        type=int,
                        help="Number of artists to include in the graph")
    parser.add_argument("num_clusters",
                        type=int,
                        help="Number of clusters to show in the graph")
    args = parser.parse_args()

    # Get artist info and build graph
    related, info = get_artists(args.artist, args.num_artists)
    artist_graph = build_graph(related)
    # Spectral clustering
    adj_mat = nx.to_numpy_matrix(artist_graph)
    sc = SpectralClustering(args.num_clusters,
                            affinity='precomputed',
                            n_init=100,
                            assign_labels='discretize')
    sc.fit(adj_mat)
    # Draw graph
    show_graph(info, artist_graph, sc)
Example #22
def SC(k, data, parameter):
    eigen_solver = parameter['eigen_solver']
    # n_components = parameter['n_components']
    n_init = parameter['n_init']
    random_state = parameter['random_state']
    gamma = parameter['gamma']
    affinity = parameter['affinity']
    n_neighbors = parameter['n_neighbors']
    eigen_tol = parameter['eigen_tol']
    assign_labels = parameter['assign_labels']
    degree = parameter['degree']
    coef0 = parameter['coef0']
    kernel_params = parameter['kernel_params']
    n_jobs = parameter['n_jobs']

    # SC = SpectralClustering(n_clusters=k, eigen_solver=None, n_components=k-4,
    #                         random_state=1, n_init=10, gamma=0.2, affinity='rbf',
    #                         n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans',
    #                         degree=3, coef0=1, kernel_params=None, n_jobs=None)

    SC = SpectralClustering(n_clusters=k, eigen_solver=eigen_solver,
                            random_state=random_state, n_init=n_init, gamma=gamma, affinity=affinity,
                            n_neighbors=n_neighbors, eigen_tol=eigen_tol, assign_labels=assign_labels,
                            degree=degree, coef0=coef0, kernel_params=kernel_params, n_jobs=n_jobs)

    labels = SC.fit_predict(data)  # a single fit is enough
    return labels
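A hypothetical parameter dictionary covering every key the function reads; the values mirror scikit-learn's defaults:

import numpy as np

parameter = {
    'eigen_solver': None, 'n_init': 10, 'random_state': 0,
    'gamma': 1.0, 'affinity': 'rbf', 'n_neighbors': 10,
    'eigen_tol': 0.0, 'assign_labels': 'kmeans', 'degree': 3,
    'coef0': 1, 'kernel_params': None, 'n_jobs': None,
}
labels = SC(3, np.random.rand(90, 5), parameter)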
Example #23
def spectralclustering(params): 
    distance_path = params["distance_path"]
    print(distance_path)
    distance=np.loadtxt(distance_path,dtype=np.float32)
    print(distance.shape)
    delta=2
    affinity=np.exp(-distance ** 2/ (2. * delta ** 2))

    #using default values, set metric to 'precomputed'
    sp=SpectralClustering(n_clusters=10,affinity='precomputed')
    print(sp)

    sp.fit(affinity)
    #get labels
    labels = sp.labels_

    print(labels,labels.shape)
    #get number of clusters
    no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(no_clusters,"no_clusters")

    #for i in range(no_clusters):
        #print('Cluster  : ', np.nonzero(labels == i)[0])

    #print(type(labels))
    return_val=tuple(labels.tolist())
    #print(type(return_val))
    return return_val
Example #24
    def split_superinstance(self, si, k):
        data_to_cluster = self.data[np.ix_(si.indices, si.indices)]
        spec = SpectralClustering(k, affinity="precomputed")
        spec.fit(data_to_cluster)
        split_labels = spec.labels_.astype(int)

        labels_to_indices = []
        for label in set(split_labels):
            labels_to_indices.append(np.where(split_labels == label))

        training = []
        no_training = []

        for new_si_idx in set(split_labels):
            # go from super instance indices to global ones
            cur_indices = [si.indices[idx] for idx, c in enumerate(split_labels) if c == new_si_idx]

            si_train_indices = [x for x in cur_indices if x in self.train_indices]
            if len(si_train_indices) != 0:
                training.append(SuperInstance_DTW(self.data, cur_indices, self.train_indices, si))
            else:
                no_training.append((cur_indices, get_prototype(self.data, cur_indices)))

        for indices, centroid in no_training:
            closest_train = max(training, key=lambda x: self.data[x.representative_idx, centroid])
            closest_train.indices.extend(indices)

        si.children = training

        return training
Example #25
def SpectralClusteringFunc(K, dataset, rightdataset):
    cluster = SpectralClustering(n_clusters=K, affinity='cosine')
    cluster.fit(dataset)
    #print(cluster.labels_)

    affinity_matrix = cluster.affinity_matrix_
    k, _, _ = eigenDecomposition(affinity_matrix)
    print(f'Optimal number of clusters are: {k}')

    contingency_matrix = metrics.cluster.contingency_matrix(
        rightdataset, cluster.labels_)
    purity = np.sum(np.amax(contingency_matrix, axis=0)) / len(dataset)
    print("Purity for %d Clusters is: %f" % (K, purity))

    # Majority class in each cluster
    clustersCategories = []
    for i in range(K):

        if contingency_matrix[0][i] > contingency_matrix[1][i]:
            clustersCategories.append(0)
        else:
            clustersCategories.append(1)

    # Compute the F-Measure
    TotalFMeasure = 0
    for i in range(K):  # for each cluster
        TruePositive = 0
        TrueNegative = 0
        FalsePositive = 0
        FalseNegative = 0
        for j in range(len(dataset)):  # for each example
            label = cluster.labels_[
                j]  # the example's cluster label
            if (label != i):  # skip examples outside the cluster under examination
                continue
            else:  # the example belongs to this cluster
                if rightdataset[j] == clustersCategories[
                        label] and clustersCategories[label] == 1:
                    TruePositive = TruePositive + 1
                elif rightdataset[j] == clustersCategories[
                        label] and clustersCategories[label] == 0:
                    TrueNegative = TrueNegative + 1
                elif rightdataset[j] != clustersCategories[
                        label] and clustersCategories[label] == 1:
                    FalsePositive = FalsePositive + 1
                elif rightdataset[j] != clustersCategories[
                        label] and clustersCategories[label] == 0:
                    FalseNegative = FalseNegative + 1

        if TruePositive != 0:
            precision = TruePositive / (TruePositive + FalsePositive)
            recall = TruePositive / (TruePositive + FalseNegative)
            F1 = 2 / ((1 / precision) + (1 / recall))
        else:
            precision = 0
            recall = 0
            F1 = 0

        TotalFMeasure = TotalFMeasure + F1
    print("Total F-Measure for %d Clusters is: %f" % (K, TotalFMeasure))
Example #26
class UmapSpectral:
    def __init__(self,
                 nclust,
                 umapdim=2,
                 umapN=10,
                 umapMd=float(0),
                 umapMetric='euclidean',
                 random_state=0):
        self.nclust = nclust
        # change this bit for changing the manifold learner
        self.manifoldInEmbedding = umap.UMAP(random_state=random_state,
                                             metric=umapMetric,
                                             n_components=umapdim,
                                             n_neighbors=umapN,
                                             min_dist=umapMd)
        # change this bit to change the clustering mechanism
        self.clusterManifold = SpectralClustering(n_clusters=nclust,
                                                  affinity='nearest_neighbors',
                                                  random_state=random_state)

        self.hle = None

    def predict(self, hl):
        # obviously if you change the clustering method or the manifold learner
        # you'll want to change the predict method too.
        self.hle = self.manifoldInEmbedding.fit_transform(hl)
        y_pred = self.clusterManifold.fit_predict(self.hle)  # one fit is enough
        return y_pred
Example #27
def trainModel(data, clusterNum):
    model = SpectralClustering(n_clusters=clusterNum,
                               affinity="rbf",
                               gamma=100,
                               assign_labels="kmeans")
    model.fit(data)
    return model
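With gamma=100 the RBF kernel is narrow enough to separate tightly interleaved shapes; a quick demo on concentric circles:

import numpy as np
from sklearn.datasets import make_circles

X, _ = make_circles(n_samples=300, factor=0.5, noise=0.05, random_state=0)
model = trainModel(X, clusterNum=2)
print(np.bincount(model.labels_))  # the two cluster sizes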
Example #28
def spectral_vader(tweetlist, vectorized_tweets, sim_measure = vader_pos_sim, max_n = 20):
    """Perform spectral clustering with VADER and silhouette analysis."""
    affinity_matrix = vader_affinity_matrix(tweetlist, similarity = sim_measure)
    sil_scr_prev = -1
    brk = 0
    for n in range(2,max_n):
        print('testing', n, 'clusters')
        # cluster
        clf = SpectralClustering(n_clusters=n, affinity = 'precomputed')
        clf.fit(affinity_matrix)
        tweet_pred = clf.fit_predict(affinity_matrix)
        # cluster silhouette scores
        silhouette_avg = silhouette_score(vectorized_tweets, tweet_pred)
        print('Silhouette average', silhouette_avg)

        # determine number of centroids to use for batch
        if silhouette_avg <= sil_scr_prev:
            sil_n = n - 1
            sil_avg = sil_scr_prev
            brk = 1
        # break if previous silhoutte score is smaller
        if brk == 1:
            break
        sil_scr_prev = silhouette_avg
        sil_pred_prev = tweet_pred


    return sil_pred_prev
Example #29
def specclustering():
    np.random.seed(1)

    # Get your mentioned graph
    G = buildGraph()

    fileid = open('Graph.txt', 'w')
    for n, nbrs in G.adjacency():
        for nbr, eattr in nbrs.items():
            data = eattr['weight']
            fileid.write('(%d, %d, %f)\n' % (n, nbr, data))

    fileid.close()


    # Get adjacency-matrix as numpy-array
    adj_mat = nx.adjacency_matrix(G)
    print(adj_mat)

    # Cluster
    sc = SpectralClustering(30, affinity='precomputed', n_neighbors=10, n_init=10)
    sc.fit(adj_mat)
    #
    # Compare ground-truth and clustering-results
    print('spectral clustering')
    clusterfile = open('Cluster.txt', 'w')
    nodes = list(G.nodes())
    i = 0
    while i < len(nodes):
         clusterfile.write('%d ==> %d\n' % (nodes[i], sc.labels_[i]))
         i = i+1

    clusterfile.close()
    pass
Example #30
def cluster_with_spectral_clustering(X):
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    spectral_clusterer = SpectralClustering(n_clusters=2)
    spectral_clusterer.fit(X)
    y_pred = spectral_clusterer.labels_
    return y_pred
Example #31
def ibd_distance_matrix(distance='spearman', clustering='spectral'):
    for i in range(0, df.shape[1]):
        for j in range(0, df.shape[1]):
            #Spearman correlation
            if distance == 'spearman':
                dist_mat.at[df.columns[i], df.columns[j]] = abs(
                    round(
                        scipy.stats.spearmanr(
                            np.array(df.iloc[:, i]).astype(float),
                            np.array(df.iloc[:, j]).astype(float))[0], 4))
            #Euclidean distance
            else:
                dist_mat.at[df.columns[i], df.columns[j]] = np.linalg.norm(
                    np.array(df.iloc[:, i]).astype(float) -
                    np.array(df.iloc[:, j]).astype(float))
    if clustering == 'spectral':
        clustering = SpectralClustering(n_clusters=2,
                                        affinity='precomputed',
                                        assign_labels='discretize',
                                        random_state=0)
    else:
        clustering = AgglomerativeClustering(affinity='precomputed',
                                             linkage='average')
    clustering.fit(dist_mat.values)

    bact_label = {0: [], 1: []}

    for i in range(0, df.shape[1]):
        bact_label[clustering.labels_[i]].append(df.columns[i])

    bact_label_name = {0: [], 1: []}
    bact_label_tmp = {0: [], 1: []}
    bact_level = level - 1
    for k in [0, 1]:
        for i in bact_label[k]:
            for key, value in dict_bact.items():
                for j in value:
                    if i == j:
                        bact_label_tmp[k].append(key)
        bact_label_tmp[k] = set(bact_label_tmp[k])
        for i in bact_label_tmp[k]:
            if i != 'else':
                for j in taxonomy:
                    try:
                        if j.split(';')[bact_level] == i:
                            bact_label_name[k].append(','.join(
                                j.split(';')[0:bact_level + 1]))
                            break
                    except IndexError:
                        continue
            else:
                bact_label_name[k].append('else')
        bact_label_name[k] = set(bact_label_name[k])
    df0 = df[bact_label[0]]
    df1 = df[bact_label[1]]

    print(len(bact_label[0]))
    pca_and_conf_matrix_per_group(df0)
    print(len(bact_label[1]))
    pca_and_conf_matrix_per_group(df1)
Example #32
def suggested_terminals_spectral(graph, terminal_count):
    """Suggests a set of terminal vertices for the given graph.

    The terminals are suggested according to a two-step procedure.
        First, we perform a spectral clustering on the graph with
        terminal_count clusters. Then, within each cluster, we suggest
        the vertex which has the highest degree.

    Args:
        graph: the graph in which to suggest the terminals.
        terminal_count: the number of terminals to suggest.

    Returns:
        terminals: the suggested terminal vertices in the graph.
        total_degree: total degree of the terminal vertices in the graph.
    """
    adj_matrix = nx.to_numpy_matrix(graph)

    sc = SpectralClustering(n_clusters=terminal_count, affinity="precomputed")
    sc.fit(adj_matrix)

    deg = graph.degree()

    terminals = []
    total_degree = 0

    for c in range(terminal_count):
        restricted_nodes = [(degree, node) for node, degree in deg
                            if sc.labels_[list(graph).index(node)] == c]
        maximizer = max(restricted_nodes)
        total_degree += maximizer[0]
        terminals.append(maximizer[1])

    return terminals, total_degree
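An end-to-end check on a built-in graph (networkx and the sklearn import are assumed at module level, as in the source):

import networkx as nx

G = nx.karate_club_graph()
terminals, total_degree = suggested_terminals_spectral(G, terminal_count=3)
print(terminals, total_degree)  # one high-degree node per spectral cluster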
Example #33
def do_clustering(cluster_num, mrna_corr_mat, mirna_corr_mat, mrna_corr_weight,
                  sample_id_list):
    mrna_distance_mat = 1 - mrna_corr_mat
    mrna_normal_mat = calculate_corr_mat(mrna_distance_mat)
    mirna_distance_mat = 1 - mirna_corr_mat
    mirna_normal_mat = calculate_corr_mat(mirna_distance_mat)

    a = mrna_corr_weight
    normal_mat = a * mrna_normal_mat + (1 - a) * mirna_normal_mat

    cluster = SpectralClustering(n_clusters=cluster_num,
                                 affinity='precomputed',
                                 n_init=100)
    cluster.fit(normal_mat)
    predict_label = cluster.labels_

    sample_id_col = np.array(["SampleID"])
    sample_id_col = np.hstack((sample_id_col, sample_id_list))
    clustering_result = sample_id_col.reshape(-1, 1)
    label_col = np.array(["Label"])
    predict_label = predict_label.astype(str)
    label_col = np.hstack((label_col, predict_label))
    label_col = label_col.reshape(-1, 1)
    clustering_result = np.hstack((clustering_result, label_col))
    return normal_mat, clustering_result
Example #34
 def run(self, features, number_of_clusters=2, restarts=10, delta=3.0):
     if number_of_clusters == 1:
         result = numpy.zeros(len(features), dtype=numpy.int32)
         return [result]
     classifier = SpectralClustering(n_clusters=number_of_clusters, n_init=restarts, affinity='precomputed')
     similarity = get_similarity(features, delta)
     classifier.fit(similarity)
     return [classifier.labels_]
Example #35
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)
Example #36
def run_clustering(methods, cases):
    true_method_groups = [m[1] for m in methods]
    edge_model = GraphLassoCV(alphas=4, n_refinements=5, n_jobs=3, max_iter=100)
    edge_model.fit(cases)
    CV = edge_model.covariance_
    
    num_clusters=3
    spectral = SpectralClustering(n_clusters=num_clusters,affinity='precomputed') 
    spectral.fit(np.asarray(CV))
    spec_sort=np.argsort(spectral.labels_)
    
    for i,m in enumerate(methods):
        print("%s:%d\t%s" % (m[1], spectral.labels_[i], m[0]))
    print("Adj. Rand Score: %f" % adjusted_rand_score(spectral.labels_, true_method_groups))
Example #37
def eval_k(max_k):
    a_score, idx = [], []
    for k in range(2, max_k + 1):
        print('k={}'.format(k))
        est = SpectralClustering(n_clusters=k, affinity='nearest_neighbors')
#         est = SpectralClustering(n_clusters=k, affinity='rbf', gamma=0.00001)
        est.fit(x)
        ari = metrics.adjusted_rand_score(y, est.labels_)
        print(ari)
        a_score.append(ari)
        idx.append(k)
    pl.plot(idx, a_score)
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
Example #38
def spectral(X, num_clusters):
    """
    Spectral Clustering on X for response y
    Returns array of cluster groups
    """
    model = SpectralClustering(
        n_clusters=num_clusters,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        n_neighbors=4,
        assign_labels="discretize",
    )
    cleanX = preprocessing.scale(X.to_numpy())
    model.fit(cleanX)
    return model.labels_
Example #39
def spectral(x, num_clusters):
  spec = SpectralClustering(
    affinity='rbf', # 'rbf'
    n_clusters=num_clusters,
    n_init=10,
    assign_labels='kmeans', 
    gamma=1.0, 
    degree=3, 
    coef0=1
  )
  spec.fit(x)

  c = spec.labels_
  k = len(np.unique(c))

  return spec, (None, c, k)
Example #40
def test_affinities():
    X, y = make_blobs(n_samples=40, random_state=1, centers=[[1, 1], [-1, -1]],
                      cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
Example #41
def test_affinities():
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(n_samples=40, random_state=2, centers=[[1, 1], [-1, -1]], cluster_std=0.4)
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity="nearest_neighbors", random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity="<unknown>")
    assert_raises(ValueError, sp.fit, X)
Example #42
def test_affinities():
    # Note: in the following, random_state has been selected to have
    # a dataset that yields a stable eigen decomposition both when built
    # on OSX and Linux
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01
                      )
    # nearest neighbors affinity
    sp = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    assert_warns_message(UserWarning, 'not fully connected', sp.fit, X)
    assert_equal(adjusted_rand_score(y, sp.labels_), 1)

    sp = SpectralClustering(n_clusters=2, gamma=2, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal(adjusted_rand_score(y, labels), 1)

    X = check_random_state(10).rand(10, 5) * 10

    kernels_available = kernel_metrics()
    for kern in kernels_available:
        # Additive chi^2 gives a negative similarity matrix which
        # doesn't make sense for spectral clustering
        if kern != 'additive_chi2':
            sp = SpectralClustering(n_clusters=2, affinity=kern,
                                    random_state=0)
            labels = sp.fit(X).labels_
            assert_equal((X.shape[0],), labels.shape)

    sp = SpectralClustering(n_clusters=2, affinity=lambda x, y: 1,
                            random_state=0)
    labels = sp.fit(X).labels_
    assert_equal((X.shape[0],), labels.shape)

    def histogram(x, y, **kwargs):
        """Histogram kernel implemented as a callable."""
        assert_equal(kwargs, {})    # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    sp = SpectralClustering(n_clusters=2, affinity=histogram, random_state=0)
    labels = sp.fit(X).labels_
    assert_equal((X.shape[0],), labels.shape)

    # raise error on unknown affinity
    sp = SpectralClustering(n_clusters=2, affinity='<unknown>')
    assert_raises(ValueError, sp.fit, X)
Example #43
    def doClustering(self):
        photos = self.getClusteringData()
        
        features = []

        for p in photos:
            features.append( list(self.getCoordinates(p)))
        #km = KMeans(n_clusters = 10, init='k-means++', max_iter=100)
        #km.fit(features) 
        
        #algo = MeanShift()
        algo = SpectralClustering(4)
        algo.fit(np.asarray(features))

        f = open(self.file_name_prefix+'evening_msp_meanshift.csv', 'w')

        for idx in range(len(photos)):
            p = photos[idx]
            f.write( (str(p['location']['latitude'])+','+str(p['location']['longitude'])+','+str(algo.labels_[idx])+','+p['images']['standard_resolution']['url']+'\n' ))
Example #44
def initializeW_clustering(n,relationFileName, nClusters):
    W = np.identity(n+1)
    with open(relationFileName) as f:
        f.readline()
        for line in f:
            line = line.split('\t')            
            if int(line[0])<=n and int(line[1]) <=n:
                W[int(line[0])][int(line[1])] +=1   
    #KMeans
    '''
    kmeans = KMeans(n_clusters=nClusters)
    kmeans.fit(W)
    label = kmeans.labels_
    '''
    
    #SpectralClustering
    #spc = SpectralClustering(n_clusters=nClusters, affinity = "precomputed")
    spc = SpectralClustering(n_clusters=nClusters)
    spc.fit(W)   # What is the meaning
    label = spc.labels_
    

    with open(relationFileName+'.cluster','w') as f:
        for i in range(n):
            f.write(str(label[i])+'\n')
        
    NeighborW = np.zeros(shape=(nClusters, nClusters))
    for i in range(n):
        for j in range(n):
            if label[i]==label[j]:
                NeighborW[label[i]][label[j]] = 0
            else:
                NeighborW[label[i]][label[j]] += W[i][j]
    NormalizedNeighborW = normalizeByRow(NeighborW)

    newW = np.identity(nClusters) + NormalizedNeighborW   
    print('newW', newW)

    NormalizednewW = normalizeByRow(newW)   
    print('NormalizednewW', NormalizednewW.T)

    return NormalizednewW.T, newW, label
Example #45
def rbf(max_k):
    gamma_set = [math.pow(10, i) for i in range(-5, 1)]
    a_score, idx = [[] for i in range(len(gamma_set))], []
    for k in range(2, max_k + 1):
        print('k={}'.format(k))
        for i, gamma in enumerate(gamma_set):
            est = SpectralClustering(n_clusters=k, affinity='rbf', gamma=gamma)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            a_score[i].append(ari)
        idx.append(k)
    for i in range(len(gamma_set)):
        print(gamma_set[i])
        print(np.max(a_score[i]))
        pl.plot(idx, a_score[i], label='gamma={}'.format(gamma_set[i]))
        
    pl.legend(loc=4,prop={'size':12})
    pl.xlabel('# of clusters')
    pl.ylabel('ARI')
    pl.show()
Example #46
def main():
	percentageDensityDistance = 0.35
	
	data = []
	with open('/home/casep/Dropbox/Docencia/UTFSM/MsC/Tesis/Data/segmentation.data', 'r') as csvfile:
		visionData = csv.reader(csvfile, delimiter=',', quotechar='"')
		for row in visionData:
			if len(row) > 12:
				dataRow = []
				dataRow.extend([row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9],row[10],row[11],row[12],row[13],row[14],row[15],row[16],row[17],row[18]])
				data.append(dataRow)
	
	clusterData = np.array(data)[1:,:]
	
	clustersNumber, labels = dp.predict(clusterData, percentageDensityDistance)
	print('clustersNumber', clustersNumber)
	print('fit DensityPeaks', metrics.silhouette_score(clusterData, labels, metric='euclidean'))
	
	clustersNumber = 5
	
	km = KMeans(init='k-means++', n_clusters=clustersNumber, n_init=10,n_jobs=-1)
	km.fit(clusterData)
	labels = km.labels_
	print('fit K-Means', metrics.silhouette_score(clusterData, labels, metric='euclidean'))
	
	sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None, \
			random_state=None,  n_init=10, gamma=1.0, affinity='nearest_neighbors', \
			n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, \
			coef0=1, kernel_params=None)
	sc.fit(clusterData)
	labels = sc.labels_
	print('fit Spectral', metrics.silhouette_score(clusterData, labels, metric='euclidean'))

	clusterData=np.array(clusterData,dtype=float)
	gmix = mixture.GaussianMixture(n_components=clustersNumber, covariance_type='spherical')
	gmix.fit(clusterData)
	labels = gmix.predict(clusterData)
	print('fit GMM', metrics.silhouette_score(clusterData, labels, metric='euclidean'))
	
	
	return 0
Example #47
    def _fit_spectral(self, x):
        # FIXME: broken still
        D = euclidean_distances(x, x)
        A = HomoscedasticClusteringNode.gauss_heat_kernel(D)
        # clustering
        for c in range(len(self.crange)):
            k = self.crange[c]
            for r in range(self.repeats):
                # init
                if self.debug is True:
                    print('\t[%s][c:%d][r:%d]' % (
                        self.clus_type, self.crange[c], r + 1), end='')
                idx = c * self.repeats + r

                # evaluate model
                model = SpectralClustering(n_clusters=k, affinity='precomputed')
                model.fit(A)
                self._labels[idx] = model.labels_
                means = sp.zeros((k, x.shape[1]))
                for i in range(k):
                    means[i] = x[model.labels_ == i].mean(0)
                self._parameters[idx] = means
Example #48
def test_n_components():
    # Test that after adding n_components, result is different and
    # n_components = n_clusters by default
    X, y = make_blobs(n_samples=20, random_state=0,
                      centers=[[1, 1], [-1, -1]], cluster_std=0.01)
    sp = SpectralClustering(n_clusters=2, random_state=0)
    labels = sp.fit(X).labels_
    # set n_components = n_cluster and test if result is the same
    labels_same_ncomp = SpectralClustering(n_clusters=2, n_components=2,
                                           random_state=0).fit(X).labels_
    # test that n_components=n_clusters by default
    assert_array_equal(labels, labels_same_ncomp)

    # test that n_components affect result
    # n_clusters=8 by default, and set n_components=2
    labels_diff_ncomp = SpectralClustering(n_components=2,
                                           random_state=0).fit(X).labels_
    assert not np.array_equal(labels, labels_diff_ncomp)
Example #49
def get_label_res(similar_matrix, n_subs):

    # cluster = AffinityPropagation(damping = 0.75)# , affinity = 'precomputed') # preference = -1000)# n_clusters = n_subs, affinity = 'precomputed')

    if True:
        labels = spectral_clustering(lil_matrix(similar_matrix), n_clusters = n_subs, eigen_solver='arpack') # affinity = 'precomputed',
        return labels
    elif False:
        cluster = SpectralClustering(n_clusters = n_subs, affinity = 'precomputed', eigen_solver='arpack')
    else:
        cluster = SpectralClustering(n_clusters = n_subs, affinity = 'nearest_neighbors', eigen_solver='arpack')

    res = cluster.fit(similar_matrix)

    size_labels = len(set(res.labels_))
    assert size_labels < 10, size_labels
    assert size_labels > 1, size_labels

    print(res.labels_)
    return res.labels_
Example #50
def MultiDimensionalClusteringSPCL(Xmatrix, time, xdata, eigen_solver = 'arpack', n_clusters=2, ax = None, show=False):

	seed = np.random.seed(0)
	colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
	colors = np.hstack([colors] * 20)

	# normalize dataset for easier parameter selection
	X = StandardScaler().fit_transform(Xmatrix)
	# algorithm SpectralClustering
	SC = SpectralClustering(n_clusters=n_clusters, eigen_solver=eigen_solver, affinity="nearest_neighbors")

	# Apply algorithm
	fit = SC.fit(X)

	y_pred = fit.labels_.astype(int)

	# Representation
	if show and ax is not None:

		ax.set_title('Clustering Tech: ' + "SpectralClustering; " + 'Number of Clusters = ' + str(n_clusters), fontsize=15)
		ax.plot(time, xdata, color='lightgray', alpha=0.4)
		ax.scatter(time, xdata, color=colors[y_pred].tolist(), s=10)
		ax.set_xlabel("time (ms)")
		ax.set_ylabel("Amplitude")

		return X, y_pred

	elif show and ax is None:

		fig, axis = plt.subplots(1, 1)
		fig.tight_layout()
		axis.plot(time, xdata, color='lightgray', alpha=0.4)
		axis.scatter(time, xdata, color=colors[y_pred].tolist(), s=10)
		axis.set_xlabel("time (ms)")
		axis.set_ylabel("Amplitude")

		return X, y_pred

	else:
		return X, y_pred
Example #51
if len(sys.argv) != 3:
    sys.exit('Usage: python spectral.py dataset k')

## Data preprocessing
data = parse_tab(sys.argv[1])
k = int(sys.argv[2])
classes = [example[-1] for example in data]

examples = data_to_na(data)
distances = euclidean_distances(examples, examples)
# Apply gaussian kernel as suggested in the documentation:
gamma = 0.5 # == 1 / num_features (heuristic)
similarity_matrix = numpy.exp(-distances * gamma)

## Clustering
sc = SpectralClustering(n_clusters=k, affinity='precomputed', random_state=0)
sc.fit(similarity_matrix)
labels = sc.labels_

## Performance evaluation
ari = adjusted_rand_score(labels, classes)
homogeneity, completeness, v_measure = homogeneity_completeness_v_measure(labels, classes)
print('ARI: {0}'.format(ari))
print('Homogeneity: {0}'.format(homogeneity))
print('Completeness: {0}'.format(completeness))
print('V-measure: {0}'.format(v_measure))
addToResult('Spectral', ari, homogeneity, completeness, v_measure)

draw.scatter(examples, labels)
print(os.path.splitext(os.path.basename(sys.argv[1]))[0])
draw.setImgTitle('spectral_' + os.path.splitext(os.path.basename(sys.argv[1]))[0])
draw.showImage()
Example #52
    def cluster_reproducibility(self, repeats=None, clusters=50):
        
        """ Given the tag co-occurence arrays generated by the train
        method, use the spectral clustering method in sklearn and the
        known (or desired) number of clusters to assign tags to
        specific clusters.
        
        Required input:
            None
            
        Optional input:
            repeats - a set of co-occurence arrays to cluster using
                spectral methods. If not supplied, this method
                defaults to self.repeats which is the data generated
                by the train() method.
                
            labels - the tags corresponding to the feature vectors.
                Labels must be correctly ordered, obviously.
                
        Returns:
        
            None ----BUT---- generates the following analysis in the
            self namespace.
            
            1. self.reproduction_matrices: a reorganization of the
                repeats data into block diagonal form.
                
            2. self.reproduction_analysis: a list of dictionaries.
                Each dictionary has two keys: 'members' and 'sizes'.
                
                'members' lists the tag membership of each cluster
                in terms of the indices of the feature vectors represented
                by samples in train(),arranged by size.
                
                'sizes' gives the size of each
                cluster. The index of the self.reproduction_analysis
                list gives the number of clusters remaining from
                the agglomeration. For example,
                
                self.reproduction_analysis[10][4]['members'] lists the
                tag indices of the 5th largest cluster when there are
                11 clusters remaining from the agglomeration.
                
        
        """
    
        def _find(where, what):
            """ Helper """
            return np.where(where == what[0])[0].tolist()
    
        from sklearn.cluster import SpectralClustering
        from collections import Counter

        if repeats is None:
            repeats = self.repeats
        spectral = SpectralClustering(n_clusters=1, affinity="precomputed")

        cluster = 0
        
        shape = (clusters,)+repeats.shape[1:]
        self.reproduction_matrices = np.zeros(shape, np.uint8)
        self.reproduction_analysis = []

        for idx, repeat in enumerate(repeats[:clusters]):

            # run the spectral clustering on the current repeat array.
            # this is the rate limiting step, and already uses all
            # available cpu cores.
            spectral.set_params(n_clusters=idx+1)
            spectral.fit(repeat)
            labels = spectral.labels_

            # order the clusters by size. keys in members are strings
            # as required for json dumps
            count = Counter(spectral.labels_)
            by_size = [(k, v) for k, v in count.items()]
            by_size.sort(key=lambda x: -x[1])
            members = {str(t[0]+cluster):_find(labels, t) for t in by_size}
            order = np.hstack([members[str(t[0]+cluster)] for t in by_size])

            #rearrange
            rearr = repeat[order].transpose()[order]
            sizes = [[str(k), len(v)] for k, v in members.items()]
            sizes.sort(key=lambda x: -x[1])
            
            # self.reproduction_matrices gives the counts for each pair of
            # tags. 3d array, shape: [clusters, ntags, ntags]. members are
            # the tag indices; self.graph.graph.nodes()[members] gives members
            # as words. sizes are the number of tags in each cluster, sorted by size
            tmp = {'members':members, 'sizes':sizes}
            
            rescale = (rearr*255./rearr.max()).astype(np.uint8)
            self.reproduction_matrices[idx] = rescale
            self.reproduction_analysis.append(tmp)
            cluster += idx+1
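    # Usage sketch (hypothetical names): `repeats` is expected to be a stack of
    # square co-occurrence arrays, shape (n_repeats, n_tags, n_tags). With an
    # instance `model` whose train() has populated model.repeats:
    #
    #     model.cluster_reproducibility(clusters=20)
    #     analysis = model.reproduction_analysis[10]        # results for 11 clusters
    #     biggest_label, biggest_size = analysis['sizes'][0]
    #     biggest_members = analysis['members'][biggest_label]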
Example #53
0
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
	plt.title('The Workhorse Bus Stops of Pasadena ARTS \n - Spectral Clustering in Two Dimensions -' )
	plt.xlabel("Average Delay in minutes")
	plt.ylabel("Logarithm of total passenger count")
	plt.savefig('station.png')
	
	print("Valuable Bus Stops: \n")
	print(stationFrame[stationFrame['predictedClass'] == 1])
	
	
	# pca visualization, not as sexy as one above
	pcaDecomp =  PCA(n_components=2)
	reduced_data = pcaDecomp.fit_transform(stationFrame)
	spectral.fit(reduced_data)
	print(reduced_data)
	h = 0.3
	x_min, x_max = reduced_data[:,0].min() - 1, reduced_data[:,0].max() + 1
	y_min, y_max = reduced_data[:,1].min() - 1, reduced_data[:,1].max() + 1
	xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
	Z = spectral.fit_predict(np.c_[xx.ravel(), yy.ravel()])
	Z = Z.reshape(xx.shape)
	fig1 = plt.figure()
	plt.imshow(Z, interpolation='nearest',extent=(xx.min(), xx.max(), yy.min(), yy.max()), cmap=plt.cm.Paired, aspect='auto', origin='lower')
	plt.plot(reduced_data[:,0], reduced_data[:,1], 'k.', markersize=8)
	plt.title('cluster')
	plt.xlim(x_min, x_max)
	plt.ylim(y_min, y_max)
	plt.xticks(())
	plt.yticks(())
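	# Caveat: SpectralClustering has no predict(); fit_predict() on the meshgrid
	# re-clusters the grid points from scratch, so the painted regions are not
	# guaranteed to match the data labels above. A hedged alternative is to
	# extend the data labels to the grid with a 1-nearest-neighbour classifier:
	# from sklearn.neighbors import KNeighborsClassifier
	# knn = KNeighborsClassifier(n_neighbors=1).fit(reduced_data, spectral.labels_)
	# Z = knn.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)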
Example #54
0
def main():
	
	parser = argparse.ArgumentParser(prog='clusteringTime8.py',
	 description='Performs clustering, Gaussian Mixture, KMeans or Spectral',
	 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--sourceFolder',
	 help='Source folder',
	 type=str, required=True)
	parser.add_argument('--outputFolder',
	 help='Output folder',
	 type=str, required=True)
	parser.add_argument('--clustersNumber',
	 help='Number of clusters',
	 type=int, default='3', choices=[2,3,4,5,6,7,8,9,10,11,12,13,14,15], required=False)
	parser.add_argument('--framesNumber',
	 help='Number of frames used in STA analysis',
	 type=int, default='20', required=False)
	parser.add_argument('--blockSize',
	 help='Size of each block in micrometres',
	 type=int, default='50', required=False)
	parser.add_argument('--clusteringAlgorithm',
	 help='Clustering algorithm to use: K-Means, Spectral Clustering, GMM',
	 type=str, default='kmeans', choices=['kmeans','spectral','gmm','densityPeaks'], required=False)
	parser.add_argument('--percentageDensityDistance',
	 help='Percentage used to calculate the distance',
	 type=float, default='2', required=False)
	 
	args = parser.parse_args()

	#Source folder of the files with the timestamps
	sourceFolder = rfe.fixPath(args.sourceFolder)
	if not os.path.exists(sourceFolder):
		print('')
		print('Source folder does not exist ' + sourceFolder)
		print('')
		sys.exit()

	#Output folder for the graphics
	outputFolder = rfe.fixPath(args.outputFolder)
	if not os.path.exists(outputFolder):
		try:
			os.makedirs(outputFolder)
		except:
			print('')
			print('Unable to create folder ' + outputFolder)
			print('')
			sys.exit()
	
	#Clusters number for the kmeans algorithm
	clustersNumber = args.clustersNumber

	#Frames used in STA analysis
	framesNumber = args.framesNumber
	
	#Size of each block in micrometres
	blockSize = args.blockSize
	
	#Clustering Algorithm
	clusteringAlgorithm = args.clusteringAlgorithm
	
	#dataCluster stores the data to be used for the clustering process
	#the size is equal to the number of frames, aka, the time component,
	#plus 7 as we are incorporating the two radii of the ellipse,
	#its angle, x position, y position, area and the unit name
	dataCluster = zeros((1,framesNumber+7))
	units = []
	dato = empty((1,1))
	for unitFile in os.listdir(sourceFolder):
		if os.path.isdir(sourceFolder+unitFile):	
			dato = empty((1,1))		
			unitName = unitFile.rsplit('_', 1)[0]
			#print unitName
			dataUnit, coordinates = rfe.loadSTACurve(sourceFolder,unitFile,unitName)
			xSize = dataUnit.shape[0]
			ySize = dataUnit.shape[1]
			fitResult = rfe.loadFitMatrix(sourceFolder,unitFile)
			dataUnitTemporal = dataUnit[coordinates[0][0],[coordinates[1][0]],:]
			#Time data from FITResult
			#dataUnitTemporal = rfe.loadVectorAmp(sourceFolder,unitFile).T
			#A radius of the RF ellipse
			aRadius = fitResult[0][2]
			dato[0] = aRadius
			dataUnitCompleta = concatenate((dataUnitTemporal,dato),1)
			#B radius of the RF ellipse
			bRadius = fitResult[0][3]
			dato[0] = bRadius
			dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
			#angle of the RF ellipse
			angle = fitResult[0][1]
			dato[0] = angle
			dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
			#X coordinate of the RF ellipse
			xCoordinate = fitResult[0][4]
			#print 'xCoordinate',xCoordinate
			dato[0] = xCoordinate
			dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
			#Y coordinate of the RF ellipse
			yCoordinate = fitResult[0][5]
			#print 'yCoordinate',yCoordinate
			dato[0] = yCoordinate
			dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
			#Area of the RF ellipse
			area = aRadius*bRadius*pi
			dato[0] = area
			dataUnitCompleta = concatenate((dataUnitCompleta,dato),1)
			#UnitName
			dato=empty(1, dtype='|S16')
			dato[0]=unitName
			dataUnitCompleta = concatenate((dataUnitCompleta,dato.reshape(1, 1)),1)
			
			dataCluster = append(dataCluster,dataUnitCompleta, axis=0)
			
			units.append(unitName)
	# remove the first row of zeroes
	dataCluster = dataCluster[1:,:]	
		
	#Temporal part only would be dataCluster[:,0:framesNumber];
	#here we keep the central window of the temporal curve (45%-90% of the frames).
	#Slice indices must be integers, hence the int() casts.
	data = dataCluster[:,int(framesNumber*.45):int(framesNumber*.9)]
	data = data.astype(float64, copy=False)
	
	# Round the data range out to the nearest multiple of 5 for the y-axis limits
	maxData =  ceil(amax(data)/5)*5
	minData = floor(amin(data)/5)*5

	if clusteringAlgorithm == 'spectral':
		from sklearn.cluster import SpectralClustering
		sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None, \
				random_state=None,  n_init=10, gamma=1.0, affinity='nearest_neighbors', \
				n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, \
				coef0=1, kernel_params=None)
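		# Note: with affinity='nearest_neighbors', sklearn warns when the k-NN
		# graph is not fully connected; increasing n_neighbors is the usual fix.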
		sc.fit(data)
		labels = sc.labels_
	elif clusteringAlgorithm == 'gmm':
		from sklearn import mixture
		# mixture.GMM is long deprecated; GaussianMixture is the modern equivalent
		gmix = mixture.GaussianMixture(n_components=clustersNumber, covariance_type='spherical')
		gmix.fit(data)
		labels = gmix.predict(data)
	elif clusteringAlgorithm == 'densityPeaks':
		import densityPeaks as dp
		percentageDensityDistance = args.percentageDensityDistance
		clustersNumber, labels = dp.predict(data, percentageDensityDistance)
	else:
		from sklearn.cluster import KMeans
		km = KMeans(init='k-means++', n_clusters=clustersNumber, n_init=10,n_jobs=-1)
		km.fit(data)
		labels = km.labels_
	
	dataFile = empty((1,framesNumber+9),dtype='|S16')
	datos = empty((1,framesNumber+7),dtype='|S16')
	dato = empty((1,1),dtype='|S16')
	for clusterId in range(clustersNumber):
		for unitId in range(dataCluster.shape[0]):
			if labels[unitId] == clusterId:			
				dato[0] = clusterId
				dataFileTmp = concatenate(([dataCluster[unitId,:]],dato),1)
				x = linspace(1, framesNumber, framesNumber)
				s = UnivariateSpline(x, dataCluster[unitId,0:framesNumber], s=0)
				xs = linspace(1, framesNumber, framesNumber*1000)
				ys = s(xs)
				
				media = mean(ys)
				maximo = amax(ys)
				minimo = amin(ys)
				maximaDistancia = absolute(maximo-media)
				minimaDistancia = absolute(minimo-media) 
				peakTempCurve = minimo
				if maximaDistancia > minimaDistancia:
					peakTempCurve = maximo
				dato[0] = unique(where(peakTempCurve==ys)[0])[0]
				dataFileTmp = concatenate((dataFileTmp,dato),1)
				dataFile = append(dataFile, dataFileTmp, axis=0)
		
	# remove the first placeholder row
	dataFile = dataFile[1:,:]
	savetxt(outputFolder+'outputFile.csv',dataFile, fmt='%s', delimiter=',', newline='\n')
		
	return 0
Example #55
0
#line = aline[0][0]
for line, _, _ in aline:
    print('LINE=', line)
    print(datapath + '/WHOLE/trazos.' + line + '.mat')
    matpeaks = scipy.io.loadmat(datapath + '/WHOLE/trazos.' + line + '.mat')
    print(matpeaks['Trazos'].shape)
    data = matpeaks['Trazos']
    data = normalize(data)  # assuming sklearn's normalize, which returns a copy rather than mutating in place

    if alg == 'spectral':
        spectral = SpectralClustering(n_clusters=nc, assign_labels='discretize',
                                      affinity='nearest_neighbors', n_neighbors=30)
    elif alg == 'kmeans':
        spectral = KMeans(n_clusters=nc, n_jobs=-1)

    spectral.fit(data)

    lab = spectral.labels_

    centers = np.zeros((nc, data.shape[1]))

    for i in range(data.shape[0]):
        centers[lab[i]] += data[i]

    print(len(lab))

    l = list(lab)

    c = Counter(l)

    print(c)
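    # `centers` holds per-cluster sums at this point; a small sketch to turn
    # them into mean vectors (centroids) using the counts gathered above:
    for k in c:
        centers[k] /= float(c[k])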
Example #56
0
#kmeans
km = KMeans(n_clusters = CLNO)
km_fit = km.fit(dfun)
km_clusters = km.labels_.tolist()
print(no_unique(km_clusters))

#affinity propagation
ap = AffinityPropagation()
ap_fit = ap.fit(dfun)
ap_clusters = ap.labels_.tolist()
print(no_unique(ap_clusters))

#spectral clustering
sc = SpectralClustering(CLNO)
sc_fit = sc.fit(dfun)
sc_clusters = sc.labels_.tolist()
print "spectral", no_unique(sc_clusters)

#ward

ac = AgglomerativeClustering(CLNO, connectivity = conn, linkage = 'ward')
ac_fit = ac.fit(dfun)
ac_clusters = ac.labels_.tolist()
print(no_unique(ac_clusters))

#output pd
data = {"km":km_clusters, "ap":ap_clusters, "sc":sc_clusters, "ac":ac_clusters}
df_cl = pd.DataFrame(data = data)
df_cl.to_csv("clusters.csv")
Example #57
0
""" Preprocessing """
import mypreprocessing as prp
data = prp.RowWiseNorm(data)

silh = []
comp = []
homo = []
vmea = []
from sklearn.cluster import SpectralClustering
from sklearn import metrics

for k in range(2, 11):
    print(k)
    km = SpectralClustering(n_clusters=k)
    km.fit(data)
    silh.append(metrics.silhouette_score(data, km.labels_))
    comp.append(metrics.completeness_score(projects_true, km.labels_))
    homo.append(metrics.homogeneity_score(projects_true, km.labels_))
    vmea.append(metrics.v_measure_score(projects_true, km.labels_))

plt.style.use('fivethirtyeight')
plt.plot(range(2,11), silh)
plt.plot(range(2,11), comp)
plt.plot(range(2,11), homo)
plt.plot(range(2,11), vmea)
plt.title('Spectral clustering, Row-wise Normalization')
plt.xlabel('k clusters')
plt.ylabel('Score')
plt.legend(['silhouette', 'completeness', 'homogeneity', 'v-measure'], loc='upper right')
plt.show()
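# To pick a k from these curves programmatically, one option (a sketch) is the
# k that maximizes the silhouette score:
import numpy as np
best_k = range(2, 11)[int(np.argmax(silh))]
print('best k by silhouette: {0}'.format(best_k))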
Example #58
0
def main():
	
	parser = argparse.ArgumentParser(prog='kmeans_scikit.py',
	 description='Performs K-means using scikit-learn',
	 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--sourceFolder',
	 help='Source folder',
	 type=str, required=True)
	parser.add_argument('--outputFolder',
	 help='Output folder',
	 type=str, required=True)
	parser.add_argument('--clustersNumber',
	 help='Number of clusters',
	 type=int, default='5', choices=[3,4,5,6,7,8,9,10,11,12,13,14,15], required=False)
	parser.add_argument('--framesNumber',
	 help='Number of frames used in STA analysis',
	 type=int, default='20', required=False)
	parser.add_argument('--pcaComponents',
	 help='Number of components for PCA',
	 type=int, default='4', required=False)
	parser.add_argument('--doPCA',
	 help='Performs clusterings with PCA or not',
	 type=bool, default=False, required=False)

	args = parser.parse_args()

	#Source folder of the files with the timestamps
	sourceFolder = rfe.fixPath(args.sourceFolder)
	if not os.path.exists(sourceFolder):
		print('')
		print('Source folder does not exist ' + sourceFolder)
		sys.exit()

	#Output folder for the graphics
	outputFolder = rfe.fixPath(args.outputFolder)
	if not os.path.exists(outputFolder):
		try:
			os.makedirs(outputFolder)
		except:
			print('')
			print('Unable to create folder ' + outputFolder)
			sys.exit()
	
	#Clusters number for the kmeans algorithm
	clustersNumber = args.clustersNumber

	#Frames used in STA analysis
	framesNumber = args.framesNumber
	
	#dataCluster stores the data to be used for the clustering process
	#the size is equal to the number of frames, aka, the time component,
	#plus 5 as we are incorporating the two radii of the ellipse,
	#its angle, x position and y position
	dataCluster = np.zeros((1,framesNumber+5))
	units=[]
	dato=np.zeros((1,1))
	for unitFile in os.listdir(sourceFolder):
		if os.path.isdir(sourceFolder+unitFile):			
			unitName = unitFile.rsplit('_', 1)[0]
			dataUnit, coordinates = rfe.loadSTACurve(sourceFolder,unitFile,unitName)
			xSize = dataUnit.shape[0]
			ySize = dataUnit.shape[1]
			fitResult = rfe.loadFitMatrix(sourceFolder,unitFile)
			#should we use the not-gaussian-fitted data for clustering?
			dataUnitGauss = scipy.ndimage.gaussian_filter(dataUnit[coordinates[0][0],[coordinates[1][0]],:],2)
			#A radius of the RF ellipse
			dato[0]=fitResult[0][2]
			dataUnitCompleta = np.concatenate((dataUnitGauss,dato),1)
			#B radius of the RF ellipse
			dato[0]=fitResult[0][3]
			dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
			#angle of the RF ellipse
			dato[0]=fitResult[0][1]
			dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
			#X coordinate of the RF ellipse
			dato[0]=fitResult[0][4]
			dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
			#Y coordinate of the RF ellipse
			dato[0]=fitResult[0][5]
			dataUnitCompleta = np.concatenate((dataUnitCompleta,dato),1)
			dataCluster = np.append(dataCluster,dataUnitCompleta, axis=0)
			units.append(unitName)
	# remove the first row of zeroes
	dataCluster = dataCluster[1:,:]	
	
	data = dataCluster[:,0:framesNumber+2]	
	sc = SpectralClustering(n_clusters=clustersNumber, eigen_solver=None, \
		random_state=None, n_init=10, gamma=1.0, affinity='nearest_neighbors', \
		n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, \
		coef0=1, kernel_params=None)
	sc.fit(data)
	labels = sc.labels_
	fit = metrics.silhouette_score(data, labels, metric='euclidean')
	rfe.graficaCluster(labels, dataCluster[:,0:framesNumber-1], outputFolder+'no_pca.png',clustersColours, fit)
	

	# generate graphics of all ellipses
	for clusterId in range(clustersNumber):
		dataGrilla = np.zeros((1,framesNumber+5))
		for unitId in range(dataCluster.shape[0]):
			if labels[unitId] == clusterId:
				datos=np.zeros((1,framesNumber+5))
				datos[0]=dataCluster[unitId,:]
				dataGrilla = np.append(dataGrilla,datos, axis=0)
		# remove the first row of zeroes
		dataGrilla = dataGrilla[1:,:]
		rfe.graficaGrilla(dataGrilla,outputFolder+'Grilla_'+str(clusterId)+'.png',clustersColours[clusterId],framesNumber,xSize,ySize)
		rfe.graficaCluster(labels, dataGrilla[:,0:framesNumber-1], outputFolder+'cluster_'+str(clusterId)+'.png',clustersColours[clusterId])
	
	rfe.guardaClustersIDs(outputFolder,units,labels,outputFolder+'clustering_no_pca.csv')
	
	if args.doPCA:
		pca = PCA(n_components=args.pcaComponents)
		newData = pca.fit_transform(data)
		sc.fit(newData)
		labels = sc.labels_  # refresh the labels; otherwise the pre-PCA assignment is scored below
		fit = metrics.silhouette_score(newData, labels, metric='euclidean')
		rfe.graficaCluster(labels, dataCluster[:,0:framesNumber-1], outputFolder+'pca.png',clustersColours,fit)	
		rfe.guardaClustersIDs(outputFolder,units,labels,outputFolder+'clustering_pca.csv')
	
	return 0
Example #59
0
if __name__ == "__main__":

    players = {}
    data = []
    names = []
    data_file = open("kda_200.txt", "r")

    # Build data from file
    for line in data_file:
        fields = line.split(",")
        data.append([float(fields[1]), float(fields[3]), float(fields[4]), float(fields[4])])
        names.append(fields[0])

    # Create and fit model
    clus = SpectralClustering(n_clusters=5, eigen_solver='arpack', affinity="nearest_neighbors")
    labels = clus.fit_predict(data)  # fit_predict() fits and returns the labels in one step

    # Sort the fitted data into 5 boxes, one for each role
    boxes = [[],[],[],[],[]]
    for x in range(len(data)):
        pred = labels[x]
        name = names[x]
        # names like "Amazing (Maurice Stuckenschneider)" are too long, cut at first space
        if " " in name:
            name = name[0:name.find(" ")+1]
        
        boxes[pred].append(name.ljust(10))

    # Get size of largest cluster so you can pad the others
    sizes = [len(boxes[0]), len(boxes[1]), len(boxes[2]), len(boxes[3]), len(boxes[4])]