Example #1
File: ooc.py Project: audy/bfc
def main():
    '''
        >>> main() # stuff happens
    '''

    args = parse_args()
    setup_logging(args.log, verbose=args.verbose)

    chunks = sequence_chunk_generator(args.fasta_file,
                                      chunk_size=args.chunk_size)

    hasher = HashingVectorizer(analyzer='char',
                               n_features=2 ** 18,
                               ngram_range=(args.ngram_min, args.ngram_max),
                               )

    estimator = AffinityPropagation()

    for chunk in chunks:

        logging.info('hashing chunk')
        chunk_vector = hasher.transform([ str(i.seq) for i in chunk ])

        logging.info('clustering')

        estimator.fit(chunk_vector)

        logging.info('got %s clusters', len(set(estimator.labels_)))
Example #2
def run_affinity_propagation(affinities, preference):
    ap = AffinityPropagation(affinity='precomputed', preference=preference)
    ap.fit(affinities)
    # print(affinities == ap.affinity_matrix_)
    cluster_centers_indices = ap.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    return n_clusters_
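A minimal usage sketch for the helper above, assuming NumPy and scikit-learn are imported as in the snippet's project; the data and preference value are illustrative. The affinities are negative squared Euclidean distances, the same similarity AffinityPropagation computes internally for its default affinity.

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

rng = np.random.RandomState(0)
# Toy data: two well-separated blobs (illustrative only).
X = np.vstack([rng.randn(10, 2), rng.randn(10, 2) + 5])

# Precomputed affinities: negative squared Euclidean distances.
affinities = -euclidean_distances(X, squared=True)

# The median similarity is scikit-learn's default preference.
print(run_affinity_propagation(affinities, preference=np.median(affinities)))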
Example #3
def test_affinity_propagation():
    # Affinity Propagation algorithm
    # Compute similarities
    S = -euclidean_distances(X, squared=True)
    preference = np.median(S) * 10
    # Compute Affinity Propagation
    cluster_centers_indices, labels = affinity_propagation(
        S, preference=preference, random_state=39
    )

    n_clusters_ = len(cluster_centers_indices)

    assert n_clusters == n_clusters_

    af = AffinityPropagation(
        preference=preference, affinity="precomputed", random_state=28
    )
    labels_precomputed = af.fit(S).labels_

    af = AffinityPropagation(preference=preference, verbose=True, random_state=37)
    labels = af.fit(X).labels_

    assert_array_equal(labels, labels_precomputed)

    cluster_centers_indices = af.cluster_centers_indices_

    n_clusters_ = len(cluster_centers_indices)
    assert np.unique(labels).size == n_clusters_
    assert n_clusters == n_clusters_

    # Test also with no copy
    _, labels_no_copy = affinity_propagation(
        S, preference=preference, copy=False, random_state=74
    )
    assert_array_equal(labels, labels_no_copy)
Example #4
def affinity_propagation(crime_rows, column_names):
    """
        damping : float, optional, default: 0.5
            Damping factor between 0.5 and 1.
        convergence_iter : int, optional, default: 15
            Number of iterations with no change in the number of estimated 
            clusters that stops the convergence.
        max_iter : int, optional, default: 200
            Maximum number of iterations.
        preference : array-like, shape (n_samples,) or float, optional
            Preferences for each point - points with larger values of preferences 
            are more likely to be chosen as exemplars. 
            The number of exemplars, i.e. of clusters, is influenced by the input 
            preferences value. If the preferences are not passed as arguments, 
            they will be set to the median of the input similarities.
        affinity : string, optional, default=``euclidean``
            Which affinity to use. At the moment precomputed and euclidean are 
            supported. euclidean uses the negative squared euclidean distance 
            between points.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Affinity Propagation")
    # TODO: Parameterize this
    affinity_prop = AffinityPropagation()
    #affinity_propagation_labels = affinity_prop.fit_predict(crime_xy)
    affinity_prop.fit(random_sampling(crime_xy, num_samples=5000))
    affinity_propagation_labels = affinity_prop.predict(crime_xy)
    print("formatting....")
    return _format_clustering(affinity_propagation_labels, crime_xy, crime_info, 
            column_names)
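The docstring above describes how the preference parameter steers the number of exemplars, but the body leaves it at the default (see the TODO). A hedged, self-contained sketch of that effect on synthetic data (values illustrative; random_state requires a recent scikit-learn):

import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=60, centers=3, random_state=0)

# Lower (more negative) preferences yield fewer exemplars, i.e. fewer clusters.
for preference in (None, -50, -500):
    ap = AffinityPropagation(preference=preference, random_state=0).fit(X)
    print(preference, len(ap.cluster_centers_indices_))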
Example #5
def cluster(scope):
    # Setup data
    df = pd.read_sql('playtype_data', db_engine)

    # Manipulate data into scope
    if scope == 'Team':
        df = df.drop(columns='Player').groupby('Team', as_index=False).mean()
    elif scope == 'Player':
        df = df.drop(columns='Team')
    else:
        raise ValueError('unexpected scope: %r' % scope)

    # Normalize the data
    df[FEATURES] = (df[FEATURES] - df[FEATURES].mean()) / (df[FEATURES].max() - df[FEATURES].min())

    # Run clustering
    clstr = AffinityPropagation()
    clstr.fit(df[FEATURES])

    # Clump results
    df['cluster'] = clstr.labels_
    df = df.sort_values('cluster')

    # Convert results to JSON for frontend
    return clusters_to_json(df, scope)
Example #6
    def clusterAffinityPropagation(self):
        """
        Cluster the embeddings with affinity propagation
        :return:
        """
        affin = AffinityPropagation()
        affin.fit(self.emb1.m)
        aflabels1 = affin.labels_
        afclusters1 = dict()
        word2cluster1 = dict()
        for i, l in enumerate(aflabels1):
            points = afclusters1.setdefault(l, list())
            points.append(self.emb1.rd[i])
        for l, c in afclusters1.items():
            for w in c:
                word2cluster1[w] = l
        self.cluster1 = afclusters1
        self.word2cluster1 = word2cluster1
        affin.fit(self.emb2.m)
        aflabels2 = affin.labels_
        afclusters2 = dict()
        word2cluster2 = dict()
        for i, l in enumerate(aflabels2):
            points = afclusters2.setdefault(l, list())
            points.append(self.emb2.rd[i])
        for l, c in afclusters2.items():
            for w in c:
                word2cluster2[w] = l
        self.cluster2 = afclusters2
        self.word2cluster2 = word2cluster2
Example #7
    def saxcluster(self, preference=None, lookup=True):

        cls = (AffinityPropagation(preference=preference, affinity='precomputed')
               if lookup else AffinityPropagation(preference=preference))
        if self.dists is None:
            if lookup:
                data = self.dists = self.__saxDists()
            else:
                data = self.dists = list(self.avdata.values())
        else:
            data = self.dists
        cls.fit(data)
        reps = list(self.indexes.keys())
        self.cluster_sax = [reps[i] for i in cls.cluster_centers_indices_]
        self.cluster_centers = [self.avdata[sax] for sax in self.cluster_sax]
        self.clusters = collections.defaultdict(list)
        index_values = list(self.indexes.values())
        for ind, label in enumerate(cls.labels_):
            sax = self.cluster_sax[label]
            self.clusters[sax] += index_values[ind]
        self.asax_data = dict()
        for sax in self.clusters:
            self.asax_data[sax] = self.data[self.clusters[sax], :].mean(axis=0)
        self.ass = [0] * self.N
        for sax in self.cluster_sax:
            v = self.cluster_sax.index(sax)
            for ind in self.clusters[sax]:
                self.ass[ind] = v
        self.n_clusters = len(self.clusters)
Example #8
def affinity_descriptor(descriptor_list):
    print("Affinity Propagation starting...")
    af = AffinityPropagation()
    af.fit(descriptor_list)
    visual_words = af.cluster_centers_
    print("Visual words are ready.")
    return visual_words
Example #9
	def clustering(self):
		# Calculate similarity matrix
		X = self.create_tfidf_vector()
		X = X.toarray()
		pca = PCA(n_components=300, copy=False)
		X = pca.fit(X).transform(X)
		S = cosine_similarity(X, X)
		# Run affinity propagation
		af = AffinityPropagation()
		af.fit(S)
		# Formulate result
		tmp_clusters = defaultdict(list)
		goal_clusters = defaultdict(list)
		cluster_centers_indices = af.cluster_centers_indices_
		labels = af.labels_
		count = 0
		for label in labels:
			center_goal = self.goal_list[cluster_centers_indices[label]]
			tmp_clusters[center_goal].append(self.goal_list[count])
			count += 1
		# 2nd-layer clustering of each cluster
		for goal, item_list in tmp_clusters.items():
			subclusters = self.subcluster_by_editdistance(goal, item_list)
			for subgoal, items in subclusters.items():
				goal_clusters[subgoal] = items
		return goal_clusters
Example #10
def make_cluster_map(damping=0.992):
	test_labels, prediction = pickle.load(open(f_path_pred, 'rb'))
	prob_conf = np.zeros((121, 121))
	for l in range(121):
		inds = np.squeeze(np.array(np.where(test_labels == l)))
		class_conf = prediction[inds, :].mean(axis=0)
		prob_conf[l, :] = class_conf
	F = prob_conf
	D = (1-F)
	np.fill_diagonal(D, 0)
	D_p = 0.5*(D+D.T)


	clst = AP(damping=damping, # damping determines # of clusters
			  max_iter=500, 
			  convergence_iter=15, 
			  affinity='euclidean', 
			  verbose=False)
	clst.fit(D_p)
	print('Number of clusters:', len(clst.cluster_centers_))
	membership = np.c_[range(121), clst.labels_]

	fine_to_coarse = dict(membership)
	coarse_to_fine = {l: [] for l in clst.labels_}
	for k, v in fine_to_coarse.items():
		coarse_to_fine[v].append(k)
		
	pickle.dump(coarse_to_fine, open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb'))
	pickle.dump(fine_to_coarse, open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb'))
Example #11
def affinity():
    # affinity propagation clustering
    from numpy import unique
    from numpy import where
    from sklearn.datasets import make_classification
    from sklearn.cluster import AffinityPropagation
    from matplotlib import pyplot

    # define dataset
    X, _ = make_classification(
        n_samples=1000,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=4,
    )
    print(X)
    # define the model
    model = AffinityPropagation(damping=0.9)
    # fit the model
    model.fit(X)
    # assign a cluster to each example
    yhat = model.predict(X)
    # retrieve unique clusters
    clusters = unique(yhat)
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(yhat == cluster)
        # create scatter of these samples
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
    # show the plot
    pyplot.show()
Example #12
def doAffinity(X):
    model = AffinityPropagation(damping=0.5,
                                max_iter=250,
                                affinity='euclidean')
    model.fit(X)
    clust_labels2 = model.predict(X)
    return clust_labels2
Example #13
    def optimize_recommend(self, param_set,
                           max_recommend=3,
                           gamma=1.0, delta=1.0,
                           gpr=None, Xd=None,
                           return_data=False):
        """Optimizes GPR model, using each data point as initial value
        
        Clusters the result using Affinity Propagation, and returns
        the cluster representatives, choosing the number of clusters
        automatically. The results are decoded into parameter sets."""

        x = self.optimize(gamma=gamma, delta=delta, gpr=gpr, Xd=Xd)
        aff = AffinityPropagation()
        aff.fit(x)
        #x_rec = pd.DataFrame(aff.cluster_centers_, columns=x.columns)
        # select the lowest validation loss from each cluster
        x_rec = pd.concat([x, pd.DataFrame({'cluster_id' : aff.labels_})], axis=1)
        x_rec.sort_values(by=['cluster_id', 'gpr_optimum'], inplace=True)
        x_rec = x_rec.groupby('cluster_id').first()
        x_rec.sort_values(by=['gpr_optimum'], inplace=True)
        
        if max_recommend < 1:
            max_recommend = x.shape[0]
        x_rec = x_rec.iloc[:max_recommend]
        #x_rec.index = range(len(x_rec))
        #x_rec = x_rec.drop(['gpr_optimum'], axis=1)
        paramdictlist = self.decode_dummies(x_rec, param_set)
        if return_data:
            return paramdictlist, x_rec
        else:
            return paramdictlist
Example #14
def get_region2label_table(X, clutter, damping, metric='cosine'):
    '''
    metric: cosine | iou
    '''
    # compute affinity
    if metric == 'cosine':
        A = cosine_similarity(X)
        A = A / 2. + .5
    elif metric == 'iou':
        raise RuntimeError("metric 'iou' is not implemented")

    pref = np.percentile(A, clutter)

    # bbox clustering
    af = AffinityPropagation(preference=pref,
                             affinity='precomputed',
                             damping=damping)
    af.fit(A)

    # p(l|r)
    # mat of N_label x N_region
    Tcr = A[:, af.cluster_centers_indices_]
    Tcr /= Tcr.sum(axis=1, keepdims=True)
    Tcr = Tcr.T

    return Tcr
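A small usage sketch for the helper above; the feature matrix is random and purely illustrative (affinity propagation is not guaranteed to converge on arbitrary random data), and clutter is interpreted as a percentile, as in the function body.

import numpy as np

# Random region features; rows are regions (illustrative only).
X = np.random.RandomState(0).rand(20, 8)

# clutter=50 sets the preference to the median cosine affinity.
Tcr = get_region2label_table(X, clutter=50, damping=0.9)
print(Tcr.shape)  # (n_labels, n_regions)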
Example #15
def affinity_propagation(words, algo="word2vec", use_model=False):
    """
        Uses wordnet similarity to cluster the words in the sentences
        :param words: input sentence
        :return: two lists which correspond the clusters
        """

    words = semantic_similarity.pos_filter(words, False, strict=False)
    words = np.asarray(words)  # So that indexing with a list will work
    if algo == "word2vec":
        lev_similarity = np.array([[semantic_similarity.word2vec_distance(w1, w2, use_model=use_model)
                                    for w1 in words] for w2 in words])

    if algo == "wordnet":
        lev_similarity = np.array([[semantic_similarity.word2vec_distance(w1, w2) for w1 in words] for w2 in words])

    if len(lev_similarity) < 2:
        return [[], []]
    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)
    if np.isnan(np.sum(affprop.labels_)):
        print "No labels"
        return [[], []]

    clusters = []
    flattened_cluster = []
    centroids = []
    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        centroids.append(exemplar)
        cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
        clusters.append(list(cluster))
        flattened_cluster.extend(cluster)

    return clusters, centroids
Example #16
def get_labels(data_as_list, algorithm='meanshift'):
    dt = np.array(data_as_list)
    labels = []

    print('    Algorithm =', algorithm)

    if algorithm == 'dbscan':
        dbs = DBSCAN(eps=0.1)
        dbs.fit(dt)
        labels = dbs.labels_

    elif algorithm == 'kmeans':
        kmeans = KMeans(n_clusters=10)
        kmeans.fit(dt)
        labels = kmeans.labels_

    elif algorithm == 'meanshift':
        # Estimate the bandwidth automatically; fall back to a default on failure.
        try:
            bandwidth = estimate_bandwidth(dt, quantile=0.2, n_samples=len(dt))
        except Exception:
            bandwidth = 0.5
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(dt)
        labels = ms.labels_

    elif algorithm == 'affinitypropagation':
        af = AffinityPropagation()
        af.fit(dt)
        labels = af.labels_

    return labels
Example #17
def APWithSimilaryMatrix(similaryMatrix):
    p = np.mean(similaryMatrix) * 2
    af = AffinityPropagation(max_iter=2000,
                             preference=p,
                             affinity='precomputed')
    af.fit(similaryMatrix)
    return (af.cluster_centers_indices_, af.labels_)
Example #18
def clusterSimilarityWithSklearnAPC(data_file, damping=0.9, max_iter=200, convergence_iter=15, preference='min'):
    """
    Compare Sparse Affinity Propagation (SAP) result with SKlearn Affinity Propagation (AP) Clustering result.
    Please note that the convergence condition for Sklearn AP is "no change in the number of estimated clusters",
    while for SAP the condition is "no change in the cluster assignment".
    So SAP may take more iterations, and there may be slight differences in the final cluster assignment (the exemplar for each sample).
    """
    # loading data
    simi_mat=loadMatrix(data_file)
    simi_mat_dense = simi_mat.toarray()

    # get preference
    if preference=='min':
        preference=np.min(simi_mat_dense)
    elif preference=='median':
        preference=np.median(simi_mat_dense)
    
    print('{0}, start SKlearn Affinity Propagation'.format(datetime.now()))
    af=AffinityPropagation(damping=damping, preference=preference, affinity='precomputed',verbose=True)
    af.fit(simi_mat_dense)
    cluster_centers_indices,labels = af.cluster_centers_indices_,af.labels_
    sk_exemplars=np.asarray([cluster_centers_indices[i] for i in labels])
    print('{0}, start Fast Sparse Affinity Propagation Cluster'.format(datetime.now()))
    sap=SAP(preference=preference,convergence_iter=convergence_iter,max_iter=max_iter,damping=damping,verboseIter=100)
    sap_exemplars=sap.fit_predict(simi_mat_dense)
    
    # Calculate similarity between sk_exemplars and sap_exemplars
    exemplars_similarity=sparseAP_cy.arrSamePercent(np.array(sk_exemplars), np.array(sap_exemplars))
    
    return exemplars_similarity
Example #19
def Affinity_Propagation(data, SBS, C, EP, CP, selected_products):
    ap = AffinityPropagation(preference=-200)
    ap.fit(data)

    n_clusters = len(ap.cluster_centers_)

    EP_Length = len(EP)
    # list of lists
    arr = [[] for i in range(n_clusters)]
    for i, j in enumerate(ap.labels_):
        arr[j].append(i)

    cluster_nos_of_selected_products = [
        ap.labels_[i] for i in selected_products
    ]

    # Run over the cluster from which majority of the products have been selected previously.
    cluster = max(set(cluster_nos_of_selected_products),
                  key=cluster_nos_of_selected_products.count)

    EP_New, CP_New = [], []
    for i in arr[cluster]:
        if i < EP_Length:
            EP_New.append(i)
        else:
            CP_New.append(i)

    return EP_New, CP_New, n_clusters
Example #21
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels) + 1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print('N_CLUSTERS=' + str(N_CLUSTERS))
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print('ERROR: clusterType: ' + clusterType + ' is not recognized')

        return (labels, N_CLUSTERS)
Example #22
def test_sparse_input_for_predict():
    # Test to make sure sparse inputs are accepted for predict
    # (non-regression test for issue #20049)
    af = AffinityPropagation(affinity="euclidean", random_state=42)
    af.fit(X)
    labels = af.predict(csr_matrix((2, 2)))
    assert_array_equal(labels, (2, 2))
Example #23
def affinity_propagation(feature_matrix):
    sim = feature_matrix * feature_matrix.T
    sim = sim.toarray()  # dense ndarray; newer scikit-learn rejects np.matrix
    ap = AffinityPropagation()
    ap.fit(sim)
    clusters = ap.labels_
    return ap, clusters
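A hedged sketch of driving the helper above with a sparse tf-idf matrix, whose product with its transpose is the document-similarity matrix the function expects; the corpus is illustrative.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "affinity propagation picks exemplars",
    "exemplars are picked by message passing",
    "k-means needs the number of clusters",
    "the number of clusters here is chosen automatically",
]
# TfidfVectorizer returns a sparse matrix, so the helper's .toarray() applies.
feature_matrix = TfidfVectorizer().fit_transform(corpus)

ap_model, clusters = affinity_propagation(feature_matrix)
print(clusters)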
Example #24
def get_clustered_data(data_matrix,
                       clustering_algorithm=model_constants.KMEANS,
                       distance_metric='euclidean',
                       num_clusters=3):
    if clustering_algorithm.lower() == model_constants.AFFINITY_PROP:
        aff_prop = AffinityPropagation(affinity=distance_metric)
        aff_prop.fit(data_matrix)
        return aff_prop.labels_, aff_prop
    elif clustering_algorithm.lower() == model_constants.DBSCAN:
        dbscan = DBSCAN(metric=distance_metric)
        dbscan.fit(data_matrix)
        return dbscan.labels_, dbscan
    elif clustering_algorithm.lower() == model_constants.OPTICS:
        optics = OPTICS(metric=distance_metric)
        optics.fit(data_matrix)
        return optics.labels_, optics
    elif clustering_algorithm.lower() == model_constants.MEANSHIFT:
        mean_shift = MeanShift()
        mean_shift.fit(data_matrix)
        return mean_shift.labels_, mean_shift
    elif clustering_algorithm.lower() == model_constants.BIRCH:
        birch = Birch(n_clusters=num_clusters)
        birch.fit(data_matrix)
        return birch.labels_, birch
    elif clustering_algorithm.lower() == model_constants.AGGLOMERATIVE:
        agglomerative = AgglomerativeClustering(n_clusters=num_clusters,
                                                affinity=distance_metric)
        agglomerative.fit(data_matrix)
        return agglomerative.labels_, agglomerative
    else:
        kmeans = KMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(data_matrix)
        return kmeans.labels_, kmeans
Example #25
def cluster(mat, doc_indices):
    X = mat[:, doc_indices].T
    # Other clustering algorithms can easily be swapped in:
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
    clust = AffinityPropagation()
    clust.fit(X)
    return list(zip(doc_indices, clust.labels_))
Example #26
def ward_method_clustering(nodes):
    """
    Performs agglomerative hierarchical clustering  of user or transaction addresses with similar behavior patterns.

    :param nodes: The nodes of the network graph
    :return: dict: A dictionary of addresses where keys are the cluster labels and values are members of the same
    cluster
    """

    result = {}  # exemplar -> cluster members
    levenshtein_distances = -1 * np.array(
        [[levenshtein_distance(w1, w2) for w1 in nodes] for w2 in nodes])
    affinity_propagation = AffinityPropagation(affinity="precomputed",
                                               damping=0.5)
    affinity_propagation.fit(levenshtein_distances)

    cluster_center_indices = affinity_propagation.cluster_centers_indices_
    unique_labels = np.unique(affinity_propagation.labels_)

    for cluster_id in unique_labels:
        cluster_list = []
        for index, node in enumerate(nodes):
            if index == cluster_center_indices[cluster_id]:
                exemplar = node
                list_of_names = np.nonzero(
                    affinity_propagation.labels_ == cluster_id)
                for i in list_of_names[0]:
                    if index == i:
                        cluster_list.append(node)
                cluster = np.unique(cluster_list)
                # cluster_str = ", ".join(cluster)
        result[exemplar] = cluster

    return result
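The snippet assumes a levenshtein_distance helper from its own module. A minimal sketch with a naive stand-in implementation (an assumption, not the project's code) shows the intended call; affinity propagation may not converge on very small toy inputs.

import numpy as np                          # imports the snippet itself relies on
from sklearn.cluster import AffinityPropagation

def levenshtein_distance(a, b):
    # Naive dynamic-programming edit distance (illustrative stand-in).
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = cur
    return prev[-1]

nodes = ["1A2b3c", "1A2b3d", "1A2b3e", "9Z8y7x", "9Z8y7w", "9Z8y7v"]
print(ward_method_clustering(nodes))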
Example #27
def affinity_propagation(principal_components, principal_df):
    final_df = pd.concat([principal_df], axis=1)
    model = AffinityPropagation(damping=0.9, random_state=0)
    # fit the model
    model.fit(principal_components)
    # assign a cluster to each example
    y_hat = model.predict(principal_components)
    # retrieve unique clusters
    clusters = unique(y_hat)
    final_df['Segment'] = model.labels_
    # create scatter plot for samples from each cluster
    for cluster in clusters:
        # get row indexes for samples with this cluster
        row_ix = where(y_hat == cluster)
        # create scatter of these samples
        plt.scatter(principal_components[row_ix, 0],
                    principal_components[row_ix, 1],
                    s=75)
    final_df.rename({0: 'PC1', 1: 'PC2', 2: 'PC3', 'y': 'Race'},
                    axis=1, inplace=True)
    plt.title("Affinity Propagation")
    add_race_labels(final_df)
    calc_silhouette(data=principal_components,
                    prediction=y_hat,
                    n_clusters=len(clusters))
    return final_df
Example #28
    def __dtw_clustering(self, seq_f):
        ### Clustering sequences using affinity propagation, dtw
        ### Computing similarity/affinity matrix using dtw
        p_dist = np.zeros((len(seq_f), len(seq_f)))
        if isinstance(seq_f[0], tuple):
            seq = [item[0] for item in seq_f]
            freq = np.array([item[1] for item in seq_f])
        else:
            seq = seq_f
            # Assumed fallback: uniform frequencies when no counts are provided,
            # so the preference computation below stays defined.
            freq = np.ones(len(seq_f))

        for i in range(len(seq)):
            for j in range(i, len(seq)):
                p_dist[i][j] = self.__pattern_distance(seq[i], seq[j])
                if i != j:
                    p_dist[j][i] = p_dist[i][j]

        p_dist_max = np.max(p_dist)
        if p_dist_max == 0:
            p_dist_max = 2
        p_dist = p_dist_max - p_dist

        ### Affinity Propagation
        freq = 2 * p_dist_max * freq / max(freq)
        ap = AffinityPropagation(affinity='precomputed', preference=freq)
        ap.fit(p_dist)

        ### Arranging sequences by cluster label
        cluster_subseqs = dict()
        for seq, label in zip(seq_f, ap.labels_):
            if label not in cluster_subseqs:
                cluster_subseqs.update({label: [seq]})
            else:
                cluster_subseqs[label].append(seq)

        return cluster_subseqs
Example #29
def affinitypropagation(params):
    distance_path = params["distance_path"]
    print(distance_path)
    distance = np.loadtxt(distance_path, dtype=np.float32)
    print(distance.shape)
    delta = 2
    affinity = np.exp(-distance ** 2 / (2. * delta ** 2))

    # using default values; set affinity to 'precomputed'
    aff = AffinityPropagation(affinity='precomputed')
    print(aff)

    aff.fit(affinity)
    # get labels
    labels = aff.labels_

    print(labels, labels.shape)
    # get number of clusters
    no_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(no_clusters, "no_clusters")

    #for i in range(no_clusters):
        #print('Cluster  : ', np.nonzero(labels == i)[0])

    #print(type(labels))
    return_val=tuple(labels.tolist())
    #print(type(return_val))
    return return_val
Example #30
def affinity_propagation(dataset,
                         axis,
                         preference,
                         affinity,
                         damping=0.5,
                         max_iter=200,
                         convergence_iter=15,
                         copy=True,
                         verbose=False):
    """
    Helper around sk-learn AffinityPropagation function.
    """

    af = AffinityPropagation(damping=damping,
                             max_iter=max_iter,
                             convergence_iter=convergence_iter,
                             copy=copy,
                             preference=preference,
                             affinity=affinity,
                             verbose=verbose)

    if axis == 0:
        af.fit(dataset.T)
    elif axis == 1:
        af.fit(dataset)

    return af
Example #31
    def cluster(self, feat_mtx, df_lm_allusers):
        # clustering artists based on AffinityPropagation
        start = time.time()
        af = AffinityPropagation()
        af.fit(feat_mtx)
        self.labels = af.labels_
        self.af = af

        # adding cluster labels to least misery dataframe and sorting by rank and cluster
        # df_least_misery_clustered = self.df_least_misery.copy() --> changed to df_lm_allusers
        print('number of labels:', len(self.labels))
        print('labels', self.labels)

        # print('least misery clustered length', len(df_least_misery_clustered))

        df_least_misery_clustered = df_lm_allusers.copy()
        print('len df least misery:', len(df_least_misery_clustered))
        
        df_least_misery_clustered['cluster'] = self.labels
        df_least_misery_clustered[['cluster', self.score_col]] = df_least_misery_clustered[['cluster', self.score_col]].astype(float)
        ''' will do different sorting if not using rank '''
        # now set to False as looking for highest score
        df_least_misery_clustered = df_least_misery_clustered.sort_values(
            ['cluster', self.score_col], ascending=False)
        self.df_least_misery_clustered = df_least_misery_clustered
        end = time.time()
        print('clustering completed in:', end - start)
        return df_least_misery_clustered
Example #32
def reduce(lines):
    if lines is not None:
        af = AffinityPropagation(preference=-.01)
        af.fit(lines[:, 0] / np.array([[300, 1]]))

        real_lines = af.cluster_centers_ * np.array([[300, 1]])
        return np.expand_dims(real_lines, 1)
Example #33
def cluster_analyze(dataframe, cluster_type='KMeans', n_clusters=None):

    from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation, SpectralClustering, Birch

    import matplotlib.pyplot as plt
    import numpy as np
    import time

    df_mat = dataframe.values
    if cluster_type == 'KMeans':
        assert n_clusters, "Number of clusters argument mandatory"
        cluster_callable = KMeans
        # seed of 10 for reproducibility.
        clusterer = cluster_callable(n_clusters=n_clusters, random_state=10)
    elif cluster_type == 'dbscan':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        cluster_callable = DBSCAN
        clusterer = cluster_callable(eps=0.5)
    elif cluster_type == 'affinity_prob':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        clusterer = AffinityPropagation(damping=.9, preference=-200)
    elif cluster_type == 'spectral':
        assert n_clusters, "Number of clusters argument mandatory"
        clusterer = SpectralClustering(n_clusters=n_clusters,
                                       eigen_solver='arpack',
                                       affinity="nearest_neighbors")
    elif cluster_type == 'birch':
        assert not n_clusters, "Number of clusters irrelevant for cluster type : %s" % (
            cluster_type)
        clusterer = Birch(n_clusters=2)
    else:
        raise ValueError("Unknown clustering algorithm type")
    plt.figure(figsize=(2 + 3, 9.5))
    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)
    #plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,hspace=.01)
    t0 = time.time()
    clusterer.fit(df_mat)
    t1 = time.time()
    if hasattr(clusterer, 'labels_'):
        y_pred = clusterer.labels_.astype(int)
    else:
        y_pred = clusterer.predict(df_mat)
    dataframe['y_pred'] = y_pred
    # plot
    plt.title(cluster_type, size=18)
    plt.scatter(df_mat[:, 0],
                df_mat[:, 1])  # color=colors[y_pred].tolist(), s=10)

    if hasattr(clusterer, 'cluster_centers_'):
        centers = clusterer.cluster_centers_
        center_colors = colors[:len(centers)]
        plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
    plt.show()
Example #35
def affinity_propagation(X, args=None):
    """
    AffinityPropagation clustering: a graph-based clustering method
    """
    from sklearn.cluster import AffinityPropagation
    model = AffinityPropagation(**(args or {}))
    model.fit(X)
    return model
Example #36
def ap(X):
    x, params = X
    x[np.isnan(x)] = 0.0
    x[np.isinf(x)] = 0.0
    x = x - 1
    af = AffinityPropagation(**params)
    af.fit(-x)
    labs = af.labels_
    return labs
Example #37
def get_partition(matrix, preference, damping=0.75):

    cl = AffinityPropagation(damping=damping,
                             affinity='precomputed',
                             preference=preference)

    cl.fit(matrix)
    partition = cl.labels_
    return partition
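A hedged sketch for the helper above: a negative squared-distance similarity matrix with its median as the preference, a common starting point (data illustrative).

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

X = np.random.RandomState(1).rand(30, 4)
matrix = -euclidean_distances(X, squared=True)
partition = get_partition(matrix, preference=np.median(matrix))
print(np.unique(partition))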
Example #38
def getNumClusters(doc_vectors):
    '''
    Given a list of document vectors as returned by makeDocumentVectors,
    this function runs affinity propagation on the vectors to approximate
    the number of clusters the documents would fall into
    '''
    clf = AffinityPropagation()
    clf.fit(doc_vectors)
    return len(clf.cluster_centers_indices_)
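A quick sketch of calling the helper on synthetic document vectors (random data, purely illustrative):

import numpy as np

rng = np.random.RandomState(0)
# Fake "document vectors": two groups of 50-dimensional points.
doc_vectors = np.vstack([rng.randn(15, 50), rng.randn(15, 50) + 3])
print(getNumClusters(doc_vectors))  # approximate number of clusters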
Example #39
    def affinityClustering(series):
        vectors = series.tolist()
        #Clustering
        affinity = AffinityPropagation()
        affinity.fit(vectors)

        #Cluster
        y_affinity = affinity.predict(vectors)
        return y_affinity
Example #40
def affinity_propagation(feature_matrix):
    '''
    Affinity propagation clustering
    '''

    ap = AffinityPropagation()
    ap.fit(feature_matrix.toarray())
    clusters = ap.labels_
    return ap, clusters
Example #41
    def cluster_prop(self, filtered_data):
        prop_dict={}

        for review in filtered_data:
            for dicti in review['line']:
                prop = dicti["prop"][0]
                if prop not in prop_dict:
                    prop_dict[prop] = {"freq": 0, "data": [], "idx": []}

                prop_dict[prop]['idx'].append(review['index'])
                prop_dict[prop]["freq"] += 1
                prop_dict[prop]["data"].append(dicti)

        d_list = []
        word_list = []

        for word in prop_dict:
            try:
                d_list.append(self.wmodel[word])
                word_list.append(word)
            except KeyError:
                # skip words missing from the embedding model
                pass

        Aprop = AffinityPropagation(damping=0.6, convergence_iter=100, max_iter=10000)
        Aprop.fit(d_list)
        cluster_dict = {}

        for idx, each in enumerate(Aprop.labels_):
            if each not in cluster_dict:
                cluster_dict[each] = {"word": [], "freq": 0, "seed": "", "sim": 0.0}
            cluster_dict[each]["word"].append(word_list[idx])
        for each in cluster_dict.keys():
            cluster_freq = 0
            max_seed = ""
            max_freq = 0

            for idx, data in enumerate(cluster_dict[each]["word"]):
                cluster_freq += prop_dict[data]["freq"]
                if prop_dict[data]["freq"] > max_freq:
                    max_freq = prop_dict[data]["freq"]
                    max_seed = data

            cluster_dict[each]["freq"] = cluster_freq
            cluster_dict[each]["seed"] = max_seed

        return (cluster_dict, prop_dict, Aprop)
Example #42
def clustering_affinity_propagation(data_res):
    """
    Executes sklearn's affinity propagation function with the given data frame
    """
    af = AffinityPropagation()
    af.fit(data_res)

    predictions = af.predict(data_res)
    cluster_centers = af.cluster_centers_

    return predictions, cluster_centers, af
Example #43
def affinityprop(lngs, lats, city, cluster_diameter):
	city_area = city["area"]
	city_lng = city["lng"]
	city_lat = city["lat"]
	lngs = np.array(lngs)#*(math.cos(city["lat"])**2)

	affinity = AffinityPropagation(damping=0.75, max_iter=200, convergence_iter=15, copy=True, preference=None, affinity='euclidean', verbose=False)
	affinity.fit(np.array([lngs, lats]).transpose())
	cluster_labels = np.array(affinity.labels_)

	return labels_to_index(cluster_labels)
Example #44
def cluster_concepts(context="location"):
    """
	Cluster related concepts of a specific type to different categories
	"""
    db = Database()
    concept_category = ConceptCategory()
    cmd = "SELECT * FROM %s" % (context)
    context_res = db.query_db(cmd)
    concept_list = []
    concept_matrix = []
    for item in context_res:
        concept_list = []
        concept_matrix = []
        if context == "action":
            context_id, context_chinese, context_name = item[:3]
        elif context == "location":
            context_id, context_name, context_chinese = item
        cmd = (
            "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \
				WHERE a.%s_id = %s AND a.concept_id = b.id"
            % (context, context, context_id)
        )
        concept_res = db.query_db(cmd)
        if len(concept_res) == 0:
            continue
        for item in concept_res:
            concept, concept_id = item
            concept_vector = concept_category.concept_axes.row_named(concept)
            concept_list.append((concept_id, concept))
            concept_matrix.append(concept_vector)
        # Run affinity propagation
        S = cosine_similarity(concept_matrix, concept_matrix)
        af = AffinityPropagation()
        af.fit(S)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        count = 0
        clusters = defaultdict(list)
        for label in labels:
            clusters[concept_list[cluster_centers_indices[label]][1]].append(concept_list[count])
            count += 1
        category_num = 0
        for key, value in clusters.items():
            category_num += 1
            for concept in value:
                cmd = (
                    "UPDATE %s_concept SET category = %d WHERE \
						%s_id = %s AND concept_id = %s"
                    % (context, category_num, context, context_id, concept[0])
                )
                db.query_db(cmd)
                print(concept[1], end=" ")
            print("")
        print("----------" + context_chinese + "----------")
Example #45
def train_model(X, quantile, shift=0, isKernel=False):
    if not isKernel:
        preference = np.percentile(X, q=quantile) - shift
        model_affinityPropagation = AffinityPropagation(preference=preference)
        model_affinityPropagation.fit(X)
        return model_affinityPropagation
    else:
        kernel = pairwise_kernels(X, metric="rbf")
        model_affinityPropagation = AffinityPropagation(affinity='precomputed',
                                                        preference=np.percentile(kernel, q=0.318))
        model_affinityPropagation.fit(kernel)
        return model_affinityPropagation
Example #46
	def do_issue(data, data_name):
		reduced_points, labels, km = reduce_npoints_kmeans(dataframe = data, dataset_name = dataset, data_name=data_name, n_datapoints = 1000, load_from_file = False)
		transformed_data, pca, components = calculate_pca(reduced_points, n_components=3)
		colormap = brewer2mpl.get_map('RdBu', 'diverging', 4, reverse=True)
		filename = figure_save_path + dataset + '_issue_29_1_%s_reduced_number_of_points.png'%data_name
		print "Making scatter plot of %s data for dataset %s, where the number of points have been reduced by K-Means clustering"%(data_name, dataset)
		make_color_grouped_scatter_plot(data_frame=transformed_data, x_name='d1', y_name='d2', color_by='d3', filename=filename, colormap=colormap)

		ap = AffinityPropagation(damping=affinity_damping)
		ap.fit(reduced_points)
		print "Making scatter plot of Affinity Propagation clusters of %s data for dataset %s"%(data_name, dataset)
		filename = figure_save_path + dataset + '_issue_29_2_%s_affinity.png'%data_name
		make_scatter_plot_for_labelled_data(data_frame=transformed_data, x_name='d1', y_name='d2', labels=ap.labels_, filename=filename, colormap = colormap, legend=True)	
Example #47
    def cluster(self, feat_mtx):
        # clustering artists based on AffinityPropagation
        af = AffinityPropagation()
        af.fit(feat_mtx)
        self.labels = af.labels_
        self.af = af

        # adding cluster labels to least misery dataframe and sorting by rank and cluster
        df_least_misery_clustered = self.df_least_misery.copy()
        df_least_misery_clustered['cluster'] = self.labels
        df_least_misery_clustered[['cluster', self.score_col]] = df_least_misery_clustered[['cluster', self.score_col]].astype(float)
        ''' will do different sorting if not using rank '''
        df_least_misery_clustered = df_least_misery_clustered.sort_values(['cluster', self.score_col])
        return df_least_misery_clustered
Example #48
    def affinity_propagation(self, affinity_matrix=None, sigma=1, **kwargs):
        """

        :param kwargs: damping=0.5, max_iter=200, convergence_iter=15, copy=True, preference=None, verbose=False
        :return:
        """
        if affinity_matrix is None:
            aff = rbf(self.dm.values, sigma)
        else:
            aff = affinity_matrix

        est = AffinityPropagation(affinity='precomputed', **kwargs)
        est.fit(aff.view(np.ndarray))
        return Partition(est.labels_)
Example #49
def loadKmeansData(dataArrayTest,dataArrayTrain,k,m='load'):
    if m == 'load':
        centroidRead = open('centroid', 'rb')
        labelClusterRead = open('labelCluster', 'rb')
        labelPreRead = open('labelPre', 'rb')
        centroid = pickle.load(centroidRead)
        labelCluster = pickle.load(labelClusterRead)
        labelPre = pickle.load(labelPreRead)
    else:
        dataArrayTestNorm = preprocessing.normalize(dataArrayTest)
        dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain)
        #clf=MiniBatchKMeans(init='k-means++', n_clusters=k, n_init=10)
        clf=AffinityPropagation()
        #clf=DBSCAN(min_samples=30)
        pre=clf.fit(dataArrayTrainNorm)


        centroid=pre.cluster_centers_
        centroidWrite = open('centroid', 'wb')
        #pickle.dump(centroid,centroidWrite)

        labelCluster=pre.labels_
        labelClusterWrite = open('labelCluster', 'wb')
        #pickle.dump(labelCluster,labelClusterWrite)

        labelPre=clf.predict(dataArrayTestNorm)
        labelPreWrite = open('labelPre', 'wb')
        #pickle.dump(labelPre,labelPreWrite)

    return centroid,labelCluster,labelPre
Example #50
    def create_stratum(self, column_names, **kwargs):
        '''
        Use affinity propagation to find the number of strata for each column.
        column_names is a list of the covariates to be split into strata and
        used for classification. This function adds a column to the data frame
        for each column as column_name_strata that gives the strata designation
        for that variable.  The whole data frame is returned.
        '''

        for colname in column_names:
            X = self.data[colname].values.reshape(-1, 1)
            
            if np.isnan(X).any():
                raise ValueError("There are NaN values in self.data[%s] that the \
                                  clustering algorithm can't handle" % colname)
                                  
            elif np.unique(self.data[colname]).shape[0] <= 2:
                string_name = colname+'_strata'
                self.data[string_name] = self.data[colname].astype(int)
        
            else:
                af_model = AP(damping=0.9)
                strata_groups = af_model.fit(X)
                
                #cluster_centers_indices = af.cluster_centers_indices_
                #n_clusters_ = len(cluster_centers_indices)
                
                string_name = colname+'_strata'
                self.data[string_name] = strata_groups.labels_
                
        return self.data
Example #51
def affinity_propagation_cluster_analysis(x, y, preference):
    # NOT WORKING BECAUSE I DONT REALLY UNDERSTAND WHAT IT DOES...
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_affinity_propagation.html#example-cluster-plot-affinity-propagation-py
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    af = AffinityPropagation(preference=preference)
    af = af.fit(X)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in range(len(np.unique(labels))):
        my_members = labels == i
        cluster_center = X[cluster_centers_indices[i]]
        plt.scatter(X[my_members, 0], X[my_members, 1], s=90, c=colors[i], alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1], marker='+', s=280, c=colors[i])
        for j in X[my_members]:
            plt.plot([cluster_center[0], j[0]], [cluster_center[1], j[1]], c=colors[i], linestyle='--')
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
Example #52
def affinity_propagation(x, damping=0.9):
  ap = AffinityPropagation(
    damping=damping, 
    max_iter=400, 
    convergence_iter=30, 
    copy=True, 
    preference=None, 
    affinity='euclidean', 
    verbose=False
  )
  ap.fit(x)
  centroids = ap.cluster_centers_
  c = ap.labels_
  k = len(centroids)
  
  return ap, (centroids, c, k)
Example #53
def AlloyClustering(k):
    alloy_data = data_parser.parse("../../AlloyComps.csv")
    data = np.asarray(alloy_data.get_data(["Cu","Ni","Mn","P","Si","C"]))
    #est = KMeans(n_clusters=k)
    #est = AgglomerativeClustering(n_clusters = k)
    est = AffinityPropagation()
    est.fit(data)

    labels = est.labels_
    '''print(len(labels))
    for i in range(k):
        print("Cluster #{}".format(i))
        print(np.asarray(alloy_data.get_data("Alloy"))[np.where(labels == i)])
        print()'''

    return (labels,alloy_data)
Example #54
	def runAffinityPropagation(self):
		'''
			This function runs the affinity propagation algorithm
		'''
		distMatrix = distance.squareform(distance.pdist(self.coordinates, 'cosine'))
		# convert cosine distances to similarities in [0, 2]
		distMatrix = 2 - distMatrix
		model = AffinityPropagation(damping = self.damping, max_iter = self.max_iter,affinity = 'precomputed')
		model.fit(distMatrix)
		self.center_id = model.cluster_centers_indices_.tolist()
		belongs = model.labels_.tolist()
		for i in range(len(belongs)):
			self.assignments[i]['assignment'] = 'centroid_' + str(belongs[i] + 1)
		self.silhouetteScore = metrics.silhouette_score(distMatrix, model.labels_, metric = 'cosine')
		trueLabel = dataProcessing.getTrueLabel(self.assignments)
		self.adjustedScore = metrics.adjusted_rand_score(belongs, trueLabel)
Example #55
def dataset_fringes(X, cluster_algo, min_compression=64):
    if cluster_algo =='none' or len(X) <= min_compression:
        return X
    elif cluster_algo == 'AffinityPropagation':
        algo = AffinityPropagation()
        D = -spsp.distance.squareform(sp.spatial.distance.pdist(X))
        algo.fit(D)
        return X[algo.cluster_centers_indices_]
    elif cluster_algo == 'DBSCAN':
        algo = DBSCAN(metric='precomputed', min_samples=2)
        D = -spsp.distance.squareform(sp.spatial.distance.pdist(X))
        labels = algo.fit(D).labels_
        return NearestCentroid().fit(X, labels).centroids_
    elif cluster_algo == 'svm_outlier':
        algo = svm.OneClassSVM(nu=0.95 * 0.25 + 0.05,
                               kernel="rbf") #, gamma=0.1)
        #UNFINISHED!!!
    else:
        print("unknown cluster_algo: %r" % cluster_algo)
Example #56
def affinity_umi_removal(molecular_barcodes, allowed_mismatches):
    """
    Tries to find clusters of similar UMIs using an affinity based approach. 
    It returns a list with all the non clustered UMIs, for clusters of 
    multiple UMIs a random one will be selected.
    :param molecular_barcodes: a list of UMIs
    :param allowed_mismatches: passed to the naive fallback for short lists
    :return: a list of unique UMIs
    :rtype: list
    """
    if len(molecular_barcodes) <= 2:
        return countUMINaive(molecular_barcodes, allowed_mismatches)
    words = np.asarray(molecular_barcodes)
    lev_similarity = -1 * np.array([[hamming_distance(w1,w2) for w1 in words] for w2 in words])
    affprop = AffinityPropagation(affinity="precomputed", damping=0.5)
    affprop.fit(lev_similarity)
    unique_clusters = list()
    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
        unique_clusters.append(random.choice(cluster))
    return unique_clusters
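The snippet relies on hamming_distance and countUMINaive from its own module. The sketch below stubs a minimal Hamming distance for equal-length UMIs (an assumption, not the project's code) so the clustering path can run on a toy list:

import random                               # imports the snippet itself relies on
import numpy as np
from sklearn.cluster import AffinityPropagation

def hamming_distance(a, b):
    # Number of mismatching positions; UMIs are assumed equal-length strings.
    return sum(c1 != c2 for c1, c2 in zip(a, b))

umis = ["AAAA", "AAAT", "AAAC", "GGGG", "GGGC", "GGGT"]
print(affinity_umi_removal(umis, 1))  # one representative UMI per cluster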
Example #57
    def _internal(preferences, affinity_matrix, dist_matrix,
                  idx, n_jobs, n, queue_y):
        for i in range(idx, n, n_jobs):
            ap = AffinityPropagation(preference=preferences[i],
                                     affinity='precomputed',
                                     max_iter=500)
            ap.fit(affinity_matrix)

            cluster_labels = ap.labels_.copy()
            nclusts = np.unique(cluster_labels).shape[0]
            save_results_clusters("res_ap_{:03d}_clust.csv"
                                  .format(nclusts),
                                  sample_names, ap.labels_)

            if nclusts > 1:
                try:
                    silhouette_list = silhouette_samples(dist_matrix, ap.labels_,
                                                         metric="precomputed")
                    queue_y[i] = np.mean(silhouette_list)
                except BaseException:
                    print(dist_matrix.shape, ap.labels_.shape)
Example #58
def get_label_res2(similar_matrix, n_subs):

    cluster = AffinityPropagation(damping=0.75, affinity='precomputed')  # preference=-1000, n_clusters=n_subs

    res = cluster.fit(similar_matrix)

    size_labels = len(set(res.labels_))
    assert size_labels < 10, size_labels
    assert size_labels > 1, size_labels

    print(res.labels_)
    return res.labels_
Example #59
def compute_threshold(affmat):
    """
    This function uses affinity propagation to cluster the sequences, and then
    computes minimum of minimum in-cluster pairwise identities to be used as a
    threshold value.
    """
    ap = AffinityPropagation(affinity='precomputed')
    ap.fit(affmat)

    clusters = pd.DataFrame([i for i in zip(affmat.index, ap.labels_)])
    clusters = clusters.set_index(0)
    clusters.columns = ['Cluster']

    minval = 1
    for group in clusters.groupby('Cluster'):
        accessions = group[1].index
        subset = affmat[accessions].loc[accessions, :]

        subset_min = np.matrix(subset).min()
        if subset_min < minval:
            minval = subset_min

    return minval
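A small sketch of the expected input for compute_threshold: a symmetric pairwise-identity DataFrame indexed and labelled by accession (values illustrative; the snippet itself assumes pandas, NumPy, and scikit-learn imports).

import pandas as pd

accessions = ["seq1", "seq2", "seq3", "seq4"]
identities = [[1.00, 0.95, 0.20, 0.22],
              [0.95, 1.00, 0.21, 0.25],
              [0.20, 0.21, 1.00, 0.90],
              [0.22, 0.25, 0.90, 1.00]]
affmat = pd.DataFrame(identities, index=accessions, columns=accessions)
print(compute_threshold(affmat))  # minimum within-cluster pairwise identity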