Example #1
    def __init__(self,
                 categories,
                 replacement,
                 selection=None,
                 lam=500,
                 theta_init=None):
        """
        Parameters
        ----------
        replacement : eda.optimizer.replacement.replacement_base.ReplacementBase
            Replacement method.
        selection : eda.optimizer.selection.selection_base.SelectionBase, default None
            Selection method.
        """
        super(AffEDA, self).__init__(categories,
                                     lam=lam,
                                     theta_init=theta_init)
        self.replacement = replacement
        self.selection = selection

        self.population = None
        self.fitness = None
        self.cluster = None
        self.ap = cluster.AffinityPropagation(affinity="precomputed",
                                              random_state=0)
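
A minimal sketch (not from the source project) of the pattern this constructor sets up: with affinity="precomputed", AffinityPropagation is fit on a square similarity matrix rather than on raw features. The toy data and the negated-distance construction below are assumptions for illustration.

import numpy as np
from sklearn import cluster
from sklearn.metrics.pairwise import euclidean_distances

X = np.array([[0.0], [0.1], [5.0], [5.2]])
S = -euclidean_distances(X)  # negated distances serve as similarities
ap = cluster.AffinityPropagation(affinity="precomputed", random_state=0)
labels = ap.fit_predict(S)   # one cluster label per sample, e.g. [0 0 1 1]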
Example #2
    def do_algo(self, input):
        control_params = input.algo_control.control_params
        if not self.check_input_params(self.get_input_params_definition(), control_params):
            log.error("Check input params type error.")
            return None
        mode = input.algo_control.mode
        data = input.algo_data.data

        if mode == 'training':
            try:
                model = cluster.AffinityPropagation(
                    damping=control_params["damping"],
                    preference=control_params["preference"],
                    convergence_iter=control_params["convergence_iter"],
                    max_iter=control_params["max_iter"]
                )
                model.fit(data)

                algo_output = alc.AlgoParam(algo_control={'mode': 'training', 'control_params': ''},
                                            algo_data={'data': data, 'label': None},
                                            algo_model={'model_params': model.get_params(), 'model_instance': model})
            except Exception as e:
                log.error(str(e))
                algo_output = None
        else:
            algo_output = None
        return algo_output
Example #3
def _affinity_propagation(feature, ground_truth, p_d):
    p = p_d['preference']
    d = p_d['damping_factor']
    if p_d.get('affinity') == 'precomputed':
        connectivity = kneighbors_graph(feature,
                                        n_neighbors=p_d['n_neighbors'],
                                        include_self=True)
        affinity_matrix = 0.5 * (connectivity + connectivity.T)
        affinity_matrix = np.asarray(affinity_matrix.todense(), dtype=float)
        af = cluster.AffinityPropagation(
            damping=d, affinity='precomputed').fit(affinity_matrix)
    else:
        af = cluster.AffinityPropagation(preference=p, damping=d).fit(feature)
    y_pred_af = af.labels_
    ars_af = metrics.adjusted_rand_score(ground_truth, y_pred_af)
    return ars_af
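
A hedged usage sketch for _affinity_propagation: synthetic blobs stand in for the real features and ground truth, and p_d carries the two keys the non-precomputed branch reads.

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=60, centers=3, random_state=0)
params = {'preference': -50, 'damping_factor': 0.9}
ars = _affinity_propagation(X, y, params)  # adjusted Rand score in [-1, 1]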
Example #4
    def ClusterHouses(matches, plot_groups=False):
        groups = {}
        try:
            N = len(matches)
            X = np.zeros((N, 2))
            for m in range(N):
                loc = RFAPI.house_location(matches[m])
                #logging.debug("ClusterHouses({})".format(loc))
                X[m] = (loc[0], loc[1])

            params = {
                'quantile': .3,
                'eps': .15,
                'damping': .9,
                'preference': -5,
                'n_neighbors': 2,
                'n_clusters': 5
            }

            # a bit buggy..
            spectral = cluster.SpectralClustering(
                n_clusters=params['n_clusters'],
                eigen_solver='arpack',
                affinity="nearest_neighbors")

            # best so far!
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                          covariance_type='full')

            # yielded one cluster..
            affinity_propagation = cluster.AffinityPropagation(
                damping=params['damping'], preference=params['preference'])

            bandwidth = cluster.estimate_bandwidth(X,
                                                   quantile=params['quantile'])
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

            algorithm = ms

            algorithm.fit(X)
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)
            for m in range(len(matches)):
                key = str(y_pred[m])
                if groups.get(key) is None:
                    groups[key] = []

                groups[key].append({
                    "address": RFAPI.house_address(matches[m]),
                    "location": [X[m][0], X[m][1]]
                })
            logging.debug("groups = {}".format(groups))
            if plot_groups:
                HouseScore._plot_groups(X, y_pred)
        except Exception as e:
            groups["error"] = str(e)
            logging.error(groups["error"])
        return groups
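
ClusterHouses ultimately runs the MeanShift branch. A standalone sketch of that path, with random 2-D points assumed in place of RFAPI house locations:

import numpy as np
from sklearn import cluster

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 0.05, (20, 2)),
               rng.normal(1.0, 0.05, (20, 2))])

bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
labels = ms.fit_predict(X)  # two well-separated blobs -> two clusters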
Example #5
def get_distances(con, cur=None, compid=0):
    import scipy.sparse as sp
    from sklearn import cluster

    owners = pd.read_sql(f'select * from component where compid={compid}', con)
    ostr = ','.join([str(o) for o in owners['ownerid']])
    omap = {o: i for i, o in enumerate(owners['ownerid'])}
    owners['oid'] = owners['ownerid'].map(omap)
    nown = len(owners)

    pairs = pd.read_sql(
        f'select * from pair where ownerid1 in ({ostr}) or ownerid2 in ({ostr})',
        con)
    pairs['dist'] = pairs.apply(lambda df: affin(df['name1'], df['name2']),
                                axis=1)
    pairs['oid1'] = pairs['ownerid1'].map(omap)
    pairs['oid2'] = pairs['ownerid2'].map(omap)

    dist = pd.DataFrame(
        [(o1, o2) for o1, o2 in product(owners['ownerid'], owners['ownerid'])],
        columns=['ownerid1', 'ownerid2'])
    dist = dist.join(pairs.set_index(['ownerid1', 'ownerid2'])['dist'],
                     on=['ownerid1', 'ownerid2']).fillna(0.0)
    amat = dist['dist'].values.reshape([nown, nown])
    # amat = sp.coo_matrix((pairs['dist'],(pairs['oid1'],pairs['oid2'])))

    # fit = cluster.SpectralClustering(affinity='precomputed').fit(amat)
    fit = cluster.AffinityPropagation(affinity='precomputed').fit(amat)
    # fit = cluster.DBSCAN(metric='precomputed').fit(amat)
    nclust = np.max(fit.labels_) + 1
    cids = [np.nonzero(fit.labels_ == i)[0] for i in range(nclust)]
    cown = [owners[owners['oid'].isin(c)]['ownerid'] for c in cids]
    cname = [get_names(olist=c) for c in cown]

    return owners, pairs, cname
Example #6
def names_clustering(stringVect):
    '''
    Create clusters of most commonly appearing sub-strings and assign them to items passed in.

    Clustering is done on a similarity matrix computed from the input.

    Requires:
        sklearn.AffinityPropagation
        fuzzywuzzy.fuzz

    Input:
        stringVect - vector of strings
    
    Output:
        dfCluster - a dataframe that contains the original stringVect inputs and their associated cluster
    '''

    # Generate the similarity matrix on input
    S = generate_similarity_score_matrix(stringVect)

    # Fit the Affinity Propagation clustering algorithm on similarity matrix, S
    clusters = cluster.AffinityPropagation(affinity='precomputed',
                                           random_state=None).fit_predict(S)

    # Create the output dataframe
    dfCluster = pd.DataFrame(list(zip(stringVect, clusters)),
                             columns=['input_names', 'cluster'])
    return dfCluster
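
generate_similarity_score_matrix is not shown in the source. Given the docstring's fuzzywuzzy dependency, one plausible implementation and a call might look like this (the helper body and the sample names are assumptions, not the original code):

import numpy as np
from fuzzywuzzy import fuzz

def generate_similarity_score_matrix(stringVect):
    # Pairwise fuzz.ratio scores, scaled to [0, 1].
    n = len(stringVect)
    S = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            S[i, j] = fuzz.ratio(stringVect[i], stringVect[j]) / 100.0
    return S

names = ["Acme Corp", "ACME Corporation", "Globex", "Globex Inc"]
dfCluster = names_clustering(names)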
Example #7
def trace_clustering(dataframe, output_path, filename):
    array = np.transpose(dataframe.values)

    clustering_method = 'KMeans'
    if clustering_method == 'AffinityPropagation':
        clustering = cluster.AffinityPropagation().fit(array)
    elif clustering_method == 'KMeans':
        clustering = cluster.KMeans(n_clusters=6).fit(array)
    labels = clustering.labels_
    print(labels)

    for cluster_group_index in np.unique(labels):

        fig = plt.figure()
        ax = fig.gca()
        trace_index_list = np.argwhere(labels == cluster_group_index)
        for count, trace_index in enumerate(trace_index_list):
            trace_length = array[trace_index, :].shape[1]
            trace = array[trace_index, :].reshape((trace_length))
            if count == 0:
                average = trace
            elif count >= 1:
                average = np.mean(np.concatenate(
                    (average, trace)).reshape(2, trace_length),
                                  axis=0)
            ax.plot(np.arange(0, len(trace), 1), trace, 'b')
        ax.plot(np.arange(0, len(average), 1), average, 'r')
        plt.show()
        fig.savefig(output_path + filename + clustering_method + '_' +
                    str(cluster_group_index) + '_cluster.png')
Example #8
def affinity_propagation(similarity_matrix):
    """Perform Affinity Propagation clustering of data.

    Note: This function is a wrapper for AffinityPropagation from scikit-learn.

    Source: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html

    Parameters
    ----------
    similarity_matrix: pandas DataFrame, shape (n_samples, n_samples)
        Matrix of similarities between points.

    Returns
    -------
    clusters: dictionary
        A dictionary of <sample: cluster label> key-value pairs.
    """
    labels = cluster.AffinityPropagation().fit_predict(similarity_matrix)
    n_labels = labels.max()

    clusters = {}
    for i in range(n_labels + 1):
        for neuron in list(similarity_matrix.columns[labels == i]):
            clusters[neuron] = i

    return clusters
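
A usage sketch under the docstring's contract: the wrapper receives a square pandas DataFrame whose columns name the samples. The cosine-similarity construction here is an assumption.

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

features = pd.DataFrame({'n1': [1.0, 0.0], 'n2': [0.9, 0.1],
                         'n3': [0.0, 1.0], 'n4': [0.1, 0.9]})
sim = pd.DataFrame(cosine_similarity(features.T), columns=features.columns)
clusters = affinity_propagation(sim)  # e.g. {'n1': 0, 'n2': 0, 'n3': 1, 'n4': 1}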
Example #9
def investigateOptimalAlgorithms(kmerId, kmerPca):
    plot.setLibrary('bokeh')

    pca = kmerPca.loc[:, PCA_DATA_COL_NAMES]
    plots = {}
    algos = (('KMeans', cluster.KMeans()),
             ('Affinity', cluster.AffinityPropagation()),
             ('MeanShift', cluster.MeanShift()),
             ('Spectral', cluster.SpectralClustering()),
             # Distinct names so both agglomerative variants survive in the
             # plots dict instead of overwriting each other.
             ('AgglomerativeAvg', cluster.AgglomerativeClustering(linkage='average')),
             ('AgglomerativeWard', cluster.AgglomerativeClustering(linkage='ward')),
             ('DBSCAN', cluster.DBSCAN()),
             ('Gaussian', GaussianMixture()))

    ## Visualise data and manually determine which algorithm will be good
    for i, (name, algo) in enumerate(algos, 1):
        labels = _getLabels(algo, pca)
        labels = pd.DataFrame(labels, columns=[CLABEL_COL_NAME])
        kmerDf = pd.concat([kmerId, pca, labels], axis=1)

        dataset = hv.Dataset(kmerDf, PCA_DATA_COL_NAMES)
        scatter = dataset.to(hv.Scatter,
                             PCA_DATA_COL_NAMES,
                             groupby=CLABEL_COL_NAME).overlay()
        scatter.opts(opts.Scatter(size=10, show_legend=True))
        plots[name] = scatter

    plots = hv.HoloMap(plots, kdims='algo')
    plots = plots.collate()
    return plots
Example #10
def Affinity(tfidf, cluster_list):
    affinity = cluster.AffinityPropagation(preference=10).fit(tfidf)
    labels = affinity.labels_
    # result = normalized_mutual_info_score(labels, cluster_list)
    result = v_measure_score(labels, cluster_list)

    print("the Affinity propagation cluster algorithm result is:",result)
Example #11
def groupROIClusters(clusters: List[ROICluster], factor=-1, normalize=True,
                     prefFunc=lambda mat: (np.min(mat) + np.median(mat)) / 2) -> List[List[ROICluster]]:
    def dist(A, B):
        widthA, heightA = A[0], A[1]
        widthB, heightB = B[0], B[1]
        # return (abs(widthA-widthB)+abs(heightA-heightB))/2
        return abs(widthA - widthB) + abs(heightA - heightB)

        # return abs(A.parent.bbox['area'] - B.parent.bbox['area'])
    # afmat = pdist(np.array([[cluster.parent.bbox['area']] for cluster in clusters]), lambda x,y: 2*abs(x-y)/(x+y))
    # afmat = pdist(np.array([[cluster.parent.bbox['area']] for cluster in clusters]), lambda x,y: abs(x-y))
    afmat = pdist(np.array([[cluster.parent.bbox['width'],
                             cluster.parent.bbox['height']]
                            for cluster in clusters]), dist)

    # Negate (and optionally normalize) the distances so they act as similarities.
    # afmat = (-100)*squareform(afmat/np.max(afmat))
    # afmat = (-100)*squareform(afmat)
    if normalize:
        afmat = factor * squareform(afmat / np.max(afmat))
    else:
        afmat = factor * squareform(afmat)

    pref = prefFunc(afmat) if prefFunc else np.min(afmat)
    ap = cluster.AffinityPropagation(affinity='precomputed', preference=pref)
    ap.fit(afmat)
    # allpoints_labels, centers_indices = (ap.labels_, ap.cluster_centers_indices_)

    # Collect the index sets that share a label, then map back to ROIClusters.
    groups = set()
    for label in ap.labels_:
        groups.add(frozenset(i for i, x in enumerate(ap.labels_) if x == label))
    groups = [list(group) for group in groups]
    return [[clusters[i] for i in indexGroup] for indexGroup in groups]
Example #12
def cluster_model(newdata, data, model_name, input_param):
    ds = data
    params = input_param
    name = str.lower(model_name)
    if name == 'kmeans':
        cluster_obj = cluster.KMeans(n_clusters=params['n_clusters'])
    elif name == 'minibatchkmeans':
        cluster_obj = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    elif name == 'spectralclustering':
        cluster_obj = cluster.SpectralClustering(n_clusters=params['n_clusters'])
    elif name == 'meanshift':
        cluster_obj = cluster.MeanShift(bandwidth=params['bandwidth'])
    elif name == 'dbscan':
        cluster_obj = cluster.DBSCAN(eps=params['eps'])
    elif name == 'affinitypropagation':
        cluster_obj = cluster.AffinityPropagation(damping=params['damping'],
                                                  preference=params['preference'])
        cluster_obj.fit(ds)
    elif name == 'birch':
        cluster_obj = cluster.Birch(n_clusters=params['n_clusters'])
    elif name == 'gaussianmixture':
        cluster_obj = mixture.GaussianMixture(n_components=params['n_clusters'],
                                              covariance_type='full')
        cluster_obj.fit(ds)
    else:
        raise ValueError(f'Unknown model name: {model_name}')

    # AffinityPropagation and GaussianMixture were already fitted above.
    if name in ('affinitypropagation', 'gaussianmixture'):
        model_result = cluster_obj.predict(ds)
    else:
        model_result = cluster_obj.fit_predict(ds)

    newdata[model_name] = pd.DataFrame(model_result)

    return newdata
Example #13
def AffinityProp(D, pref, damp):
    aff = cluster.AffinityPropagation(affinity='precomputed',
                                      preference=pref,
                                      damping=damp,
                                      verbose=True)
    labels = aff.fit_predict(D)
    return labels
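
Because the estimator is created with affinity='precomputed', D must already hold similarities. A common convention, sketched here with random data, is to negate a distance matrix and use its median as the preference.

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.RandomState(0).rand(20, 3)
D = -squareform(pdist(X))  # negated distances as similarities
labels = AffinityProp(D, pref=np.median(D), damp=0.9)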
Example #14
def determine_source_locations_instance(r_ref, l_ref, node_events, **kwargs):
    """
    Determines the position in the probability grid that has the highest
    probability of being the position of the source.
    """

    max_vals = determine_source_position_list(r_ref, l_ref, node_events,
                                              **kwargs)

    positions = np.array([p.to_list() for p, _ in max_vals])
    af = clustering.AffinityPropagation().fit(positions)
    max_prob_centers = determine_peaks(max_vals, af.labels_)

    prob_list = [
        position_probability(p.x, p.y, r_ref, l_ref, node_events)
        for p in max_prob_centers
    ]

    ret_list = [
        Location(p, conf) for p, conf in zip(max_prob_centers, prob_list)
    ]

    return ret_list
Example #15
    def _init_model(self, embedding_model=None):
        if embedding_model is None:
            self.load_embeddings_model()
        else:
            self.emb_model = embedding_model

        return cluster.AffinityPropagation(damping=0.9, max_iter=2000, convergence_iter=1000, preference=None,
                                           affinity='precomputed', verbose=True)
Example #16
    def getSortedRowClusters(self, objs):
        '''
        Determine row clusters and their order.

        Clusters that create rows are determined by the user-specified
        algorithm. They are then sorted by location, and lists of indices for
        each cluster are returned in order.
        '''
        if self.row_algorithm == 'affinity':
            algorithm = cluster.AffinityPropagation(**self.row_params)
        elif self.row_algorithm == 'DBSCAN':
            algorithm = cluster.DBSCAN(**self.row_params)
        elif self.row_algorithm == 'MeanShift':
            algorithm = cluster.MeanShift(**self.row_params)
        else:
            raise ValueError(f'Unknown row algorithm: {self.row_algorithm}')

        Y = np.array([[y.baseline] for y in objs], dtype=np.float64)
        rows = algorithm.fit_predict(Y)

        if self.row_algorithm == 'affinity':
            # Here, samples are the found location, so just sort directly.
            row_set = set(rows)

            def ordered_clusters():
                # ABBYY coordinates are bottom-to-top, so reverse list.
                for i in sorted(row_set, reverse=True):
                    yield np.where(rows == i)[0]

            return ordered_clusters(), len(row_set), False

        elif self.row_algorithm == 'DBSCAN':
            # Here, samples are labelled, so go back and find the original
            # locations.
            fuzzy = -1 in rows
            num_clusters = len(set(rows)) - (1 if fuzzy else 0)
            clusters = []
            cluster_centres = np.empty(num_clusters)
            for i in range(num_clusters):
                index = np.where(rows == i)
                clusters.append(index[0])
                cluster_centres[i] = np.mean(np.take(Y, index))

            ordered_clusters = (
                clust
                for centre, clust in sorted(zip(cluster_centres, clusters)))
            return ordered_clusters, num_clusters, fuzzy

        elif self.row_algorithm == 'MeanShift':
            # Here, samples are labelled, but cluster locations are provided.
            fuzzy = -1 in rows
            num_clusters = len(set(rows)) - (1 if fuzzy else 0)
            clusters = []
            for i in range(num_clusters):
                index = np.where(rows == i)
                clusters.append(index[0])

            ordered_clusters = (clust for centre, clust in sorted(
                zip(algorithm.cluster_centers_, clusters)))
            return ordered_clusters, num_clusters, fuzzy
Example #17
def affinitypropagation(pointarrays, candforpre=None, preference=None):
    ap = cluster.AffinityPropagation()
    if candforpre is None:
        ap.fit(array(pointarrays))
        return ap.labels_, ap.cluster_centers_indices_, None
    else:
        ap.fit(array(candforpre))
        labels = ap.fit_predict(array(pointarrays))
        return labels, None, None
Example #18
def use_af(mat, n_cluster):
    clusters = cls.AffinityPropagation(damping=0.99282,
                                       affinity='precomputed').fit(mat)
    n_cluster = max(clusters.labels_) + 1
    hist, bin_edges = np.histogram(clusters.labels_,
                                   bins=np.arange(n_cluster + 1))
    print('Affinity Propagation clustering:', clusters.labels_)
    print(hist)
    return clusters.labels_
Example #19
from sklearn.base import ClusterMixin

def get_algorithm(algorithm_name: str, clusters: int) -> ClusterMixin:
    if algorithm_name == "Birch":
        return cluster.Birch(n_clusters=clusters)
    elif algorithm_name == "Spectral Clustering":
        return cluster.SpectralClustering(n_clusters=clusters)
    elif algorithm_name == 'Affinity Propagation':
        return cluster.AffinityPropagation()
    else:
        raise NotImplementedError(f'algorithm: {algorithm_name} not implemented')
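
Usage sketch (synthetic data assumed). Note that the clusters argument is ignored for Affinity Propagation, which chooses its own number of exemplars.

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=100, centers=4, random_state=0)
algo = get_algorithm('Affinity Propagation', clusters=4)
labels = algo.fit_predict(X)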
Example #20
File: ace.py  Project: dizzyvn/torch-tcav
    def _cluster(self, acts, method='KM', param_dict=None):
        print('Starting clustering with {} for {} activations'.format(
            method, acts.shape[0]))
        if param_dict is None:
            param_dict = {}
        centers = None
        if method == 'KM':
            n_clusters = param_dict.pop('n_clusters', 25)
            km = cluster.KMeans(n_clusters=n_clusters)
            km.fit(acts)
            centers = km.cluster_centers_
            d = np.linalg.norm(np.expand_dims(acts, 1) -
                               np.expand_dims(centers, 0),
                               ord=2,
                               axis=-1)
            asg, cost = np.argmin(d, -1), np.min(d, -1)
        elif method == 'AP':
            damping = param_dict.pop('damping', 0.5)
            ca = cluster.AffinityPropagation(damping=damping)
            ca.fit(acts)
            centers = ca.cluster_centers_
            d = np.linalg.norm(np.expand_dims(acts, 1) -
                               np.expand_dims(centers, 0),
                               ord=2,
                               axis=-1)
            asg, cost = np.argmin(d, -1), np.min(d, -1)
        elif method == 'MS':
            ms = cluster.MeanShift(n_jobs=self.num_workers)
            asg = ms.fit_predict(acts)
        elif method == 'SC':
            n_clusters = param_dict.pop('n_clusters', 25)
            sc = cluster.SpectralClustering(n_clusters=n_clusters,
                                            n_jobs=self.num_workers)
            asg = sc.fit_predict(acts)
        elif method == 'DB':
            eps = param_dict.pop('eps', 0.5)
            min_samples = param_dict.pop('min_samples', 20)
            sc = cluster.DBSCAN(eps=eps, min_samples=min_samples,
                                n_jobs=self.num_workers)
            asg = sc.fit_predict(acts)
        else:
            raise ValueError('Invalid Clustering Method!')
        if centers is None:  ## If clustering did not return centers, use medoids
            centers = np.zeros((asg.max() + 1, acts.shape[1]))
            cost = np.zeros(len(acts))
            for cluster_label in range(asg.max() + 1):
                cluster_idxs = np.where(asg == cluster_label)[0]
                cluster_points = acts[cluster_idxs]
                pw_distances = metrics.euclidean_distances(cluster_points)
                centers[cluster_label] = cluster_points[np.argmin(
                    np.sum(pw_distances, -1))]
                cost[cluster_idxs] = np.linalg.norm(
                    acts[cluster_idxs] -
                    np.expand_dims(centers[cluster_label], 0),
                    ord=2,
                    axis=-1)
        print('Created {} clusters'.format(len(np.unique(asg))))
        return asg, cost, centers
Example #21
def cluster_query(method):
    # Cluster the query data provided by Prof. Yang.
    load_raw_query()
    load_hidden_vector()
    # Check that the counts match.
    if len(documents) != len(hidden_vectors):
        print("Number of log entries does not match number of vectors; please check and retry")
        sys.exit()

    # Normal processing flow from here on.
    print("Building the hidden-vector array")
    t0 = datetime.datetime.now()
    X = np.array([[ele for ele in vector[:-1].split("\t")]
                  for vector in hidden_vectors])
    t1 = datetime.datetime.now()
    print("Elapsed", t1 - t0)

    # print("Normalize the dataset (feature selection)")
    # # normalized dataset for easier parameter selection
    # t0 = datetime.datetime.now()
    # X = StandardScaler().fit_transform(X)
    # t1 = datetime.datetime.now()
    # print("Elapsed", t1 - t0)

    if method == "kmeans":
        print("Starting %s clustering with %d centers" % (method, num_topic))
        algorithm = cluster.MiniBatchKMeans(n_clusters=num_topic)
    elif method == "ap":
        print("Starting %s clustering; number of centers to be determined" % method)
        algorithm = cluster.AffinityPropagation(damping=.5, preference=None)

    t0 = datetime.datetime.now()
    algorithm.fit(X)
    t1 = datetime.datetime.now()
    print("Elapsed", t1 - t0)

    # Write out the results.
    print("Writing records to the results by cluster")
    y_pred = algorithm.labels_.astype(int)

    # Find the largest cluster label.
    maxY = max(y_pred)
    print("Number of clusters", maxY + 1)

    # Records for each cluster.
    topic_result = [[] for i in range(maxY + 1)]
    for i in range(len(documents)):
        topic_result[y_pred[i]].append(documents[i])
    for i in range(len(topic_result)):
        filepath = "%stopic%d.txt" % (kmeans_result_dir, i)
        # print("Writing cluster %d to %s" % (i, filepath))
        with codecs.open(filepath, "w", "utf-8") as f:
            f.write("Cluster %d contains %d records\n" % (i, len(topic_result[i])))
            for line in topic_result[i]:
                f.write(line)
    print("Finished processing clustering results")
Example #22
def clustering(X, algorithm, n_clusters=2):

    X = np.transpose(X)

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=5, include_self=False)

    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # Generate the new colors:
    if algorithm == 'KMeans':
        model = cluster.KMeans(n_clusters=n_clusters, random_state=0)

    elif algorithm == 'Birch':
        model = cluster.Birch(n_clusters=n_clusters)

    elif algorithm == 'DBSCAN':
        model = cluster.DBSCAN(eps=.2)

    elif algorithm == 'AffinityPropagation':
        model = cluster.AffinityPropagation(damping=.9, preference=-200)

    elif algorithm == 'MeanShift':
        model = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)

    elif algorithm == 'SpectralClustering':
        model = cluster.SpectralClustering(n_clusters=n_clusters,
                                           eigen_solver='arpack',
                                           affinity="nearest_neighbors")

    elif algorithm == 'Ward':
        model = cluster.AgglomerativeClustering(n_clusters=n_clusters,
                                                linkage='ward',
                                                connectivity=connectivity)

    elif algorithm == 'AgglomerativeClustering':
        model = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock",
                                                n_clusters=n_clusters,
                                                connectivity=connectivity)

    model.fit(X)

    if hasattr(model, 'labels_'):
        y_pred = model.labels_.astype(int)
    else:
        y_pred = model.predict(X)

    return X, y_pred
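
A call sketch: the function transposes its input, so data is passed as (features, samples) here; the random matrix is an assumption.

import numpy as np

data = np.random.RandomState(0).rand(5, 200)  # 5 features x 200 samples
X_scaled, y_pred = clustering(data, 'AffinityPropagation')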
Example #23
def affinitypropagation(words, querys=None, preference=None):
    ap = cluster.AffinityPropagation(damping=0.6)
    if querys is None:
        ap.fit(array(words))
        return ap.labels_, ap.cluster_centers_indices_, None
    else:
        ap.fit(array(words))
        w_labels = ap.labels_
        labels = ap.fit_predict(array(querys))
        return w_labels, None, labels
Example #24
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    # select clustering algorithm
    if algorithm == 'kmeans':
        algorithm = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        algorithm = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        algorithm = cluster.OPTICS(min_samples=10,
                                   eps=10,
                                   cluster_method='dbscan',
                                   n_jobs=-1)
    elif algorithm == 'birch':
        algorithm = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        algorithm = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                               eigen_solver='arpack',
                                               affinity="nearest_neighbors",
                                               n_jobs=-1)
    elif algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # predict cluster memberships
    algorithm.fit(vectors)
    if hasattr(algorithm, 'labels_'):
        labels = algorithm.labels_.astype(int)
    else:
        labels = algorithm.predict(vectors)

    # transform categorical labels to digits
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    elif isinstance(clusters[0], (int, np.integer)):
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" %
          metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(vectors, labels))

    return labels, algorithm
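
Usage sketch with synthetic blobs and string ground-truth labels (both assumptions), which exercises the LabelEncoder branch:

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=3, random_state=42)
names = ['abc'[label] for label in y]  # one string label per sample
labels, model = compute_clusters(X, names, algorithm='affinity')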
Example #25
def select_n_clusters(data, data_pca, preference_range):
    scores = []
    for preference in preference_range:
        ap = cluster.AffinityPropagation(preference=preference).fit(data_pca)
        score = get_score(data, ap)
        scores.append(score)
    for i, score_function in enumerate(
        ['n_clusters', 'silhouette_score', 'calinski_harabaz_score']):
        plt.subplot(1, 3, i + 1)
        plt.title(score_function)
        plt.plot(preference_range, [item[score_function] for item in scores])
    plt.show()
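
get_score is not shown in the source. One plausible implementation, matching the three keys the plotting loop reads (the body is an assumption; the calinski_harabaz_score spelling is kept because the caller indexes that key):

from sklearn import metrics

def get_score(data, ap):
    labels = ap.labels_
    return {
        'n_clusters': len(set(labels)),
        # Both scores below require at least two clusters.
        'silhouette_score': metrics.silhouette_score(data, labels),
        'calinski_harabaz_score': metrics.calinski_harabasz_score(data, labels),
    }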
Example #26
    def __init__(self, conn, args, data, split_type, num_clusters):
        """Constructor for Cluster object.

        :param conn: database connection object.
        :param args: dict of arguments read from the arguments file.
        :param data: data to cluster.
        :param split_type: Split train test data randomly or by date to allow testing by specific date ranges.
        :param num_clusters: Number of clusters to create.
        :return: Cluster instance.
        """

        self.conn = conn
        self.args = args
        self.data = data
        self.split_type = split_type

        self.pca_model = None
        self.cluster_model = None
        self.algorithm = args['cluster_algorithm']

        # http://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
        hdbsc = hdbscan.HDBSCAN(min_cluster_size=10)
        affinity_propagation = cluster.AffinityPropagation()
        ms = cluster.MeanShift(bin_seeding=True)
        spectral = cluster.SpectralClustering(n_clusters=num_clusters,
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors",
                                              random_state=self.args['seed'])
        ward = cluster.AgglomerativeClustering(n_clusters=num_clusters,
                                               linkage='ward')
        birch = cluster.Birch(n_clusters=num_clusters)
        two_means = cluster.MiniBatchKMeans(n_clusters=num_clusters,
                                            random_state=self.args['seed'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", n_clusters=num_clusters)
        kmeans = cluster.KMeans(n_clusters=num_clusters,
                                random_state=self.args['seed'])
        dbscan = cluster.DBSCAN()

        self.clustering_algorithms = {
            'MiniBatchKMeans': two_means,
            'AffinityPropagation': affinity_propagation,
            'MeanShift': ms,
            'SpectralClustering': spectral,
            'Ward': ward,
            'AgglomerativeClustering': average_linkage,
            'DBSCAN': dbscan,
            'Birch': birch,
            'HDBSCAN': hdbsc,
            'KMeans': kmeans
        }
Example #27
def affinity(fig):
    global X_iris, geo
    ax = fig.add_subplot(geo + 3, projection='3d', title='affinity')
    affinity = cluster.AffinityPropagation(preference=-50)
    affinity.fit(X_iris)
    res = affinity.labels_
    for n, i in enumerate(X_iris):
        ax.scatter(*i[: 3], c='bgrcmyk'[res[n] % 7], marker='o')

    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    return res
Example #28
    def cluster_recogniser(self, corpus):
        corpus_res = {}
        ngram_vectorizer = skfe.text.CountVectorizer(analyzer='char',
                                                     ngram_range=(2, 4))
        counts = ngram_vectorizer.fit_transform(corpus)
        machine = sc.AffinityPropagation()
        list_num = list(machine.fit_predict(counts))
        groups = [[] for i in range(max(list_num) + 1)]
        for i in range(len(corpus)):
            groups[list_num[i]].append(corpus[i])
        for i in groups:
            corpus_res[i[0]] = i
        return corpus_res
Example #29
def AffinityProp(D, pref, damp):
    """
    Perform SKLearn affinity propagation (clustering) with specified data and parameters, returning labels.
    :param D: precomputed affinity (similarity) matrix
    :param pref: preference parameter for the affinity propagation
    :param damp: damping parameter for the affinity propagation
    :return: labels
    """
    aff = cluster.AffinityPropagation(affinity='precomputed',
                                      preference=pref,
                                      damping=damp,
                                      verbose=True)
    labels = aff.fit_predict(D)
    return labels
Example #30
def affinity_propagation(threshold, matrix, taxa, revert=False):
    """
    Compute affinity propagation from the matrix.
    """
    if not taxa:
        taxa = list(range(1, len(matrix) + 1))

    # turn distances to similarities
    matrix = np.array(matrix)

    # iterate over matrix
    for i, line in enumerate(matrix):
        matrix[i][i] = 10
        for j in range(i + 1, len(matrix)):
            score = matrix[i][j]
            if score < threshold:
                matrix[i][j] = -np.log2(1 - score**2)  #-np.log2(score+0.01)
                matrix[j][i] = matrix[i][j]  #score ** 2#-np.log2(score+0.01)
            else:
                matrix[i][j] = -score**5  #0.0
                matrix[j][i] = -score**5  # 0.0

    ap = cluster.AffinityPropagation(affinity='precomputed')
    labels = ap.fit_predict(matrix)

    #centers,labels = cluster.affinity_propagation(
    #        matrix,
    #        affinity='precomputed'
    #        )

    # change to our internal cluster style
    idx = max(labels) + 1
    if idx == 0: idx += 1
    for i, c in enumerate(labels):
        if c == -1:
            labels[i] = idx
            idx += 1

    # check for revert
    if revert:
        return dict(zip(range(len(taxa)), labels))

    # return stuff
    clr = {}
    for i, t in enumerate(taxa):
        try:
            clr[labels[i]] += [t]
        except KeyError:
            clr[labels[i]] = [t]

    return clr
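
A call sketch with a hypothetical 4x4 distance matrix and taxon names; entries below the threshold are rescored as similarities inside the function.

matrix = [[0.0, 0.1, 0.8, 0.9],
          [0.1, 0.0, 0.7, 0.8],
          [0.8, 0.7, 0.0, 0.2],
          [0.9, 0.8, 0.2, 0.0]]
taxa = ['German', 'Dutch', 'Spanish', 'Italian']
clr = affinity_propagation(0.5, matrix, taxa)
# e.g. {0: ['German', 'Dutch'], 1: ['Spanish', 'Italian']}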