    def get_result_clustering_by_ap(self, text_list):
        cluster = Cluster()
        Xn, text_vector_list = cluster.get_text_data(text_list)

        x_sims, max_sim, min_sim, mid_sim = cluster.get_data_sims(Xn)
        # pairwise similarity matrix
        print('x_sims:', x_sims)

        text_clustering_list = []
        if min_sim > max_clusters_sim:
            # even the least similar pair exceeds the threshold, so all texts form one cluster
            for i in range(len(text_vector_list)):
                text_id = text_vector_list[i].get("text_id")
                text_clustering_list.append({
                    "text_id": text_id,
                    "class_num": "0"
                })
        else:
            ap = AP(
                damping=apdampling,
                max_iter=1000,
                convergence_iter=100,
                preference=mid_sim,  # how similar points must be before they merge into one cluster
                affinity="precomputed").fit(x_sims)
            # labels: the clustering result, one cluster id per text
            labels = ap.labels_
            monitor_set = set()
            if -1 not in labels:
                for i in range(len(labels)):
                    monitor_set.add(labels[i])
                    class_num = str(labels[i])
                    text_id = text_vector_list[i].get("text_id")
                    text_clustering_list.append({
                        "text_id": text_id,
                        "class_num": class_num
                    })
                return text_clustering_list
            else:
                # AP failed to converge (all labels are -1); refit with the
                # default damping of 0.5 and use the new labels
                adapt_ap_damping = 0.5
                ap = AP(max_iter=1000,
                        convergence_iter=100,
                        preference=mid_sim,
                        affinity="precomputed",
                        damping=adapt_ap_damping).fit(x_sims)
                labels = ap.labels_
                for i in range(len(labels)):
                    monitor_set.add(labels[i])
                    class_num = str(labels[i])
                    text_id = text_vector_list[i].get("text_id")
                    text_clustering_list.append({
                        "text_id": text_id,
                        "class_num": class_num
                    })
                return text_clustering_list
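For reference, a minimal self-contained sketch of the precomputed-similarity pattern this example uses; the TF-IDF vectorizer and the toy texts list are illustrative assumptions, not part of the original code:

import numpy as np
from sklearn.cluster import AffinityPropagation as AP
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["cats purr", "dogs bark", "cats meow", "dogs howl"]  # hypothetical corpus
x_sims = cosine_similarity(TfidfVectorizer().fit_transform(texts))  # pairwise similarities
ap = AP(damping=0.9, max_iter=1000, convergence_iter=100,
        preference=np.median(x_sims), affinity="precomputed").fit(x_sims)
print(ap.labels_)  # one cluster id per text; all -1 if AP failed to converge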
Example #2
def make_cluster_map(damping=0.992):
    test_labels, prediction = pickle.load(open(f_path_pred, 'rb'))
    prob_conf = np.zeros((121, 121))  # mean predicted class distribution per true class
    for l in range(121):
        inds = np.squeeze(np.array(np.where(test_labels == l)))
        class_conf = prediction[inds, :].mean(axis=0)
        prob_conf[l, :] = class_conf
    F = prob_conf
    D = (1 - F)            # turn the soft confusion matrix into a dissimilarity
    np.fill_diagonal(D, 0)
    D_p = 0.5 * (D + D.T)  # symmetrize

    clst = AP(
        damping=damping,  # damping determines # of clusters
        max_iter=500,
        convergence_iter=15,
        affinity='euclidean',
        verbose=False)
    clst.fit(D_p)
    print('Number of clusters:', len(clst.cluster_centers_))
    membership = np.c_[range(121), clst.labels_]

    fine_to_coarse = dict(membership)
    coarse_to_fine = {l: [] for l in clst.labels_}
    for k, v in fine_to_coarse.items():
        coarse_to_fine[v].append(k)

    pickle.dump(coarse_to_fine,
                open(os.path.join(curdir, 'coarse_to_fine.p'), 'wb'))
    pickle.dump(fine_to_coarse,
                open(os.path.join(curdir, 'fine_to_coarse.p'), 'wb'))
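Note that with affinity='euclidean' the rows of D_p are treated as 121-dimensional feature vectors rather than as pairwise distances; clustering on the distances themselves would require passing a similarity matrix (e.g. -D_p) with affinity='precomputed'.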
Example #3
    def get_fitted_affinity(self, user_to_rwf, distance):
        aff_prop = AP(affinity='precomputed')

        similarity_mtx = np.zeros((len(user_to_rwf), len(user_to_rwf)))
        sim = cosine_sim if distance == 'cosine' else word_overlap
        for i, user1 in enumerate(user_to_rwf):
            for j, user2 in enumerate(user_to_rwf):
                similarity_mtx[i][j] = sim(user_to_rwf[user1],
                                           user_to_rwf[user2])

        # put the median similarity on the diagonal, i.e. use it as the preference
        median = np.median(similarity_mtx)

        for i in range(len(user_to_rwf)):
            similarity_mtx[i][i] = median

        return aff_prop.fit(similarity_mtx)
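Filling the diagonal with the median similarity reproduces scikit-learn's default behavior: when preference is not given, AffinityPropagation uses the median of the input similarities, and larger preference values yield more clusters.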
Example #4
    def create_stratum(self, column_names, **kwargs):
        '''
        Use affinity propagation to find the number of strata for each column.
        column_names is a list of the covariates to be split into strata and
        used for classification. For each such column, this function adds a
        column named column_name_strata to the data frame, giving the strata
        designation for that variable. The whole data frame is returned.
        '''

        for colname in column_names:
            X = self.data[colname].values.reshape(-1, 1)  # .values: Series.reshape no longer exists in pandas

            if np.isnan(X).any():
                raise ValueError(
                    "There are NaN values in self.data[%s] that the "
                    "clustering algorithm can't handle" % colname)

            elif np.unique(self.data[colname]).shape[0] <= 2:
                string_name = colname + '_strata'
                self.data[string_name] = self.data[colname].astype(int)

            else:
                af_model = AP(damping=0.9)
                strata_groups = af_model.fit(X)

                #cluster_centers_indices = af.cluster_centers_indices_
                #n_clusters_ = len(cluster_centers_indices)

                string_name = colname + '_strata'
                self.data[string_name] = strata_groups.labels_

        return self.data
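A standalone sketch of the same per-column pattern, with a toy DataFrame standing in for self.data (the age column and its values are assumptions for illustration):

import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation as AP

data = pd.DataFrame({"age": np.random.default_rng(0).integers(20, 70, size=50)})
X = data["age"].values.reshape(-1, 1)                # one covariate as a column vector
data["age_strata"] = AP(damping=0.9).fit(X).labels_  # one stratum label per row
print(data["age_strata"].nunique(), "strata found")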
Example #5
def ap_cluster_and_plot(row_number, row):
    filename = str(row_number) + "-" + str(row[0]) + "-color" + ".png"
    #matrix = np.matrix(np.array(row[1:])).reshape(28,28)
    #plot(filename, matrix)
    data_entry = row_to_data(row[1:])
    af = AP(verbose=True, damping=0.5, max_iter=1000).fit(data_entry)
    al = af.labels_
    ac = af.cluster_centers_indices_
    print "Row:",row_number," Digit:",row[0],"Clusters: ", len(ac), ":", len(al)
    plot_color(filename, data_entry, al)
Example #6
def propagacion_de_afinidad(T):
    # the only thing we choose ourselves is the affinity; with
    # affinity='precomputed' we would pass our own precomputed
    # similarity matrix to fit instead of T
    model = AP(affinity='euclidean')
    model.fit(T)  # compute the distances and find the exemplars (centroids)
    targets = model.predict(T)  # assign every data point to its cluster
    return targets
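Note: model.predict only works when affinity='euclidean', since it assigns points to the stored cluster_centers_; a model fitted with affinity='precomputed' exposes labels_ and cluster_centers_indices_ but cannot label new data.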
Example #7
def find_number_of_sources(cosine_distance):
    cos_dist = np.resize(cosine_distance,
                         new_shape=(len(cosine_distance),
                                    len(cosine_distance)))
    ap = AP(affinity='precomputed').fit(cos_dist)
    # most_common() sorts clusters by size; count those tied with the largest
    counter = Counter(ap.labels_).most_common()
    source = 0
    for i in range(len(counter)):
        if counter[i][1] == counter[0][1]:
            source += 1
    return source
Example #8
    def __affinityPropagation(self):
        print("Affinity Propagation Clustering on PCA-reduced data")
        reduced_data = PCA(n_components=2).fit_transform(self.learn[2])
        af = AP().fit(reduced_data)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_

        # print a copy command for each image whose file is a cluster exemplar
        for obj in self.images:
            for key, value in obj.items():
                if value[key] in cluster_centers_indices:
                    print("cp " + key + " ../APClustersImagesFile/")

        self.affPropResult = af
        self.affPropData = reduced_data

        n_clusters_ = len(cluster_centers_indices)

        plt.figure(1)
        plt.clf()

        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        for k, col in zip(range(n_clusters_), colors):
            class_members = labels == k
            cluster_center = reduced_data[cluster_centers_indices[k]]
            plt.plot(reduced_data[class_members, 0],
                     reduced_data[class_members, 1], col + '.')
            plt.plot(cluster_center[0],
                     cluster_center[1],
                     'o',
                     markerfacecolor=col,
                     markeredgecolor='k',
                     markersize=14)
            for x in reduced_data[class_members]:
                plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]],
                         col)

        plt.title(
            'Affinity Propagation clustering on the digit-9 dataset (PCA-reduced data)'
        )
        path = self.path + '/ClusteringImages/AffinityPropagation/Number9.png'
        plt.savefig(path)
Example #9
import random
import numpy as np
import scipy
from sklearn.cluster import KMeans, SpectralClustering, AffinityPropagation as AP
from sklearn.decomposition import PCA, FastICA, NMF, FactorAnalysis as FA
from sklearn.mixture import GMM  # pre-0.20 scikit-learn API; see the note after this example
# mld is an external helper module supplying getData() and createMLData()

random.seed(1234)
HOLDING_PERIOD = 20
d = mld.getData()
X, y, FeatureNames, returns, actual_y = mld.createMLData(HOLDING_PERIOD)

clusters = [
    "KMeans", "GMM", "SpectralClustering", "AffinityProp", "PCA", "ICA", "NMF"
]
n = 8
clustering = np.array([
    KMeans(n_clusters=n, max_iter=500),
    GMM(n_components=n, n_iter=1000, n_init=15, random_state=1),
    SpectralClustering(n_clusters=n),
    AP(damping=0.5, max_iter=200),
])
num = 10
reduction = np.array([
    PCA(n_components=num),
    FastICA(n_components=num),
    NMF(n_components=num, max_iter=500),
    FA(n_components=num, max_iter=500)
])

# Define classifier
class_id = 0
red_id = 3
clf = clustering[class_id]
red = reduction[red_id]
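Note: GMM here follows the pre-0.20 scikit-learn API; sklearn.mixture.GMM was removed in 0.20, and the modern equivalent is GaussianMixture(n_components=n, max_iter=1000, n_init=15, random_state=1).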
Example #10
    logging.info("RMSD calculation time is %s" % t)
    t = dt.datetime.now()

    logging.info("Starting clusterization")
    #Now clusterization routine
    from sklearn.cluster import AffinityPropagation as AP
    #
    #S = np.memmap('aff.raw', dtype='float32', shape=(ln, ln), mode='write')
    ##Prepare values
    #S = -1 * matrix ** 2
    ##Calc average for logfile
    #d_avg = np.mean(cl_matrix)
    #
    #
    ##Convert all values to float
    af = AP(affinity="precomputed", max_iter=2000,
            convergence_iter=50).fit(cl_matrix)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    n_clusters_ = len(cluster_centers_indices)

    with open('aff.res', 'w') as out:
        for i in range(ln):
            out.write("%s\t%d\n" % (pdb_list[i], labels[i]))
    with open('aff.ref', 'w') as out:
        for i in cluster_centers_indices:
            out.write('%s\n' % pdb_list[i])

    logging.info("Info about run:")
    #logging.info('%s: %s' % ("Average distance", d_avg))
    logging.info('%s: %d' % ("Number of clusters", n_clusters_))
    logging.info('%s\t%s' % ("Center of cluster", "Number of cluster"))