Example #1
         labels=np.full((n,),-1)
         for current in reach_distIds:
             # Normally current should be exactly one index larger than pre; a gap larger than one means a different cluster
             if(current-pre!=1):
                 # cluster id + 1
                 clusterId=clusterId+1
             labels[orders[current]]=clusterId
             pre=current
         return labels
     X = np.array(X)
     orders,reach_dists=OPTICS(X,np.inf,O1)
     plotReachability(reach_dists[orders],1)
     labels=extract_dbscan(X,orders,reach_dists,O2)
     plotFeature(X,labels)
 elif suanfa=="BIRCH":
     labels = Birch(n_clusters = B1, threshold=B2, branching_factor=B3).fit_predict(X)
     plt.scatter(X[:, 0], X[:, 1], c=labels)
     plt.title=("BIRTCH Clusters")
     plt.show()
     print("CH指标:", metrics.calinski_harabaz_score(X, labels))
 elif suanfa=="KMeans":
     kmeans = KMeans(n_clusters=K, max_iter=300, n_init=10, init='k-means++', random_state=0)
     labels = kmeans.fit_predict(X)
     y_kmeans=labels
     for i in range(1,len(labels)):
         if y_kmeans[i] == 0:
             plt.scatter(X[i, 0], X[i, 1], s=15, c='red')
         elif y_kmeans[i] == 1:
             plt.scatter(X[i, 0], X[i, 1], s=15, c='blue')
         elif y_kmeans[i] == 2:
             plt.scatter(X[i, 0], X[i, 1], s=15, c='green')
Example #2
def test_birch_params_validation(params, err_type, err_msg):
    """Check the parameters validation in `Birch`."""
    X, _ = make_blobs(n_samples=80, centers=4)
    with pytest.raises(err_type, match=err_msg):
        Birch(**params).fit(X)
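# A hedged sketch of how the validation test above is typically driven with
# pytest.mark.parametrize. The (params, err_type, err_msg) triples below are
# illustrative assumptions, not the exact cases from the original test module.
import pytest
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs


@pytest.mark.parametrize(
    "params, err_type, err_msg",
    [
        # branching_factor must be greater than one in every scikit-learn release
        ({"branching_factor": 1}, ValueError, "(?i)branching_factor"),
        # n_clusters must be an int, None, or a clusterer exposing fit_predict
        ({"n_clusters": "not-a-clusterer"}, ValueError, "n_clusters"),
    ],
)
def test_birch_params_validation_sketch(params, err_type, err_msg):
    X, _ = make_blobs(n_samples=80, centers=4)
    with pytest.raises(err_type, match=err_msg):
        Birch(**params).fit(X)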
Example #3
    rightlist = Counter((x * y)[x * y != 0]).most_common(min(xdivnum, ydivnum))
    right = sum(np.array(rightlist)[:, 1])
    all = len(x)
    return right / all
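# The helper above is shown without its signature; a minimal self-contained
# sketch of the same matching idea follows. The names (x, y, xdivnum, ydivnum)
# are assumptions: two arrays of non-zero cluster labels plus their cluster counts.
import numpy as np
from collections import Counter


def match_accuracy_sketch(x, y, xdivnum, ydivnum):
    x, y = np.asarray(x), np.asarray(y)
    prod = x * y                       # each (true, predicted) label pair maps to one product value
    pairs = Counter(prod[prod != 0]).most_common(min(xdivnum, ydivnum))
    right = sum(count for _, count in pairs)   # samples covered by the most common pairs
    return right / len(x)


# e.g. match_accuracy_sketch([1, 1, 2, 2], [3, 3, 4, 4], 2, 2) -> 1.0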


DataMat = loadDataSet('five_cluster.txt')
cluster_num = 5
X = DataMat[:, [1, 2]]
g_truth = DataMat[:, 0]
# for 'five_cluster.txt': threshold=1.5, branching_factor=20
# for 'spiral.txt': not suitable
# for 'ThreeCircles.txt': not suitable
# for 'Twomoons.txt': not suitable
t0 = time.time()
y_pred = Birch(n_clusters=cluster_num, threshold=1.5,
               branching_factor=20).fit_predict(X)
t1 = time.time()

plt.subplot(211)
plt.suptitle('Clustering by BIRCH', fontsize=16)
plt.scatter(X[:, 0], X[:, 1], s=2, c=np.transpose(g_truth))
plt.subplot(212)
plt.scatter(X[:, 0], X[:, 1], s=2, c=y_pred)
plt.show()
'''
result_accuracy=Coopcheckdiv(a,b)
print('Accuracy Rate Is:')
print(result_accuracy)
'''
print('Processing Time Is:')
print(t1 - t0)
Example #4
y = df2.Decision_Accept

# K-means clustering
kmeans = KMeans(n_clusters=2, random_state=10)
kmeans.fit(X)
#kmeans.cluster_centers_
#kmeans.inertia_
labels_km = kmeans.labels_

#Agglomerative clustering
agg = AgglomerativeClustering(n_clusters=2)
agg.fit(X)
labels_agg = agg.labels_

#Birch clustering
bi = Birch(threshold=0.01, n_clusters=2)
bi.fit(X)
labels_bi = bi.labels_

correct_labels_km = sum(y == labels_km)
correct_labels_agg = sum(y == labels_agg)
correct_labels_bi = sum(y == labels_bi)

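# Note: sum(y == labels) only counts exact id matches, so it depends on which
# numeric id each cluster happens to receive. As a hedged alternative sketch,
# adjusted_rand_score is invariant to label permutations:
from sklearn.metrics import adjusted_rand_score

ari_km = adjusted_rand_score(y, labels_km)
ari_agg = adjusted_rand_score(y, labels_agg)
ari_bi = adjusted_rand_score(y, labels_bi)
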
performance2.loc[len(performance2)] = [
    'kmean', d,
    round(silhouette_score(X, labels_km), 2),
    round((correct_labels_km / y.size), 2)
]
performance2.loc[len(performance2)] = [
    'Agglomerative', d,
    round(silhouette_score(X, labels_agg), 2),
Example #5
def clust_2D_pixels(pixels_df, threshold_cluster=2):
    '''
    Group significant pixels by proximity
    using Birch clustering.

    Parameters
    ----------
    pixels_df : pandas.DataFrame
        a DataFrame with 2 columns: the left one
        holds the column index of a pixel and
        the right one holds the row index of a pixel
    threshold_cluster : int
        clustering radius for Birch clustering,
        derived from a ~40 kb clustering radius
        and the bin size.


    Returns
    -------
    peak_tmp : pandas.DataFrame
        DataFrame with c_row, c_col, c_label, c_size
        columns. row/col are the centroid coordinates,
        label and size are the unique pixel-cluster labels
        and their corresponding cluster sizes.


    Notes
    -----
    TODO: figure out Birch clustering
    CFNodes etc, check if there might
    be some empty subclusters.
    
    '''
    pixels = pixels_df.values
    pix_idx = pixels_df.index
    # clustering object prepare:
    brc = Birch(n_clusters=None, threshold=threshold_cluster)
    # cluster selected pixels ...
    brc.fit(pixels)
    brc.predict(pixels)
    # array of labels assigned to each pixel
    # after clustering: brc.labels_
    # array of (tuples?) with X,Y coordinates
    # for centroids of corresponding clusters:
    # brc.subcluster_centers_
    uniq_labels, inverse_idx, uniq_counts = np.unique(brc.labels_,
                                                      return_inverse=True,
                                                      return_counts=True)
    # cluster sizes taken to match labels:
    clust_sizes = uniq_counts[inverse_idx]
    ####################
    # After discovering a bug ...
    # bug (or misunderstanding, rather):
    # uniq_labels is a subset of brc.subcluster_labels_
    # TODO: dive deeper into Birch ...
    ####################
    # repeat centroids coordinates
    # as many times as there are pixels
    # in each cluster:
    # IN OTHER WORDS (after bug fix):
    # take centroids corresponding to labels:
    centroids = np.take(brc.subcluster_centers_, brc.labels_, axis=0)

    # small message:
    print("Clustering is completed:\n" +
          "there are {} clusters detected\n".format(uniq_counts.size) +
          "mean size {:.6f}+/-{:.6f}\n".format(uniq_counts.mean(),
                                               uniq_counts.std()) +
          "labels and centroids to be reported.")

    # let's create output DataFrame
    peak_tmp = pd.DataFrame(centroids,
                            index=pix_idx,
                            columns=['c_row', 'c_col'])
    # add labels:
    peak_tmp['c_label'] = brc.labels_.astype(int)
    # add cluster sizes:
    peak_tmp['c_size'] = clust_sizes.astype(int)

    return peak_tmp
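# A hedged usage sketch for clust_2D_pixels (not part of the original module):
# a tiny DataFrame of (col, row) pixel coordinates; the threshold value is an
# illustrative assumption, and numpy/pandas are assumed imported as np/pd above.
if __name__ == "__main__":
    demo_pixels = pd.DataFrame(
        {"col": [10, 11, 10, 50, 51], "row": [20, 21, 20, 80, 81]})
    demo_peaks = clust_2D_pixels(demo_pixels, threshold_cluster=2)
    print(demo_peaks)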
Example #6
def Dbscan():
    weight = joblib.load('result/weight.pkl')
    # print(weight)
    fw = open('result/result.txt', 'a', encoding='utf-8')
    fw.write('Birch')
    fw.write('\n')

    clf = Birch(n_clusters=89)
    time_start = time.time()

    s = clf.fit(weight)

    time_run = time.time() - time_start
    print(s)

    # cluster label assigned to each sample
    labels = clf.labels_
    print(labels)
    print(len(set(labels)))

    pred_label = []  # predicted labels for the 2742 documents
    a = {}  # key: cluster label, value: all documents in that cluster
    i = 1
    while i <= len(clf.labels_):
        # print(i, clf.labels_[i - 1])  # document index and its cluster label
        pred_label.append(clf.labels_[i - 1])

        if clf.labels_[i - 1] not in a.keys():
            a[clf.labels_[i - 1]] = []
        a[clf.labels_[i - 1]].append(i)

        i = i + 1
    print(a)
    print('pred_label:', pred_label)

    true_label = []
    file = open('data/Tweets_cluster.txt', 'r')
    for line in file:
        line = line.strip('\n')
        true_label.append(int(line))
    print('true_label:', true_label)

    # performance evaluation: NMI (normalized mutual information)
    nmi = metrics.normalized_mutual_info_score(true_label, pred_label)
    print('NMI: %f' % (nmi))  # NMI approaches 1 for very similar results and 0 for poor results
    print('run time:', time_run)

    fw.write('\t')
    fw.write('NMI: ' + str(nmi) + ' ' + 'run time: ' + str(time_run))

    localtime = time.localtime(
        time.time())  # time.localtime() converts the timestamp to local time
    time_format = time.strftime('%Y-%m-%d %H:%M:%S', localtime)  # format as a readable string
    fw.write(' ' + 'local time: ' + time_format)
    fw.write('\n')
    fw.close()

    ####  dimensionality reduction ####
    pca = PCA(n_components=2)  # reduce to two dimensions
    print('pca:', pca)
    new_weight = pca.fit_transform(weight)  # recompute the weights in 2-D
    print('newWeight:', new_weight)

    # scatter plot: x comes from the first column, y from the second; c=pred_label colors points by the predicted cluster, marker='o' draws them as dots
    plt.scatter(new_weight[:, 0], new_weight[:, 1], c=pred_label, marker='o')
    plt.title('Birch')
    plt.show()
Example #7
    def get_plot(self, parameters):
        qe = QueryExecutor()
        query = self.query.format(
            start=parameters['range'].value[0],
            end=parameters['range'].value[1],
        )
        df = qe.get_result_dataframe(query)

        countries_to_leave = Utils().get_valid_fips_countries(25000)
        df = df[(df['ActorGeo'].isin(countries_to_leave))]

        multi_index = pd.MultiIndex.from_product(
            [
                df['ActorGeo'].unique(), df['Type1'].unique(),
                df['Type2'].unique()
            ],
            names=['ActorGeo', 'Type1', 'Type2'])
        df.index = pd.MultiIndex.from_arrays(
            [df['ActorGeo'], df['Type1'], df['Type2']],
            names=['ActorGeo', 'Type1', 'Type2'])
        df.drop(['ActorGeo', 'Type1', 'Type2'], axis=1, inplace=True)
        df = df.reindex(multi_index).reset_index()
        df.fillna(0., inplace=True)
        df.sort_values(['ActorGeo', 'Type1', 'Type2'], inplace=True)
        df.index = np.arange(df.shape[0])
        df = df[(df.Type1 != 0.0) & (df.Type2 != 0.0)]

        types_no = np.unique(df.groupby('ActorGeo')['AvgTone'].count())[0]
        data = df['AvgTone'].values.reshape((-1, types_no))
        labels = df['ActorGeo'].unique()
        norm_data = (data - np.mean(data, axis=1)[:, np.newaxis]) / np.std(
            data, axis=1)[:, np.newaxis]

        n_clusters = parameters['n_clusters'].value

        if parameters['method'].value == 'agglomerative':
            model = AgglomerativeClustering(
                n_clusters=n_clusters,
                affinity='precomputed',
                linkage='complete',
                compute_full_tree=True,
            )
        elif parameters['method'].value == 'britch':
            model = Birch(branching_factor=5, n_clusters=n_clusters)
        elif parameters['method'].value == 'kmeans':
            model = KMeans(n_clusters=n_clusters)
        elif parameters['method'].value == 'affinity_prop':
            model = AffinityPropagation()

        clusters = model.fit_predict(norm_data)
        cluster_df = pd.DataFrame({
            'country_id': labels,
            'cluster_id': clusters
        })

        cluster_df = cluster_df.join(
            Utils().get_fips_iso_mapping(),
            on=['country_id'],
            how='right',
        ).fillna(-1)
        cluster_df.rename({'ISO': 'country_iso'}, axis=1, inplace=True)
        cluster_df['country_name'] = cluster_df['country_id'].map(
            Utils().get_fips_country_id_to_name_mapping())
        # Uncomment to write result to csv
        # cluster_df.groupby('cluster_id')['country_name'].apply(lambda countries: '; '.join(countries)).to_csv('clustering_result.csv')

        fig = px.choropleth(
            cluster_df,
            locations='country_iso',
            locationmode='ISO-3',
            color='cluster_id',
            hover_name='country_name',
            hover_data=['cluster_id'],
            labels={
                'country_name': 'Country Name',
                'cluster_id': 'Cluster ID'
            },
            color_continuous_scale=px.colors.qualitative.Alphabet,
        )
        return plot(fig, include_plotlyjs=True, output_type='div')
Example #8
 def birch(self):
     #acc is 0.87
     self.model = Birch(n_clusters=self.n_clusters)
Example #9
def Birch_Cluster(np_data, num_clusters):
    """
    Perform birch clustering and return labels
    """
    birch = Birch(n_clusters=num_clusters)
    return birch.fit_predict(np_data)
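# A hedged usage sketch for Birch_Cluster, assuming np_data is a plain
# (n_samples, n_features) array such as the output of make_blobs.
if __name__ == "__main__":
    from sklearn.datasets import make_blobs
    demo_X, _ = make_blobs(n_samples=60, centers=3, random_state=0)
    print(Birch_Cluster(demo_X, 3))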
Example #10
def combination_algorithm(AMDs_train, energy_train, AMDs_test, energy_test,
                          type):
    NUMBER_OF_CLUSTER = 5
    if type == "kmeans_com":
        model = KMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "affinity_com":
        model = AffinityPropagation(damping=0.9,
                                    random_state=5).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "agglomerative_com":
        model = AgglomerativeClustering(n_clusters=NUMBER_OF_CLUSTER)
        y_clusters = model.fit_predict(AMDs_test)
    elif type == "birch_com":
        model = Birch(threshold=0.1,
                      n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "minibatch_com":
        model = MiniBatchKMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "meanshift_com":
        model = MeanShift().fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    else:
        return

    new_energy = []
    new_energy_test = []
    for i in range(NUMBER_OF_CLUSTER):
        if i not in y_clusters:
            print("ERROR: ", i, " is not here")
            continue
        index = 0
        temp_AMDs = []
        temp_energy = []
        for j in model.labels_:
            if i == j:
                temp_AMDs.append(AMDs_train[index])
                temp_energy.append(energy_train[index])
            index += 1

        index = 0
        temp_AMDs_test = []
        temp_energy_test = []
        for j in y_clusters:
            if i == j:
                temp_AMDs_test.append(AMDs_test[index])
                temp_energy_test.append(energy_test[index])
            index += 1

        quadratic_featurizer = PolynomialFeatures(degree=1,
                                                  interaction_only=True)
        X_train_quadratic = quadratic_featurizer.fit_transform(temp_AMDs)
        X_test_quadratic = quadratic_featurizer.fit_transform(temp_AMDs_test)
        model2 = LinearRegression()
        model2.fit(X_train_quadratic, temp_energy)

        temp_energy_pred = model2.predict(X_test_quadratic)

        new_energy.extend(temp_energy_pred)
        new_energy_test.extend(temp_energy_test)

        fig, ax = plt.subplots()
        ax.scatter(temp_energy_test, temp_energy_pred)
        ax.plot([np.min(temp_energy_test),
                 np.max(temp_energy_test)],
                [np.min(temp_energy_test),
                 np.max(temp_energy_test)],
                'k--',
                lw=4)
        ax.set_xlabel('Given')
        ax.set_ylabel('Predicted')
        plt.savefig('./image/combination_algorithm' + str(i) + '.jpg')

    fig, ax = plt.subplots()
    print("R^2 score of the combination algorithm is: ",
          r2_score(new_energy_test, new_energy))
    print("RMSE of the combination algorithm is: ",
          math.sqrt(mean_squared_error(new_energy_test, new_energy)))
    ax.scatter(new_energy_test, new_energy)
    ax.plot([np.min(new_energy_test),
             np.max(new_energy_test)],
            [np.min(new_energy_test),
             np.max(new_energy_test)],
            'k--',
            lw=4)
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')
    plt.savefig('./image/combination_algorithm.jpg')
Example #11
    if cluster_method == 'KMeans':
        cluster_algo = KMeans(n_clusters=num_clusters)
    elif cluster_method == 'AffinityPropagation':
        cluster_algo = AffinityPropagation()
    elif cluster_method == 'MeanShift':
        cluster_algo = MeanShift()
    elif cluster_method == 'SpectralClustering':
        cluster_algo = SpectralClustering(n_clusters=num_clusters)
    elif cluster_method == 'AgglomerativeClustering':
        cluster_algo = AgglomerativeClustering(n_clusters=num_clusters)
    elif cluster_method == 'DBSCAN':
        cluster_algo = DBSCAN()
    elif cluster_method == 'OPTICS':
        cluster_algo = OPTICS()
    elif cluster_method == 'GaussianMixture':
        cluster_algo = GaussianMixture()
    elif cluster_method == 'Birch':
        cluster_algo = Birch(n_clusters=num_clusters)

    data = pd.read_csv(data_path, index_col=None)
    columns = pd.read_csv(columns_path, index_col=None)
    columns = columns['Columns'].values
    columns = np.append(['MSA', 'msa name'], columns).copy()
    data_2 = data[columns]

    cluster_out = cluster_algo.fit_predict(data_2.iloc[:, 2:])
    result = pd.DataFrame(data[['MSA', 'msa name']])
    result['cluster_number'] = cluster_out
    result.to_csv('clusters.csv')
Example #12
    plt.figure()

    # convert sequence to array
    docvecs = []
    for num in range(len(model.docvecs)):
        # print(num)
        # print(model.docvecs[num])
        docvecs.append(np.array(model.docvecs[num]))

    for branching in Parameter.branching_factor:
        silhouette_scores = []
        calinski_scores = []

        for thres in Parameter.threshold:
            Birch_model = Birch(branching_factor=branching,
                                n_clusters=None,
                                threshold=thres,
                                compute_labels=True).fit(docvecs)
            labels = Birch_model.labels_

            silhouette_scores.append(metrics.silhouette_score(docvecs, labels))
            calinski_scores.append(
                metrics.calinski_harabaz_score(docvecs, labels))

        plt.subplot(1, 2, 1)
        plt.plot(Parameter.threshold, silhouette_scores, label=str(branching))
        plt.legend()
        plt.title("silhouette_scores")

        plt.subplot(1, 2, 2)
        plt.plot(Parameter.threshold, calinski_scores, label=str(branching))
        plt.legend()
Example #13
def birch(tfidf_matrix):
    b_cluster = Birch(n_clusters=90, threshold=0.7)
    result = b_cluster.fit_predict(tfidf_matrix)
    return result
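# A hedged usage sketch for birch(): vectorize a toy corpus with TF-IDF and
# cluster it. The corpus here is illustrative; with only a handful of documents
# Birch will warn that it found fewer subclusters than the requested n_clusters=90,
# whereas the real corpus is large enough for that setting.
if __name__ == "__main__":
    from sklearn.feature_extraction.text import TfidfVectorizer
    demo_docs = ["birch clustering example", "kmeans clustering example",
                 "birch tree algorithm", "text clustering with tfidf"]
    demo_tfidf = TfidfVectorizer().fit_transform(demo_docs)
    print(birch(demo_tfidf))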
print('Kmeans')
t0 = tl.time()
kmeans = KMeans(n_clusters=6, random_state=0).fit(X_train)
t1 = tl.time()
print('Training Time', round(t1 - t0, 3), 's')
t0 = tl.time()
pred = kmeans.predict(
    X_test[:10000, :])  #Predicting for one dataspace with 10000 records
t1 = tl.time()
print('Validation Time', round(t1 - t0, 3), 's')
print("-----------*----------------")
from sklearn.cluster import Birch
print('Birch')
t0 = tl.time()
birch = Birch(branching_factor=50,
              n_clusters=None,
              threshold=0.5,
              compute_labels=True).fit(X_train)
t1 = tl.time()
print('Training Time', round(t1 - t0, 3), 's')
t0 = tl.time()
pred = birch.predict(
    X_test[:10000, :])  #Predicting for one dataspace with 10000 records
t1 = tl.time()
print('Validation Time', round(t1 - t0, 3), 's')
print("-----------*----------------")
from sklearn import mixture
print('Gaussian')
t0 = tl.time()
gmm = mixture.GaussianMixture(n_components=5,
                              covariance_type='full').fit(X_train)
t1 = tl.time()
Example #15
 def __init__(self,
              similarity='cosine',
              decay_window=20,
              decay_alpha=0.25,
              clustering='dbscan',
              tagger='twitter',
              useful_tags=[
                  'Noun', 'Verb', 'Adjective', 'Determiner', 'Adverb',
                  'Conjunction', 'Josa', 'PreEomi', 'Eomi', 'Suffix',
                  'Alpha', 'Number'
              ],
              delimiters=['. ', '\n', '.\n'],
              min_token_length=2,
              stopwords=stopwords_ko,
              no_below_word_count=2,
              no_above_word_portion=0.85,
              max_dictionary_size=None,
              min_cluster_size=2,
              similarity_threshold=0.85,
              matrix_smoothing=False,
              n_clusters=None,
              compactify=True,
              **kwargs):
     self.decay_window = decay_window
     self.decay_alpha = decay_alpha
     if similarity == 'cosine':  # very, very slow :(
         self.vectorizer = DictVectorizer()
         self.uniform_sim = self._sim_cosine
     elif similarity == 'jaccard':
         self.uniform_sim = self._sim_jaccard
     elif similarity == 'normalized_cooccurrence':
         self.uniform_sim = self._sim_normalized_cooccurrence
     else:
         raise LexRankError(
             "available similarity functions are: cosine, jaccard, normalized_cooccurrence"
         )
     self.sim = lambda sentence1, sentence2: self.decay(
         sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
     self.factory = SentenceFactory(tagger=tagger,
                                    useful_tags=useful_tags,
                                    delimiters=delimiters,
                                    min_token_length=min_token_length,
                                    stopwords=stopwords,
                                    **kwargs)
     if clustering == 'birch':
         self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
         self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix
                                                                  )
     elif clustering == 'dbscan':
         self._dbscan = DBSCAN()
         self._clusterer = lambda matrix: self._dbscan.fit_predict(1 -
                                                                   matrix)
     elif clustering == 'affinity':
         self._affinity = AffinityPropagation()
         self._clusterer = lambda matrix: self._affinity.fit_predict(1 -
                                                                     matrix)
     elif clustering is None:
         self._clusterer = lambda matrix: [
             0 for index in range(matrix.shape[0])
         ]
     else:
         raise LexRankError(
             "available clustering algorithms are: birch, markov, no-clustering(use `None`)"
         )
     self.no_below_word_count = no_below_word_count
     self.no_above_word_portion = no_above_word_portion
     self.max_dictionary_size = max_dictionary_size
     self.similarity_threshold = similarity_threshold
     self.min_cluster_size = min_cluster_size
     self.matrix_smoothing = matrix_smoothing
     self.compactify = compactify
Example #16
def main(args=None):

    args = parse_arguments().parse_args(args)
    # if args.threads <= 4:
    #     log.error('')
    #     exit(1)
    outputFolder = os.path.dirname(os.path.abspath(args.outFileName)) + '/'

    raw_file_name = os.path.splitext(os.path.basename(args.outFileName))[0]

    if args.numberOfNearestNeighbors is None:
        cooler_obj = cooler.Cooler(args.matrix)
        args.numberOfNearestNeighbors = int(cooler_obj.info['ncells'])
    if args.cell_coloring_type:
        cell_name_cell_type_dict = {}

        cell_type_color_dict = {}
        color_cell_type_dict = {}
        cell_type_counter = 0
        with open(args.cell_coloring_type, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split('    ')
                cell_name_cell_type_dict[cell_name] = cell_type
                if cell_type not in cell_type_color_dict:
                    cell_type_color_dict[cell_type] = cell_type_counter
                    color_cell_type_dict[cell_type_counter] = cell_type
                    cell_type_counter += 1

    if args.cell_coloring_batch:
        cell_name_cell_type_dict_batch = {}

        cell_type_color_dict_batch = {}
        color_cell_type_dict_batch = {}
        cell_type_counter_batch = 0
        with open(args.cell_coloring_batch, 'r') as file:
            for i, line in enumerate(file.readlines()):
                line = line.strip()
                try:
                    cell_name, cell_type = line.split('\t')
                except Exception:
                    cell_name, cell_type = line.split('    ')
                cell_name_cell_type_dict_batch[cell_name] = cell_type
                if cell_type not in cell_type_color_dict_batch:
                    cell_type_color_dict_batch[cell_type] = cell_type_counter_batch
                    color_cell_type_dict_batch[cell_type_counter_batch] = cell_type
                    cell_type_counter_batch += 1

    if args.clusterMethod == 'spectral':
        cluster_object = SpectralClustering(n_clusters=args.numberOfClusters, affinity='nearest_neighbors', n_jobs=args.threads, random_state=0)
    elif args.clusterMethod == 'kmeans':
        cluster_object = KMeans(n_clusters=args.numberOfClusters, random_state=0, n_jobs=args.threads, precompute_distances=True)
    elif args.clusterMethod.startswith('agglomerative'):
        for linkage in ['ward', 'complete', 'average', 'single']:
            if linkage in args.clusterMethod:
                cluster_object = AgglomerativeClustering(n_clusters=args.numberOfClusters, linkage=linkage)
                break
    elif args.clusterMethod == 'birch':
        cluster_object = Birch(n_clusters=args.numberOfClusters)
    else:
        log.error('No valid cluster method given: {}'.format(args.clusterMethod))

    umap_params_dict = {}

    if not args.noUMAP:
        for param in vars(args):
            if 'umap_' in param:
                umap_params_dict[param] = vars(args)[param]
        umap_params_dict['umap_random'] = 42
    # log.debug(umap_params_dict)

    if args.saveMemory:
        matrices_list = cell_name_list(args.matrix)
        max_nnz = 0
        for matrix in matrices_list:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrix)
            nnz = cooler_obj.info['nnz']
            if max_nnz < nnz:
                max_nnz = nnz
        minHash_object = None
        matricesPerRun = int(len(matrices_list) * args.shareOfMatrixToBeTransferred)
        if matricesPerRun < 1:
            matricesPerRun = 1
        chromosome_indices = None
        if args.intraChromosomalContactsOnly:
            cooler_obj = cooler.Cooler(args.matrix + '::' + matrices_list[0])
            binsDataFrame = cooler_obj.bins()[:]
            chromosome_indices = {}
            for chromosome in cooler_obj.chromnames:
                chromosome_indices[chromosome] = np.array(binsDataFrame.index[binsDataFrame['chrom'] == chromosome].tolist())

        for j, i in enumerate(range(0, len(matrices_list), matricesPerRun)):
            if i < len(matrices_list) - 1:
                matrices_share = matrices_list[i:i + matricesPerRun]
            else:
                matrices_share = matrices_list[i:]
            neighborhood_matrix, matrices_list_share = open_and_store_matrix(args.matrix, matrices_share, 0, len(matrices_share),
                                                                             args.chromosomes, args.intraChromosomalContactsOnly, chromosome_indices)
            if minHash_object is None:
                minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads,
                                         shingle_size=0, fast=args.euclideanModeMinHash, maxFeatures=int(max_nnz), absolute_numbers=False)

            if j == 0:
                minHash_object.fit(neighborhood_matrix)
            else:
                minHash_object.partial_fit(X=neighborhood_matrix)

        precomputed_graph = minHash_object.kneighbors_graph(mode='distance')
        precomputed_graph = np.nan_to_num(precomputed_graph)
        precomputed_graph.data[np.isinf(precomputed_graph.data)] = 0
        if not args.noPCA:

            pca = PCA(n_components=min(precomputed_graph.shape) - 1)
            precomputed_graph = np.nan_to_num(precomputed_graph.todense())
            precomputed_graph[np.isinf(precomputed_graph)] = 0
            precomputed_graph = pca.fit_transform(precomputed_graph)

            if args.dimensionsPCA:
                args.dimensionsPCA = min(args.dimensionsPCA, precomputed_graph.shape[0])
                precomputed_graph = precomputed_graph[:, :args.dimensionsPCA]
                # cluster_object.fit(precomputed_graph[:, :args.dimensionsPCA])
        if not args.noUMAP:

            if umap_params_dict is None:
                reducer = umap.UMAP()
            else:
                reducer = umap.UMAP(n_neighbors=umap_params_dict['umap_n_neighbors'], n_components=umap_params_dict['umap_n_components'], metric=umap_params_dict['umap_metric'],
                                    n_epochs=umap_params_dict['umap_n_epochs'],
                                    learning_rate=umap_params_dict['umap_learning_rate'], init=umap_params_dict['umap_init'], min_dist=umap_params_dict['umap_min_dist'], spread=umap_params_dict['umap_spread'],
                                    set_op_mix_ratio=umap_params_dict['umap_set_op_mix_ratio'], local_connectivity=umap_params_dict['umap_local_connectivity'],
                                    repulsion_strength=umap_params_dict['umap_repulsion_strength'], negative_sample_rate=umap_params_dict['umap_negative_sample_rate'], transform_queue_size=umap_params_dict['umap_transform_queue_size'],
                                    a=umap_params_dict['umap_a'], b=umap_params_dict['umap_b'], angular_rp_forest=umap_params_dict['umap_angular_rp_forest'],
                                    target_n_neighbors=umap_params_dict['umap_target_n_neighbors'], target_metric=umap_params_dict['umap_target_metric'],
                                    target_weight=umap_params_dict['umap_target_weight'], random_state=umap_params_dict['umap_random'],
                                    force_approximation_algorithm=umap_params_dict['umap_force_approximation_algorithm'], verbose=umap_params_dict['umap_verbose'], unique=umap_params_dict['umap_unique'])
            precomputed_graph = reducer.fit_transform(precomputed_graph)
        precomputed_graph = np.nan_to_num(precomputed_graph)
        precomputed_graph[np.isinf(precomputed_graph)] = 0

        try:
            cluster_object.fit(precomputed_graph)
        except Exception:
            cluster_object.fit(precomputed_graph.todense())

        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering._precomputed_graph = precomputed_graph

    else:
        neighborhood_matrix, matrices_list = create_csr_matrix_all_cells(args.matrix, args.threads, args.chromosomes, outputFolder, raw_file_name, args.intraChromosomalContactsOnly, pDistance=args.distance)

        if args.saveIntermediateRawMatrix:
            save_npz(args.saveIntermediateRawMatrix, neighborhood_matrix)

    if not args.saveMemory:
        minHash_object = MinHash(n_neighbors=args.numberOfNearestNeighbors, number_of_hash_functions=args.numberOfHashFunctions, number_of_cores=args.threads,
                                 shingle_size=5, fast=args.euclideanModeMinHash, maxFeatures=int(max(neighborhood_matrix.getnnz(1))), absolute_numbers=False, max_bin_size=100000,
                                 minimal_blocks_in_common=100, excess_factor=1, prune_inverse_index=False)
        minHashClustering = MinHashClustering(minHashObject=minHash_object, clusteringObject=cluster_object)
        minHashClustering.fit(X=neighborhood_matrix, pSaveMemory=args.shareOfMatrixToBeTransferred, pPca=(not args.noPCA), pPcaDimensions=args.dimensionsPCA, pUmap=(not args.noUMAP), pUmapDict=umap_params_dict)

    if args.noPCA and args.noUMAP:
        mask = np.isnan(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0

        mask = np.isinf(minHashClustering._precomputed_graph.data)
        minHashClustering._precomputed_graph.data[mask] = 0

    labels_clustering = minHashClustering.predict(minHashClustering._precomputed_graph, pPca=args.noPCA, pPcaDimensions=args.dimensionsPCA)

    if args.createScatterPlot:
        if args.noPCA and args.noUMAP:
            pca = PCA(n_components=min(minHashClustering._precomputed_graph.shape) - 1)
            neighborhood_matrix_knn = pca.fit_transform(minHashClustering._precomputed_graph.todense())
        else:
            neighborhood_matrix_knn = minHashClustering._precomputed_graph

        list(set(labels_clustering))

        colors = process_cmap(args.colorMap)

        try:
            neighborhood_matrix_knn = neighborhood_matrix_knn.toarray()
        except Exception:
            pass

        label_x = 'PC1'
        label_y = 'PC2'
        if not (args.noUMAP):
            label_x = 'UMAP1'
            label_y = 'UMAP2'
        if args.cell_coloring_type:
            if len(colors) < len(cell_type_color_dict):
                log.error('The chosen colormap offers too few values for the number of clusters.')
                exit(1)
            labels_clustering_cell_type = []
            for cell_name in matrices_list:
                labels_clustering_cell_type.append(cell_type_color_dict[cell_name_cell_type_dict[cell_name]])

            labels_clustering_cell_type = np.array(labels_clustering_cell_type)

            log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
            for i, color in enumerate(colors[:len(cell_type_color_dict)]):
                mask = labels_clustering_cell_type == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict[i]), s=20, alpha=0.7)

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

            # compute the overlap between cell types and the found clusters
            computed_clusters = set(labels_clustering)
            cell_type_amounts_dict = {}
            percentage_threshold = 0.8
            if args.latexTable:

                for threshold in [0.7, 0.8, 0.9]:
                    cell_type_amounts_dict[threshold] = {}
                with open(args.latexTable, 'w') as matches_file:
                    header = '\\begin{table}[!htb]\n\\footnotesize\n\\begin{tabular}{|l'
                    body = '\\hline Cluster '
                    for i in range(len(color_cell_type_dict)):
                        mask_cell_type = labels_clustering_cell_type == i
                        header += '|c'
                        body += '& ' + str(color_cell_type_dict[i]) + ' (' + str(np.sum(mask_cell_type)) + ' cells)'
                    header += '|}\n'
                    body += '\\\\\n'
                    # body = ''
                    for i in computed_clusters:
                        body += '\\hline Cluster ' + str(i)
                        mask_computed_clusters = labels_clustering == i
                        body += ' (' + str(np.sum(mask_computed_clusters)) + ' cells)'
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j
                            mask = mask_computed_clusters & mask_cell_type
                            number_of_matches = np.sum(mask)
                            body += '& ' + str(number_of_matches)

                            if number_of_matches != 1:
                                body += ' cells / '
                            else:
                                body += ' cell / '

                            body += '{:.2f}'.format((number_of_matches / np.sum(mask_computed_clusters)) * 100) + ' \\% '
                            for threshold in [0.7, 0.8, 0.9]:

                                if number_of_matches / np.sum(mask_computed_clusters) >= threshold:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] += number_of_matches
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = number_of_matches
                                else:
                                    if color_cell_type_dict[j] in cell_type_amounts_dict[threshold]:
                                        continue
                                    else:
                                        cell_type_amounts_dict[threshold][color_cell_type_dict[j]] = 0
                        body += '\\\\\n'
                    body += '\\hline ' + '&' * len(cell_type_color_dict) + '\\\\\n'

                    for threshold in [0.7, 0.8, 0.9]:
                        body += '\\hline Correctly identified $>{}\\%$'.format(int(threshold * 100))
                        for i in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == i

                            if color_cell_type_dict[i] in cell_type_amounts_dict[threshold]:
                                body += '& ' + str(cell_type_amounts_dict[threshold][color_cell_type_dict[i]]) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '{:.2f}'.format((cell_type_amounts_dict[threshold][color_cell_type_dict[i]] / np.sum(mask_cell_type)) * 100)
                            else:
                                body += '& ' + str(0) + ' / ' + str(np.sum(mask_cell_type)) + ' ('
                                body += '{:.2f}'.format(0 / np.sum(mask_cell_type))

                            body += ' \\%)'
                        body += '\\\\\n'
                    body += '\\hline \n'
                    body += '\\end{tabular}\n\\caption{}\n\\end{table}'

                    matches_file.write(header)
                    matches_file.write(body)
            else:
                with open('matches.txt', 'w') as matches_file:
                    for i in computed_clusters:
                        mask_computed_clusters = labels_clustering == i
                        for j in range(len(cell_type_color_dict)):
                            mask_cell_type = labels_clustering_cell_type == j

                            mask = mask_computed_clusters & mask_cell_type

                            number_of_matches = np.sum(mask)
                            matches_file.write('Computed cluster {} (size: {}) matching with cell type {} (size: {}) {} times. Rate (matches/computed_clusters): {}%\n'.format(
                                i, np.sum(mask_computed_clusters), color_cell_type_dict[j], np.sum(mask_cell_type), number_of_matches, number_of_matches / np.sum(mask_computed_clusters)))

                            if number_of_matches / np.sum(mask_computed_clusters) >= percentage_threshold:
                                if color_cell_type_dict[j] in cell_type_amounts_dict:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] += number_of_matches
                                else:
                                    cell_type_amounts_dict[color_cell_type_dict[j]] = number_of_matches

                        matches_file.write('\n')
            all_detected = 0
            all_possible = 0
            for i in range(len(cell_type_color_dict)):

                mask_cell_type = labels_clustering_cell_type == i
                all_possible += np.sum(mask_cell_type)
                if color_cell_type_dict[i] in cell_type_amounts_dict:
                    all_detected += cell_type_amounts_dict[color_cell_type_dict[i]]
                    cell_type_amounts_dict[color_cell_type_dict[i]] /= np.sum(mask_cell_type)
                else:
                    cell_type_amounts_dict[color_cell_type_dict[i]] = 0.0
            correct_associated = 0.0
            for cell_iterator in cell_type_color_dict:
                correct_associated += cell_type_amounts_dict[cell_iterator]

            correct_associated /= len(cell_type_amounts_dict)

            # all_detected /= all_possible

            # correct_associated = ((correct_associated*4) + (all_detected)) / 5
            # correct_associated = correct_associated

            with open('correct_associated', 'w') as file:
                file.write(str(correct_associated))
        if args.cell_coloring_batch:
            if len(colors) < len(cell_type_color_dict_batch):
                log.error('The chosen colormap offers too few values for the number of clusters.')
                exit(1)
            labels_clustering_cell_type_batch = []
            for cell_name in matrices_list:
                labels_clustering_cell_type_batch.append(cell_type_color_dict_batch[cell_name_cell_type_dict_batch[cell_name]])

            labels_clustering_cell_type_batch = np.array(labels_clustering_cell_type_batch)

            log.debug('labels_clustering_cell_type: {}'.format(len(labels_clustering_cell_type_batch)))
            log.debug('matrices_list: {}'.format(len(matrices_list)))

            plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
            for i, color in enumerate(colors[:len(cell_type_color_dict_batch)]):
                mask = labels_clustering_cell_type_batch == i
                log.debug('plot cluster: {} {}'.format(color_cell_type_dict_batch[i], np.sum(mask)))
                plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(color_cell_type_dict_batch[i]), s=20, alpha=0.7)

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)
            plt.xticks([])
            plt.yticks([])
            plt.xlabel(label_x, fontsize=args.fontsize)
            plt.ylabel(label_y, fontsize=args.fontsize)
            if '.' not in args.createScatterPlot:
                args.createScatterPlot += '.png'
            scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '_cell_color_batch.' + args.createScatterPlot.split('.')[-1]
            plt.tight_layout()
            plt.savefig(scatter_plot_name, dpi=args.dpi)
            plt.close()

        plt.figure(figsize=(args.figuresize[0], args.figuresize[1]))
        for i, color in enumerate(colors[:args.numberOfClusters]):
            mask = labels_clustering == i
            plt.scatter(neighborhood_matrix_knn[:, 0].T[mask], neighborhood_matrix_knn[:, 1].T[mask], color=color, label=str(i), s=20, alpha=0.7)
        plt.legend(fontsize=args.fontsize)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=args.fontsize)

        plt.xticks([])
        plt.yticks([])
        plt.xlabel(label_x, fontsize=args.fontsize)
        plt.ylabel(label_y, fontsize=args.fontsize)
        if '.' not in args.createScatterPlot:
            args.createScatterPlot += '.png'
        scatter_plot_name = '.'.join(args.createScatterPlot.split('.')[:-1]) + '.' + args.createScatterPlot.split('.')[-1]
        plt.tight_layout()
        plt.savefig(scatter_plot_name, dpi=args.dpi)
        plt.close()

    matrices_cluster = list(zip(matrices_list, labels_clustering))
    np.savetxt(args.outFileName, matrices_cluster, fmt="%s")
Example #17
newdata = pca.fit_transform(tfidf_matrix.toarray())


# KMeans clustering
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
start = time.time()
result = km.fit_predict(newdata)
end = time.time()
print("KMeans running time:", end-start)
plt.scatter(newdata[:, 0], newdata[:, 1], c=result)
plt.show()


# DBSCAN clustering
start = time.time()
db = DBSCAN(eps=0.03, min_samples=30).fit_predict(newdata)
end = time.time()
print("DBSCAN running time:", end-start)
plt.scatter(newdata[:, 0], newdata[:, 1], c=db)
plt.show()


# Birch clustering
start = time.time()
result_birch = Birch(n_clusters=5).fit_predict(newdata)
end = time.time()
print("Birch running time:", end-start)
plt.scatter(newdata[:, 0], newdata[:, 1], c=result_birch)
plt.show()
Example #18
def cluster_birch(dataset):

	estimator = Birch(branching_factor=5, threshold=0.5, n_clusters=10, compute_labels=True).fit(dataset)

	infer_results(estimator.predict(dataset), "BIRCH")
def do_birch(ft, nc):
    return Birch(n_clusters=nc).fit(ft).labels_
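# A hedged usage sketch for do_birch: ft is a feature matrix, nc the number of clusters.
if __name__ == "__main__":
    from sklearn.datasets import make_blobs
    demo_ft, _ = make_blobs(n_samples=90, centers=3, random_state=0)
    print(do_birch(demo_ft, 3))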
Example #20
import pickle
from sklearn.cluster import Birch, AffinityPropagation
import os

t = 'keypoint'

with open('../data/' + t + '_all_bicubic_float.dat', 'rb') as ff:
    data_all = pickle.load(ff)

if not os.path.exists('../data/partQuanBirch/' + t +
                      '/sp1_16_bicubic_float_c5'):
    os.mkdir('../data/partQuanBirch/' + t + '/sp1_16_bicubic_float_c5')

#os.mkdir('../data/partQuanBirch/'+t+'/SuperPixle_1')
for i in range(4096):
    gd = data_all[:, i]

    data = gd.reshape(-1, 1)

    brc = Birch(branching_factor=50,
                n_clusters=5,
                threshold=0.01,
                compute_labels=True)
    brc.fit(data)
    labels = brc.predict(data)
    np.save(
        '../data/partQuanBirch/' + t + '/sp1_16_bicubic_float_c5/' + str(i) +
        '_labels_sp1.npy', labels)

    print(str(i) + ' has been done')
Example #21
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import Birch
from sklearn import metrics

from sklearn.datasets.samples_generator import make_blobs
# X holds the sample features and y the sample cluster labels: 1000 samples with 2 features each, in 4 clusters centered at [-1,-1], [0,0], [1,1], [2,2]
X, y = make_blobs(n_samples=1000, n_features=2, centers=[[-1,-1], [0,0], [1,1], [2,2]], cluster_std=[0.4, 0.3, 0.4, 0.3],random_state =9)
plt.scatter(X[:, 0], X[:, 1], marker='o',c=y)
plt.show()


# Birch without a preset number of clusters
y_pred = Birch(n_clusters = None).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.show()
print("CH score:", metrics.calinski_harabaz_score(X, y_pred))


# Birch with the number of clusters set
y_pred = Birch(n_clusters = 4).fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.show()
print("CH score:", metrics.calinski_harabaz_score(X, y_pred))


# try several threshold values together with several branching_factor values
param_grid = {'threshold':[0.5,0.3,0.1],'branching_factor':[50,20,10]}  # parameter dictionary for tuning; the keys must be parameter names of the clustering estimator
for threshold in param_grid['threshold']:
    for branching_factor in param_grid['branching_factor']:
        clf = Birch(n_clusters = 4,threshold=threshold,branching_factor=branching_factor)
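        # Hedged sketch (not part of the original snippet): fit each candidate
        # and report its CH score so the parameter grid can actually be compared.
        # Newer scikit-learn spells the metric calinski_harabasz_score.
        clf.fit(X)
        print("threshold:", threshold, "branching_factor:", branching_factor,
              "CH score:", metrics.calinski_harabaz_score(X, clf.labels_))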
def main():
    # txtTojson()

    # nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

    with open('/Users/mac/Desktop/DM/Tweets.json', 'r') as TweetsFile:
        content = json.load(
            TweetsFile)  #  [{'text': "...", 'cluster': "..."}, {...}, ..]
        tweets = {}  # {0: "...", 1: "...", ...}
        clusters = {}  # {0: 37, 1: 40, ...}    # len = 89

        for i in range(len(content)):
            tweets.update({i: content[i]['text']})
            clusters.update({i: content[i]['cluster']})
            # if content[i]['cluster'] in clusters.keys():
            #     clusters[content[i]['cluster']].append(i)
            # else:
            #     clusters[content[i]['cluster']] = []

    cluster_num = max(list(clusters.values()))  # 110
    tweet_num = len(list(tweets.values()))  #  2472

    # build the vocabulary
    vocab_stem = []
    vocab_tokenized = []

    for i in tweets:
        tokens = tokenize_and_stem(tweets[i])
        vocab_tokenized.append(tokens)

    # TF-IDF: convert the texts into a TF-IDF matrix. First compute term frequencies (TF); IDF (inverse document frequency) gives higher weight to terms that are frequent in some documents but rare in the corpus
    tfidf_vectorizer = TfidfVectorizer(
        stop_words='english',
        use_idf=True,
        tokenizer=tokenize_and_stem,
        # ngram_range=(1, 3)
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(list(tweets.values()))
    # print(tfidf_matrix.shape)    # with ngram_range=(1, 3): (2472, 4448); without: (2472, 29160)

    # terms is the list of features used by the TF-IDF matrix; a range of clustering algorithms can be run on this matrix
    terms = tfidf_vectorizer.get_feature_names()

    # dist = 1-cosine_similarity
    dist = 1 - cosine_similarity(tfidf_matrix)

    # -----------------------------------------KMeans------------------------------------------------------------
    # initialize with a preset number of clusters; assign each document to a cluster so as to minimize the within-cluster sum of squares, recompute each cluster mean as the new centroid, reassign, and iterate until convergence
    # run several times to approach the global optimum; KMeans rarely reaches it in one run
    num_clusters = 89
    km = KMeans(n_clusters=num_clusters).fit(tfidf_matrix)

    km_pre = km.labels_.tolist()

    # print(km.labels_[100:110])  [75 25 18 85 86 19 88 86  3 37]

    # km_result = km.fit_predict(tfidf_matrix)
    # print(km_result)

    labels_true = []
    labels_pred = []
    for i in clusters:
        labels_true.append(clusters[i])
    labels_true = sorted(labels_true)

    for i in km_pre:
        labels_pred.append(km_pre[i])
    labels_pred = sorted(labels_pred)

    km_score = metrics.normalized_mutual_info_score(labels_true, labels_pred)
    print('KMeans NMI: ', km_score)  # 0.7629953372

    X = tfidf_matrix.toarray()
    ms = MeanShift()
    ms_pre = ms.fit_predict(X)
    ms_pre = sorted(ms_pre)
    ms_score = metrics.normalized_mutual_info_score(labels_true, ms_pre)
    print('MeanShift NMI: ', ms_score)  # 0.7056324482

    # -----------------------------------------------------Affinity Propagation----------------------------------------
    ap = AffinityPropagation().fit(tfidf_matrix)
    ap_pre = ap.fit_predict(tfidf_matrix)  #  [195 272 206 ..., 213 137 109]
    ap_pre = sorted(ap_pre)

    ap_score = metrics.normalized_mutual_info_score(labels_true, ap_pre)
    print('AffinityPropagation NMI: ', ap_score)  #  0.775145369374

    # --------------------------------------------------Spectral Clustering---------------------------------------------
    spc = SpectralClustering().fit(tfidf_matrix)
    # spc_pre = spc.fit_predict(tfidf_matrix)
    spc_pre = spc.labels_.tolist()
    spc_pre = sorted(spc_pre)

    spc_score = metrics.normalized_mutual_info_score(labels_true, spc_pre)
    print('SpectralClustering NMI: ', spc_score)  # 0.47384412442

    # -------------------------------------------------Ward Hierarchical clustering-------------------------------------
    ward_hc = AgglomerativeClustering(n_clusters=89, linkage='ward')
    X = tfidf_matrix.toarray()
    ward_hc.fit(X)
    ward_hc_pre = ward_hc.labels_.tolist()

    ward_hc_pre = sorted(ward_hc_pre)
    ward_hc_score = metrics.normalized_mutual_info_score(
        labels_true, ward_hc_pre)
    print('Ward Hierarchical clustering NMI: ',
          ward_hc_score)  #  0.759773200943

    # ------------------------------------------------- AgglomerativeClustering-----------------------------------------
    hc = AgglomerativeClustering(n_clusters=89)
    X = tfidf_matrix.toarray()
    hc.fit(X)
    hc_pre = hc.labels_.tolist()

    hc_pre = sorted(hc_pre)
    hc_score = metrics.normalized_mutual_info_score(labels_true, hc_pre)
    print('AgglomerativeClustering NMI: ', hc_score)  #  0.759773200943

    # ----------------------------------------DBSCAN--------------------------------------------------------------
    X = tfidf_matrix.toarray()
    dbscan_pre = DBSCAN().fit_predict(X)
    dbscan_pre = sorted(dbscan_pre)
    dbscan_score = metrics.normalized_mutual_info_score(
        labels_true, dbscan_pre)
    print('DBSCAN NMI: ', dbscan_score)  #  0.155256389516

    # -------------------------------------------Gaussian mixture models------------------------------------------
    gm = GaussianMixture(n_components=89)
    X = tfidf_matrix.toarray()
    gm.fit(X)
    gm_pre = gm.predict(X)
    gm_pre = sorted(gm_pre)

    gm_score = metrics.normalized_mutual_info_score(labels_true, gm_pre)
    print('Gaussian mixture models NMI: ', gm_score)  # 0.816899648742

    # --------------------------------------------Birch------------------------------------------------------------
    birch = Birch(n_clusters=89)
    X = tfidf_matrix.toarray()
    # birch.fit(X)
    # birch_pre = birch.labels_.tolist()
    birch_pre = birch.fit_predict(X)
    birch_pre = sorted(birch_pre)

    birch_score = metrics.normalized_mutual_info_score(labels_true, birch_pre)
    print('Birch NMI: ', birch_score)  # 0.780857693264
def runClusterer(clusterer_name,params,data,param_scale='',metricstring=''):
    #print('S2 runClusterer>>>')
    from time import time

    #----------------------------------s1 load the data
    #if data[0] is a string, read data[0] and data[1], i.e. the training data and the label location
    if isinstance(data[0],str):
        X,y,size = loadPictureData(data[0],data[1],data[2])
        SX = X
    #otherwise the tuple already holds usable vectors, so each component is stored directly
    else:
        X,SX,y,size = data
    #print('S2 data load done')
    #----------------------------------s2 parameter scaling
    # params: (5,10,) param_scale: (1,100,)
    # true params : (5,0.1,)
    # suggestion: scale the meanshift / dbscan eps down by a factor of 10
    if param_scale != '':
        params = list(params)
        for i in range(0,len(params)):
            params[i] /= param_scale[i]

    #s2 choose the clusterer
    #kmeans: the number of clusters k must be specified
    if clusterer_name == 'kmeans':
        from sklearn.cluster import KMeans
        clusterer = KMeans(init='k-means++', n_clusters=int(params[0]), n_init=10)
        ms = 'sc'
    elif clusterer_name == 'dbscan':        
        from sklearn.cluster import DBSCAN
        # 0.5,10 note!! eps has been scaled down by one order of magnitude!!!
        clusterer = DBSCAN(eps=params[0], min_samples=params[1])
        ms = 'sc'
    #birch: the number of clusters k must be specified
    elif clusterer_name == 'birch':
        # None,0.5,50
        from sklearn.cluster import Birch
        clusterer = Birch(n_clusters = params[0], threshold = params[1], branching_factor = params[2])
        ms = 'sc'
    #optics 
    elif clusterer_name == 'optics':
        from sklearn.cluster import OPTICS
        clusterer = OPTICS(min_samples=int(params[0]))#,xi=params[1],min_cluster_size=params[2])
        #OPTICS(min_samples = 10, xi = 0.05, min_cluster_size = 0.05)
        ms = 'sc'
    #Spectral: the number of clusters k must be specified
    elif clusterer_name == 'spectral':
        pass
        #clusterer = SpectralClustering(n_clusters = params[0], assign_labels = params[1], random_state = params[2])
    elif clusterer_name == 'hierarch':
        from sklearn.cluster import AgglomerativeClustering
        #clusterer = AgglomerativeClustering(n_clusters=params[0],affinity=params[1],linkage=params[2])#'canberra',linkage='complete')
        clusterer = AgglomerativeClustering(n_clusters=int(params[0]), affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='average')#, distance_threshold=None)
        ms = 'sc'
    elif clusterer_name == 'meanshift':
        from sklearn.cluster import MeanShift,estimate_bandwidth
        #0.2,500
        bandwidth = estimate_bandwidth(X, quantile=params[0], n_samples=params[1])
        clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True) 
        ms = 'sc'
    else:
        print('no cluster name specified')
        import sys
        sys.exit(0)

    if metricstring == '':
        metricstring = ms
    #s3 run the clustering
    t0 = time()
    clusterer.fit(X)
    t1 = time()
    
    infoDict = {'clusterer':clusterer,'clusterer_name':clusterer_name,'params':params,'metricstring':metricstring}
    # the clusterer, the strings describing its construction, and the metric-list string
    dataDict = {'X':X,'SX':SX,'y':y,'size':size}
    # dictionary storing the data, all components included
    performanceDict = {'time':t1-t0,'clusters_num':max(clusterer.labels_)+1}
    # dictionary storing performance, starting with runtime and the number of clusters
    clusterer_container = {'info':infoDict ,'data':dataDict,'performance':performanceDict}    
    #print('S4 done.<<<')
    return clusterer_container
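# A hedged usage sketch for runClusterer, assuming the in-memory data form
# (X, SX, y, size) described in the comments above; the blob data and the
# size tuple here are illustrative only.
if __name__ == "__main__":
    from sklearn.datasets import make_blobs
    demo_X, demo_y = make_blobs(n_samples=150, centers=3, random_state=0)
    demo_container = runClusterer('birch', (3, 0.5, 50), (demo_X, demo_X, demo_y, (10, 10)))
    print(demo_container['performance'])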
Example #24
    ax.legend()

    plt.show()

    # Perform a K-Means clustering
    km = KMeans(n_clusters=nb_clusters, random_state=1000)
    Y_pred_km = km.fit_predict(X)

    print('Adjusted Rand score: {}'.format(adjusted_rand_score(Y, Y_pred_km)))

    # Perform the online clustering
    mbkm = MiniBatchKMeans(n_clusters=nb_clusters,
                           batch_size=batch_size,
                           reassignment_ratio=0.001,
                           random_state=1000)
    birch = Birch(n_clusters=nb_clusters, threshold=0.2, branching_factor=350)

    scores_mbkm = []
    scores_birch = []

    for i in range(0, nb_samples, batch_size):
        X_batch, Y_batch = X[i:i + batch_size], Y[i:i + batch_size]

        mbkm.partial_fit(X_batch)
        birch.partial_fit(X_batch)

        scores_mbkm.append(
            adjusted_rand_score(Y[:i + batch_size],
                                mbkm.predict(X[:i + batch_size])))
        scores_birch.append(
            adjusted_rand_score(Y[:i + batch_size],
Example #25
def test_birch_n_clusters_long_int():
    # Check that birch supports n_clusters with np.int64 dtype, for instance
    # coming from np.arange. #16484
    X, _ = make_blobs(random_state=0)
    n_clusters = np.int64(5)
    Birch(n_clusters=n_clusters).fit(X)
Example #26
def test_subcluster_dtype(global_dtype):
    X = make_blobs(n_samples=80, n_features=4,
                   random_state=0)[0].astype(global_dtype, copy=False)
    brc = Birch(n_clusters=4)
    assert brc.fit(X).subcluster_centers_.dtype == global_dtype
Example #27
    for row in csv_reader:
        if line_count == 0:
            print(f'Column names are {", ".join(row)}')
            line_count += 1
        else:
            print(row[1], row[2])
            words.append(row[2])
            etichette.append(row[1])
            line_count += 1

    print(f'Processed {line_count} lines.')

# Create word embeddings
word_embeddings = c2v_model.vectorize_words(words)
print(word_embeddings)

brc = Birch(n_clusters=n_cluster)
brc.fit(word_embeddings)
labels = brc.predict(word_embeddings)

print(labels)

generateCSV(labels, etichette)

#pca(labels)

sys.argv = ['./readClusterDataCopia.py', n_cluster, embedding, 'BIRCH']

exec(open("./readClusterDataCopia.py").read())

print("eseguito")
## generate synthetic data
xx = np.linspace(-22, 22, 10)
yy = np.linspace(-22, 22, 10)
xx, yy = np.meshgrid(xx, yy)
n_centres = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:,
                                                                 np.newaxis]))
#generate a Gaussian dataset of 100,000 samples with 2 features and 100 classes
X, y = make_blobs(n_samples=100000,
                  n_features=2,
                  centers=n_centres,
                  random_state=28)

#build Birch hierarchical clusterers with different parameters (cluster diameters)
birch_models = [
    Birch(threshold=1.7, n_clusters=None),  #the models to run
    Birch(threshold=0.5, n_clusters=None),
    Birch(threshold=1.7, n_clusters=100)
]
#threshold: threshold on the subcluster diameter,    branching_factor: maximum number of subclusters per node

#other parameters can also be tried, e.g. different branching_factor values, to compare the clustering results
## plotting
final_step = [
    u'diameter=1.7;n_clusters=None', u'diameter=0.5;n_clusters=None',
    u'diameter=1.7;n_clusters=100'
]

plt.figure(figsize=(12, 8), facecolor='w')
plt.subplots_adjust(left=0.02, right=0.98, bottom=0.1, top=0.9)
colors_ = cycle(colors.cnames.keys())
Example #29
labels = ms.labels_
X["Cluster3"] = labels
print(pd.crosstab(X["Cluster3"], X["Target"]))

from sklearn.cluster import MeanShift, estimate_bandwidth

bandwidth = estimate_bandwidth(_X, quantile=0.05, n_samples=300, n_jobs=-1)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(_X)
labels = ms.labels_
X["Cluster4"] = labels
print(pd.crosstab(X["Cluster4"], X["Target"]))

from sklearn.cluster import Birch

birch = Birch(n_clusters=40)
labels = birch.fit_predict(_X)
X["Cluster5"] = labels
print(pd.crosstab(X["Cluster5"], X["Target"]))

__X = X[[
    "Starid", "Cluster1", "Cluster2", "Cluster3", "Cluster4", "Cluster5",
    "Target"
]]
__X["K"] = [
    f"{k1}-{k2}-{k3}-{k4}-{k5}" for k1, k2, k3, k4, k5 in zip(
        __X["Cluster1"], __X["Cluster2"], __X["Cluster3"], __X["Cluster4"],
        __X["Cluster5"])
]
rule = pd.crosstab(__X["K"], __X["Target"])
print(rule.head(10))
Example #30
import matplotlib.colors as colors

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import Birch
from sklearn.datasets.samples_generator import make_blobs

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

xx = np.linspace(-22, 22, 10)
yy = np.linspace(-22, 22, 10)
xx, yy = np.meshgrid(xx, yy)

n_centers = np.hstack((np.ravel(xx)[:, np.newaxis], np.ravel(yy)[:, np.newaxis]))
X, y = make_blobs(n_samples= 1000, n_features=2, centers=n_centers, random_state=28)
birch_models = [Birch(threshold=1.7, n_clusters=None), Birch(threshold=0.5, n_clusters=None), Birch(threshold=1.7, n_clusters=100)]

final_step = [u'diameter=1.7;n_clusters=None',u'diameter=0.5;n_clusters=None',u'diameter=1.7;n_clusters=100']

plt.figure(figsize=(12, 8), facecolor='w')
plt.subplots_adjust(left=0.02, right=0.98, bottom=0.1, top=0.9)
colors_ = cycle(colors.cnames.keys())
cm = mpl.colors.ListedColormap(colors.cnames.keys())

for ind, (birch_model, info) in enumerate(zip(birch_models, final_step)):
    t = time()
    birch_model.fit(X)
    time_ = time() - t
    
    labels = birch_model.labels_
    centroids = birch_model.subcluster_centers_