Example #1
import numpy as np
from numpy.testing import assert_allclose
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

def test_transform_match_across_dtypes():
    # fit_transform should agree across float64 and float32 inputs
    X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
    brc = Birch(n_clusters=4)
    Y_64 = brc.fit_transform(X)
    Y_32 = brc.fit_transform(X.astype(np.float32))

    assert_allclose(Y_64, Y_32, atol=1e-6)
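For context, Birch's transform maps each sample to its distances from the learned subcluster centroids, so the test above is comparing those distance matrices. A minimal sketch of the shapes involved (toy data, illustrative only):

import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=80, n_features=4, random_state=0)
brc = Birch(n_clusters=4).fit(X)

D = brc.transform(X)
# one column per subcluster centroid, not per final cluster
print(D.shape, brc.subcluster_centers_.shape)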
Example #2
import numpy as np
import pandas as pd
from sklearn.cluster import Birch, DBSCAN, KMeans
from sklearn.manifold import MDS, TSNE
from sklearn.preprocessing import StandardScaler

def getClusters(dt_all, cols_cat):
    # cols
    #    cols_encode_label = dt_all.filter(regex = "Encode_Label").columns.values.tolist()
    # 'X232' appeared twice in the original list; the duplicate is dropped
    cols_tsne = [
        'X118', 'X127', 'X47', 'X315', 'X311', 'X179', 'X314', 'X232', 'X29',
        'X261'
    ]

    # standardize
    dt_all_norm = StandardScaler().fit_transform(dt_all[cols_tsne])

    n_comp_tsne = 2

    # t-SNE embedding of the standardized columns
    tsne = TSNE(n_components=n_comp_tsne, random_state=2016, perplexity=50,
                verbose=2)
    tsne_result = tsne.fit_transform(dt_all_norm)
    dt_tsne = pd.DataFrame({"x1": tsne_result[:, 0], "x2": tsne_result[:, 1]})
    dt_tsne = StandardScaler().fit_transform(dt_tsne)

    # MDS embedding
    mds = MDS(n_components=n_comp_tsne, random_state=888)
    mds_result = mds.fit_transform(dt_all_norm)

    # Birch: fit_transform returns distances to every subcluster centroid;
    # only the first n_clusters_birch columns are kept below
    n_clusters_birch = 2
    birch = Birch(n_clusters=n_clusters_birch)
    birch_result = birch.fit_transform(dt_all_norm)

    # kmeans
    kmeans = KMeans(n_clusters=4, random_state=0).fit(dt_tsne)

    # DBSCAN
    dbscan = DBSCAN(eps=0.196, min_samples=100).fit(dt_tsne)

    # Append decomposition components to datasets
    for i in range(1, n_comp_tsne + 1):
        dt_all['CL_TSNE_' + str(i)] = tsne_result[:, i - 1]
        dt_all['CL_MDS_' + str(i)] = mds_result[:, i - 1]

    for i in range(1, n_clusters_birch + 1):
        dt_all['CL_BIRCH_' + str(i)] = birch_result[:, i - 1]

    for i in np.unique(kmeans.labels_):
        x = kmeans.labels_ == i
        x = x.astype("int64")
        dt_all['CL_Kmeans_' + str(i)] = x

    for i in np.unique(dbscan.labels_):
        x = dbscan.labels_ == i
        x = x.astype("int64")
        dt_all['CL_DBSCAN_' + str(i)] = x

    return dt_all
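A minimal invocation sketch, assuming a toy frame that happens to contain the hard-coded X-columns (note that cols_cat is accepted but never used by the function):

import numpy as np
import pandas as pd

cols = ['X118', 'X127', 'X47', 'X315', 'X311', 'X179', 'X314', 'X232',
        'X29', 'X261']
rng = np.random.default_rng(0)
dt_toy = pd.DataFrame(rng.normal(size=(300, len(cols))), columns=cols)

dt_out = getClusters(dt_toy, cols_cat=[])
# CL_TSNE_*, CL_MDS_*, CL_BIRCH_* plus one indicator column per
# KMeans/DBSCAN label are appended in place
print([c for c in dt_out.columns if c.startswith('CL_')])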
Example #3
from sklearn.cluster import Birch

def birch_clustering(values, branching_factor=50, threshold=0.5):
    """
    Clusters the input using the Birch algorithm.

    :param values: samples to cluster
    :type values: array-like of shape (n_samples, n_features)
    :param branching_factor: maximum number of CF subclusters in each node
    :type branching_factor: int
    :param threshold: radius threshold for merging subclusters; the
        default of 0.5 is fairly high
    :type threshold: float
    :return: list of [labels, subcluster centroids, fitted estimator,
        transformed input]
    :rtype: list
    """
    birchc = Birch(branching_factor=branching_factor,
                   n_clusters=None,
                   threshold=threshold,
                   compute_labels=True)
    x_new = birchc.fit_transform(values)
    labels = birchc.labels_
    subc_centroids = birchc.subcluster_centers_
    return [labels, subc_centroids, birchc, x_new]
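A quick usage sketch on synthetic blobs (the data and the threshold value are illustrative, not from the original):

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
labels, centroids, model, X_dist = birch_clustering(X, threshold=1.0)

# with n_clusters=None every subcluster becomes its own label
print("subclusters found:", centroids.shape[0])
print("distance-space shape:", X_dist.shape)  # (n_samples, n_subclusters)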
Example #4
from collections import Counter

import numpy as np
import lda
from sklearn import mixture
from sklearn.cluster import Birch, KMeans, MiniBatchKMeans

# N_CLUSTERS and calc_distance are assumed to be defined elsewhere
# in the original module.

def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  # for now, do hard clustering: take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]  # was the undefined name 'clusters_centers'
        all_dist = []
        for line_idx in range(len(df_array)):
            label = model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx], dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index, res)
    return docs_clusteres
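A minimal driver sketch for the function above, assuming hypothetical stand-ins for the module-level N_CLUSTERS and calc_distance that the snippet references but does not define:

import numpy as np
import pandas as pd

N_CLUSTERS = 3  # assumed value; not given in the original

def calc_distance(a, b):
    # assumed Euclidean helper; the original implementation is not shown
    return np.linalg.norm(a - b)

df_toy = pd.DataFrame(np.random.RandomState(1).rand(100, 5))
pairs = list(build_model(df_toy, cluster_type="kmeans"))
print(pairs[:3])  # (index, cluster-label) tuples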
Example #5
#df_new.fillna(df_new.mean())
#X = StandardScaler().fit_transform(df_new)


import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import Birch

# df_new is assumed to be defined in an earlier cell.
# Compute DBSCAN
#db = DBSCAN(eps=.8, min_samples=10).fit(X)
# normalize the data
min_max_scalar = preprocessing.MinMaxScaler()
x_scaled = min_max_scalar.fit_transform(df_new)
df_norm = pd.DataFrame(x_scaled)
db = Birch(branching_factor=50,
           n_clusters=5,
           threshold=25,
           compute_labels=True)
# NB: df_norm is computed but never used; the model is fit on the raw
# df_new, and threshold=25 would be far too large for min-max scaled data.
db.fit_transform(df_new)
#core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
#core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
cluster_centers = db.subcluster_centers_


import matplotlib.pyplot as plt

fig = plt.figure(figsize=(15, 15), dpi=200)
ax = fig.add_subplot(111)
ax.set_title("Cluter centers on 2 Component PCA in Disease Dataset on Birch")
for x, y, lab in zip(cluster_centers[:, 0], cluster_centers[:, 1], labels):
    # the snippet is truncated here; a plausible body scatters each
    # subcluster center and annotates it (zip stops at the shorter of
    # the per-sample labels and the centers)
    ax.scatter(x, y)
    ax.annotate(str(lab), (x, y))
plt.show()
Example #6
import numpy as np
from sklearn.cluster import Birch, KMeans, MiniBatchKMeans

# Timer, colordiff, solve_binary and solve_aexpansion are project-local
# helpers assumed to be imported elsewhere; note that `image` is read
# from an enclosing scope in the grid-sampling branch (likely a bug).
def qmrf_regions(data,
                 edges,
                 nbow=20,
                 lamda=1,
                 sampling='random',
                 nsamples=10000,
                 label_potential='l1',
                 unary_sq=True,
                 online=True,
                 gamma=None,
                 max_iter=5,
                 truncated=False,
                 rng=42,
                 verbose=True,
                 return_centers=False,
                 return_edge_costs=True):
    with Timer('Colors'):
        if nbow == 'birch':
            clf = Birch(threshold=0.8, branching_factor=100)
        elif online:
            clf = MiniBatchKMeans(n_clusters=nbow,
                                  verbose=verbose,
                                  random_state=rng,
                                  batch_size=100,
                                  max_iter=100,
                                  max_no_improvement=10)
        else:
            clf = KMeans(n_clusters=nbow, verbose=verbose, random_state=rng)

        if nsamples is None:
            dist = clf.fit_transform(data)
        else:
            if sampling == 'random':
                idx = np.random.choice(data.shape[0], nsamples, replace=False)
            else:
                n = np.sqrt(nsamples)
                ratio = image.shape[0] / float(image.shape[1])
                ny = int(n * ratio)
                nx = int(n / ratio)
                y = np.linspace(0, image.shape[0], ny,
                                endpoint=False) + (image.shape[0] // ny // 2)
                x = np.linspace(0, image.shape[1], nx,
                                endpoint=False) + (image.shape[1] // nx // 2)
                xx, yy = np.meshgrid(x, y)
                idx = np.round(yy * image.shape[1] + xx).astype(int).flatten()
            clf.fit(data[idx])
            dist = clf.transform(data)

        if nbow == 'birch':
            centers = clf.subcluster_centers_
        else:
            centers = clf.cluster_centers_

    with Timer('Unary'):
        K = centers.shape[0]

        if label_potential == 'color':
            unary_cost = np.zeros((data.shape[0], centers.shape[0]),
                                  np.float32)
            for i in range(centers.shape[0]):
                unary_cost[:, i] = colordiff(data, centers[i:i + 1])
        else:
            unary_cost = dist.astype(np.float32)

        if unary_sq:
            unary_cost **= 2

    with Timer('Pairwise'):
        if label_potential == 'l1':
            label_cost = np.abs(centers[:, None, :] -
                                centers[None, ...]).sum(-1)
        elif label_potential == 'l2':
            label_cost = np.sqrt(
                ((centers[:, None, :] - centers[None, ...])**2).sum(-1))
        elif label_potential == 'potts':
            label_cost = np.ones((K, K), int) - np.eye(K, dtype=int)
        elif label_potential == 'color':
            label_cost = np.zeros((centers.shape[0], centers.shape[0]),
                                  np.float32)
            for i in range(centers.shape[0]):
                label_cost[:, i] = colordiff(centers, centers[i:i + 1])
        if truncated:
            # np.maximum floors the pairwise cost at 1; a truncated
            # potential would typically use np.minimum to cap it instead
            label_cost = np.maximum(1, label_cost)
        label_cost = (label_cost * lamda).astype(np.float32)

    if verbose:
        print("=================")
        print("Minimizing graph:")
        print("Nodes: %d, edges: %d, labels: %d" % \
              (unary_cost.shape[0], edges.shape[0], label_cost.shape[0]))
        print("UnarySq: %s, LabelPotential: %s, EdgeCost: %s" % \
              (unary_sq, label_potential, (gamma is not None)))
        print("#################")

    with Timer('Edge Cost'):
        diff = ((data[edges[:, 0]] - data[edges[:, 1]])**2).sum(axis=1)
        if gamma is not None and type(gamma) in [int, float]:
            edge_costs = np.exp(-gamma * diff).astype(np.float32)
        elif gamma == 'auto':
            edge_costs = np.exp(-diff.mean() * diff).astype(np.float32)
        elif gamma == 'color':
            edge_costs = 1. / (1. +
                               colordiff(data[edges[:, 0]], data[edges[:, 1]]))
            edge_costs = edge_costs.astype(np.float32)
        else:
            edge_costs = np.ones(edges.shape[0], dtype=np.float32)

    with Timer('Minimize'):
        if label_cost.shape[0] == 2:
            labels = solve_binary(edges, unary_cost, edge_costs, label_cost)
        else:
            labels = solve_aexpansion(edges, unary_cost, edge_costs,
                                      label_cost)

    if return_centers:
        return labels, label_cost, centers

    return labels, label_cost
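The label-potential matrices built above can be sanity-checked in isolation; a minimal numpy sketch over a tiny toy set of centers, mirroring the 'l1', 'l2' and 'potts' branches:

import numpy as np

centers = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
K = centers.shape[0]

l1 = np.abs(centers[:, None, :] - centers[None, ...]).sum(-1)
l2 = np.sqrt(((centers[:, None, :] - centers[None, ...]) ** 2).sum(-1))
potts = np.ones((K, K), int) - np.eye(K, dtype=int)

print(l1)     # pairwise L1 distances between centers
print(l2)     # pairwise L2 distances
print(potts)  # 0 on the diagonal, 1 elsewhere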