예제 #1
0
 def check_maxdists_Q_linkage(self, method):
     # Checks maxdists(Z) for a linkage of the Q data set built with the
     # given `method`; the result must agree with what the helper
     # calculate_maximum_distances(Z) returns, to within atol=1e-15.
     link = linkage(hierarchy_test_data.Q_X, method)
     observed = maxdists(link)
     reference = calculate_maximum_distances(link)
     assert_allclose(observed, reference, atol=1e-15)
예제 #2
0
 def check_maxdists_Q_linkage(self, method):
     # Checks maxdists(Z) for a linkage of the Q data set built with the
     # given `method`; the result must agree with what the helper
     # calculate_maximum_distances(Z) returns, to within atol=1e-15.
     link = linkage(hierarchy_test_data.Q_X, method)
     observed = maxdists(link)
     reference = calculate_maximum_distances(link)
     assert_allclose(observed, reference, atol=1e-15)
예제 #3
0
def find_optimal_t_cluster(L, matrix_fun, num_of_clusters, max_iter=1000):
    """Search for a distance threshold that yields ``num_of_clusters`` clusters.

    Starting from 66% of the largest value in ``sch.maxdists(L)`` (truncated
    to two decimals first), the threshold is halved when the cut produces too
    few flat clusters and increased by 50% when it produces too many, until
    ``sch.fcluster(L, t, criterion="distance")`` returns exactly
    ``num_of_clusters`` distinct labels.

    Parameters
    ----------
    L : linkage matrix accepted by ``sch.maxdists`` / ``sch.fcluster``.
    matrix_fun : data passed to ``silhouette_score`` together with the
        predicted labels (scored with ``metric="cityblock"``).
        NOTE(review): presumably the feature matrix the linkage was built
        from -- confirm against the caller.
    num_of_clusters : int
        Desired number of flat clusters.
    max_iter : int, optional
        Upper bound on the number of thresholds tried.  The update rule is
        not a true bisection, so it is not guaranteed to ever hit the
        requested cluster count; without a bound the search could loop
        forever.

    Returns
    -------
    arr : list of (score, threshold) tuples
        Rounded silhouette score and rounded threshold for every threshold
        that was tried, in order.
    t_f : float
        The (unrounded) threshold that produced ``num_of_clusters`` clusters.

    Raises
    ------
    RuntimeError
        If no suitable threshold is found within ``max_iter`` iterations.
    """
    arr = []

    # Initial guess: 66% of the largest merge distance (floored to 2 decimals).
    t_f = (floor(max(sch.maxdists(L)) * 100) / 100.0) * 0.66

    for _ in range(max_iter):
        predictions = sch.fcluster(L, t=t_f, criterion="distance").ravel()
        score = round(
            silhouette_score(matrix_fun, predictions, metric="cityblock"), 3)
        arr.append((score, round(t_f, 4)))

        num_of_c = len(set(predictions))
        if num_of_c == num_of_clusters:
            break
        if num_of_c < num_of_clusters:
            # Too few clusters: cut lower in the tree.
            t_f = t_f - (t_f / 2)
        else:
            # Too many clusters: cut higher in the tree.
            t_f = t_f + (t_f / 2)
    else:
        # The loop ran out of iterations without hitting the target count.
        raise RuntimeError(
            "no threshold giving {} clusters found after {} iterations".format(
                num_of_clusters, max_iter))

    print("The best value for t is t =", round(t_f, 4))
    return arr, t_f
예제 #4
0
 def check_maxdists_Q_linkage(self, method):
     # Tests maxdists(Z) on the Q data set: the linkage is built with the
     # given `method` and the result is compared against the helper
     # calculate_maximum_distances(Z) to within atol=1e-15.
     # (The condensed distance matrix is not needed here: linkage() is
     # called on the observations directly.)
     X = eo['Q-X']
     Z = linkage(X, method)
     MD = maxdists(Z)
     expectedMD = calculate_maximum_distances(Z)
     assert_allclose(MD, expectedMD, atol=1e-15)
예제 #5
0
 def check_fcluster_monocrit(self, t, criterion):
     # Tests fcluster(Z, criterion='monocrit'/'maxclust_monocrit', t=t,
     # monocrit=maxdists(Z)) on the data set stored under eo['Q-X'].
     # The expected assignment is looked up under the key
     # 'fcluster-<criterion>-<t>' and compared up to relabelling.
     expected_key = 'fcluster-%s-%d' % (criterion, t)
     expected = np.int_(eo[expected_key])
     condensed = pdist(eo['Q-X'])
     link = linkage(condensed)
     assignment = fcluster(link, criterion=criterion, t=t,
                           monocrit=maxdists(link))
     assert_(is_isomorphic(assignment, expected))
예제 #6
0
 def test_maxdists_one_cluster_linkage(self):
     # Checks maxdists(Z) on a linkage matrix consisting of a single merge
     # row; the result must agree with calculate_maximum_distances(Z).
     single_merge = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
     observed = maxdists(single_merge)
     reference = calculate_maximum_distances(single_merge)
     assert_allclose(observed, reference, atol=1e-15)
예제 #7
0
 def check_fcluster_maxclust_monocrit(self, t):
     # Checks fcluster(Z, t, criterion='maxclust_monocrit') with
     # monocrit=maxdists(Z) on a single-linkage tree of the Q data set.
     # The assignment must match hierarchy_test_data.fcluster_maxclust[t]
     # up to relabelling.
     expected = hierarchy_test_data.fcluster_maxclust[t]
     link = single(hierarchy_test_data.Q_X)
     assignment = fcluster(link, t, criterion='maxclust_monocrit',
                           monocrit=maxdists(link))
     assert_(is_isomorphic(assignment, expected))
예제 #8
0
# Plot the clustermap of `m`, using the same linkage `l` for rows and columns.
# NOTE(review): `m` and `l` are defined outside this excerpt; `l` is presumably
# a linkage matrix computed from `m` -- confirm.
# Save the returned object for further plotting
mclust = sns.clustermap(m,
                        linewidths=0,
                        cmap=plt.get_cmap('RdBu'),
                        vmax=1,
                        vmin=-1,
                        figsize=(14, 14),
                        row_linkage=l,
                        col_linkage=l)

# In[ ]:

# Threshold: median of the per-node maximum
# distances returned by scipy's hierarchy.maxdists
t = np.median(hierarchy.maxdists(l))

# In[ ]:

# Plot the clustermap again with a smaller figure (12x12 instead of 14x14)
# Save the returned object for further plotting
mclust = sns.clustermap(m,
                        linewidths=0,
                        cmap=plt.get_cmap('RdBu'),
                        vmax=1,
                        vmin=-1,
                        figsize=(12, 12),
                        row_linkage=l,
                        col_linkage=l)

# Draw the threshold lines
# NOTE(review): the drawing code does not follow in this excerpt.
# Fit the scaler on `df` and transform it.
# NOTE(review): `scaler` and `df` are defined outside this excerpt.
scaler.fit(df)
data = scaler.transform(df)

# Apply hierarchical clustering using the average linkage method
z = linkage(data, "average")

# Compute the cophenetic correlation coefficient and the cophenetic
# distances, comparing the tree against the pairwise distances of the data
c, coph_dists = cophenet(z, pdist(data))
print('Cophenet Correlation coefficient = ', c)
print('Cophenet pairwise distances = ', coph_dists)

# Print the first row of the linkage matrix, i.e. the first merge performed
# (the two merged cluster indices, their distance, and the new cluster size)
print("1st Cluster is ", z[0])

# Maximum distance at or below each node of the tree
m = maxdists(z)
print("Distance Array ", m)

# Plot the full dendrogram
plt.figure(figsize=(30, 15))
plt.title('Dendogram for Flag data')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(
    z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=10.,  # font size for the x axis labels
)
plt.show()

# Plot a truncated dendrogram showing the last 12 cluster iterations
# NOTE(review): the plotting code does not follow in this excerpt.
예제 #10
0
 def test_maxdists_one_cluster_linkage(self):
     # Checks maxdists(Z) on a linkage matrix consisting of a single merge
     # row; the result must agree with calculate_maximum_distances(Z).
     single_merge = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
     observed = maxdists(single_merge)
     reference = calculate_maximum_distances(single_merge)
     assert_allclose(observed, reference, atol=1e-15)
예제 #11
0
 def check_fcluster_maxclust_monocrit(self, t):
     # Checks fcluster(Z, t, criterion='maxclust_monocrit') with
     # monocrit=maxdists(Z) on a single-linkage tree of the Q data set.
     # The assignment must match hierarchy_test_data.fcluster_maxclust[t]
     # up to relabelling.
     expected = hierarchy_test_data.fcluster_maxclust[t]
     link = single(hierarchy_test_data.Q_X)
     assignment = fcluster(link, t, criterion='maxclust_monocrit',
                           monocrit=maxdists(link))
     assert_(is_isomorphic(assignment, expected))
예제 #12
0
    ax.set_aspect(1./ax.get_data_ratio())

    #### figure 2 #####
    uniqueLabels = np.sort(np.unique(case1Labels))
    centroids = np.array([case1[np.where(case1Labels == i)[0],:].mean(axis=0) for i in uniqueLabels])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ncluster = 27
    y = pdist(centroids)
    method = 'centroid'#'average'
    z = hierarchy.linkage(y,'average')
    #t = hierarchy.fcluster(27,criterion='maxclust')

    ## computes, for each non-singleton cluster, the max distance between any clusters at or below it
    print 'max dists', hierarchy.maxdists(z)
    

    ## inconsistency
    r = hierarchy.inconsistent(z)
    print 'r',r
    #print 'max inconsts', hierarchy.maxinconsts(z,r,i)
    print 'z',z
    #print 'blah', z[:,2] - np.array(z[1:,2].tolist()+[0])
    print z[:,2]
    print np.hstack([z[1:,2],[0]])
    levelDiffs = np.abs(z[:,2] - np.hstack([z[1:,2],[0]]))
    levelDiffMeans = z[:,2]# - 0.001 #np.hstack([z[1:,2],[0]]) / 2.0#z[:,2] + np.hstack([z[1:,2],[0]]) / 2.0

    print 'diffs',levelDiffs*100
    diffInds = np.argsort(levelDiffs)
예제 #13
0
def plot_corr_cluster(m, method=1, **kargs):
    """Draw a clustered heatmap of ``m`` and outline its flat clusters.

    An average-linkage tree is computed from ``m`` and used for both rows
    and columns of a seaborn clustermap.  A distance threshold is chosen,
    drawn on both dendrograms, and every flat cluster with more than one
    member is outlined on the heatmap with a green rectangle.

    Parameters
    ----------
    m : matrix to cluster and plot.
        NOTE(review): the code uses ``m.index`` and ``m.shape``, so this is
        presumably a square pandas DataFrame (e.g. a correlation matrix) --
        confirm against callers.
    method : int, optional
        How the threshold is chosen:
        1 -- 70% of the largest merge distance (MATLAB-like behavior);
        2 -- median of ``hierarchy.maxdists``;
        3 -- 0.75 quantile of ``hierarchy.maxdists``.
    **kargs
        Forwarded to ``sns.clustermap``.

    Raises
    ------
    RuntimeError
        If ``method`` is not 1, 2 or 3.
    """
    sns.set_style('white')

    link = fst.linkage(m, method='average')

    # Choose the distance threshold at which the tree is cut
    if method == 1:
        # MATLAB-like behavior
        t = 0.7*max(link[:, 2])
    elif method == 2:
        t = np.median(hierarchy.maxdists(link))
    elif method == 3:
        t = mquantiles(hierarchy.maxdists(link), prob=0.75)[0]
    else:
        raise RuntimeError('no such method')

    # Plot the clustermap; keep the returned grid for the overlays below
    mclust = sns.clustermap(m,
                            linewidths=0,
                            cmap=plt.get_cmap('RdBu'),
                            vmax=1,
                            vmin=-1,
                            row_linkage=link,
                            col_linkage=link,
                            **kargs)

    # Mark the threshold on both dendrograms
    n_rows = m.shape[0]
    line_extent = n_rows*10
    line_style = dict(colors='g', linewidths=2, zorder=1)
    mclust.ax_col_dendrogram.hlines(t, 0, line_extent, **line_style)
    mclust.ax_row_dendrogram.vlines(t, 0, line_extent, **line_style)

    # Cut the tree and outline each cluster on the heatmap
    clusters = hierarchy.fcluster(link, t, 'distance')
    for label in set(clusters):
        members = m.index[clusters == label]
        # Positions of the cluster members in the reordered matrix
        positions = [pos for pos in range(n_rows)
                     if mclust.data2d.columns[pos] in members]

        # Singletons get no rectangle
        if len(positions) != 1:
            side = len(positions)
            outline = patches.Rectangle(
                (min(positions), n_rows - max(positions) - 1),
                side,
                side,
                facecolor='none',
                edgecolor='g',
                lw=3)
            mclust.ax_heatmap.add_patch(outline)

    plt.title('Cluster matrix')