def test_bad_arguments():
    """Check that MSTClustering rejects bad arguments and bad call order.

    Every case asserts that a ``ValueError`` is raised whose message matches
    the given regular expression.
    """
    X, y = make_blobs(100, random_state=42)

    # Constructed without cutoff or cutoff_frac: fit() must refuse.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac",
                        mst.fit, X, y)

    # A negative cutoff must be rejected at fit time.
    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # Asking for graph segments on an unfitted estimator.
    # The pattern is a raw string because "\(" is not a valid escape in an
    # ordinary literal, and both pairs of parentheses are escaped so that
    # they match literally instead of acting as (empty) regex groups.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments\(\)"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    # Graph segments are requested after fitting on a precomputed
    # distance matrix.
    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
示例#2
0
def test_bad_arguments():
    """Verify the ``ValueError`` messages raised for invalid usage.

    Covers a missing cutoff, a negative cutoff, requesting graph segments
    before fitting, and requesting them with a precomputed metric.
    """
    X, y = make_blobs(100, random_state=42)

    # No cutoff or cutoff_frac was passed to the constructor.
    mst = MSTClustering()
    assert_raises_regex(ValueError,
                        "Must specify either cutoff or cutoff_frac", mst.fit,
                        X, y)

    # cutoff=-1 is expected to fail validation in fit().
    mst = MSTClustering(cutoff=-1)
    assert_raises_regex(ValueError, "cutoff must be positive", mst.fit, X)

    # get_graph_segments() called on an estimator that was never fitted.
    # Raw string: "\(" is an invalid escape sequence in a normal string
    # literal. The trailing parentheses are escaped as well, otherwise the
    # regex treats them as an empty group rather than literal characters.
    mst = MSTClustering()
    msg = r"Must call fit\(\) before get_graph_segments\(\)"
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)

    # Fitted on pairwise distances with metric='precomputed'.
    mst = MSTClustering(cutoff=0, metric='precomputed')
    mst.fit(pairwise_distances(X))
    msg = "Cannot use ``get_graph_segments`` with precomputed metric."
    assert_raises_regex(ValueError, msg, mst.get_graph_segments)
示例#3
0
def MST_clustering(filename):
    """Cluster words read from a file and plot a t-SNE view of the result.

    Lines of ``filename`` longer than four characters are kept (the length
    is measured before trailing whitespace is stripped). A pairwise
    ``jaccard`` matrix is computed over the first 500 kept words and passed
    to ``MSTClustering.fit``. The fitted model's ``full_tree_`` is embedded
    with ``TSNE`` and drawn as a scatter plot coloured by ``labels_``.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    None
        The function only displays a plot.
    """
    with open(filename, 'r') as f:
        # The length test sees the raw line, including its line terminator.
        words = [line.rstrip() for line in f if len(line) > 4]

    # Slice once: only the first 500 words take part in the comparison.
    sample = np.asarray(words)[:500]
    jac_similarity = np.array([[jaccard(w1, w2) for w1 in sample]
                               for w2 in sample])

    # NOTE(review): the original author marked cutoff_scale=1 as uncertain.
    mst = MSTClustering(min_cluster_size=10,
                        cutoff_scale=1)
    mst.fit(jac_similarity)

    # toarray() gives a plain ndarray; todense() would return np.matrix,
    # which recent scikit-learn estimators reject as input.
    X_tsne = TSNE(learning_rate=100).fit_transform(mst.full_tree_.toarray())

    # The stray pdb.set_trace() breakpoint that used to sit here was
    # removed so the function runs unattended.
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=mst.labels_)
    plt.show()
示例#4
0
def get_mst(dataframe):
    """Fit ``MSTClustering(cutoff_scale=2)`` on ``dataframe``.

    Returns the ``labels_`` attribute of the fitted estimator.
    """
    clusterer = MSTClustering(cutoff_scale=2)
    clusterer.fit(dataframe)

    cluster_labels = clusterer.labels_
    return cluster_labels
示例#5
0
# Cluster X with approximate=False and colour each point by its label.
model = MSTClustering(cutoff_scale=2, approximate=False)
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='rainbow')
plt.show()

# Draw the spanning tree of the fitted model.
plot_minimum_spanning_tree(model)
plt.show()

# Add 200 uniformly distributed noise points in [-14, 14) x [-14, 14).
# The generator is seeded from the last entry of y.
rng = np.random.RandomState(int(100 * y[-1]))
noise = -14 + 28 * rng.rand(200, 2)

# Noise points are appended after the original samples and labelled -1.
X_noisy = np.vstack([X, noise])
y_noisy = np.concatenate([y, np.full(200, -1, dtype=int)])

# All points share one fixed colour, so no colormap is passed: a cmap is
# only used when `c` holds numeric data to be colour-mapped.
plt.scatter(X_noisy[:, 0], X_noisy[:, 1], c='lightblue')
plt.xlim(-15, 15)
plt.ylim(-15, 15)

plt.show()

# Noisy data, default min_cluster_size.
noisy_model = MSTClustering(cutoff_scale=1)
noisy_model.fit(X_noisy)
plot_minimum_spanning_tree(noisy_model)
plt.show()

# Noisy data again, this time with min_cluster_size=10.
noisy_model = MSTClustering(cutoff_scale=1, min_cluster_size=10)
noisy_model.fit(X_noisy)
plot_minimum_spanning_tree(noisy_model)
plt.show()