Exemplo n.º 1
0
    def test_construct_from_data(self):
        """
        Build a level set tree straight from the fixture's tabular dataset and
        confirm the result passes both the viability and correctness checks.
        """
        lst = dcl.construct_tree(
            self.dataset,
            self.k,
            prune_threshold=self.gamma,
        )

        # Viability first, then correctness, on the freshly built tree.
        self._check_tree_viability(lst)
        self._check_tree_correctness(lst)
Exemplo n.º 2
0
    def test_construct_from_data(self):
        """
        Verify that a level set tree constructed directly from the fixture's
        tabular data is viable and correct.
        """
        data, k, gamma = self.dataset, self.k, self.gamma
        built_tree = dcl.construct_tree(data, k, prune_threshold=gamma)

        # Run the two shared fixture checks against the new tree.
        self._check_tree_viability(built_tree)
        self._check_tree_correctness(built_tree)
Exemplo n.º 3
0
    def test_load(self):
        """
        Check viability and correctness of an LST saved then loaded from file.

        A tree is built from the fixture data, saved to a temporary file, and
        read back. Both checks run against the re-loaded tree so that the
        save/load round trip is what is actually verified.
        """
        tree = dcl.construct_tree(self.dataset, self.k,
                                  prune_threshold=self.gamma)

        # The temporary file is removed when the `with` block exits, so the
        # tree must be loaded back before leaving it.
        with tempfile.NamedTemporaryFile() as f:
            tree.save(f.name)
            tree2 = dcl.load_tree(f.name)

        self._check_tree_viability(tree2)
        # Check the loaded tree, not the in-memory original: testing `tree`
        # here would pass even if loading had altered the tree.
        self._check_tree_correctness(tree2)
Exemplo n.º 4
0
    def test_load(self):
        """
        Check viability and correctness of an LST saved then loaded from file.

        The tree built from the fixture data is written to a temporary file
        and loaded again; the loaded copy is the object under test for both
        the viability and the correctness check.
        """
        tree = dcl.construct_tree(self.dataset,
                                  self.k,
                                  prune_threshold=self.gamma)

        # Load inside the `with` block: the temporary file disappears as soon
        # as the block exits.
        with tempfile.NamedTemporaryFile() as f:
            tree.save(f.name)
            tree2 = dcl.load_tree(f.name)

        self._check_tree_viability(tree2)
        # Previously this checked `tree`, which never exercised the loaded
        # copy; the round-tripped tree is what must be correct.
        self._check_tree_correctness(tree2)
Exemplo n.º 5
0
    def setUp(self):
        """
        Simulate a sorted, single-column sample from a three-component normal
        mixture and store the level set tree built from it as ``self.tree``.

        Also sets ``self.n`` (sample size) and ``self.gamma`` (the third
        argument passed to ``dcl.construct_tree``).
        """
        ## Data parameters
        np.random.seed(451)  # fixed seed keeps the simulated data repeatable
        self.n = 1000
        mix = (0.3, 0.5, 0.2)
        mean = (-1, 0, 1)
        stdev = (0.3, 0.2, 0.1)

        ## Tree parameters
        k = 50
        self.gamma = 5

        ## Simulate data
        membership = np.random.multinomial(self.n, pvals=mix)

        # One normal draw per mixture component, in component order (the same
        # RNG call sequence as appending draw-by-draw). Collecting the draws
        # and concatenating once avoids the removed `np.float` alias and the
        # repeated re-allocation done by `np.append` in a loop.
        draws = [np.random.normal(loc=mu, scale=sigma, size=p)
                 for (p, mu, sigma) in zip(membership, mean, stdev)]
        dataset = np.concatenate(draws)

        # Sort the values and shape them into an (n, 1) column.
        dataset = np.sort(dataset).reshape((self.n, 1))
        self.tree = dcl.construct_tree(dataset, k, self.gamma)
Exemplo n.º 6
0
    def setUp(self):
        """
        Build the test fixture: a sorted (n, 1) sample drawn from a mixture of
        three normal distributions, and the level set tree constructed from
        it, stored as ``self.tree``.

        ``self.n`` and ``self.gamma`` are stored on the instance as well.
        """
        ## Data parameters
        np.random.seed(451)  # deterministic simulation across test runs
        self.n = 1000
        mix = (0.3, 0.5, 0.2)
        mean = (-1, 0, 1)
        stdev = (0.3, 0.2, 0.1)

        ## Tree parameters
        k = 50
        self.gamma = 5

        ## Simulate data
        membership = np.random.multinomial(self.n, pvals=mix)

        # `np.float` was removed in NumPy 1.24, so the empty float seed array
        # is gone; the per-component draws (made in the original order) are
        # joined in a single concatenate instead.
        draws = [np.random.normal(loc=mu, scale=sigma, size=p)
                 for (p, mu, sigma) in zip(membership, mean, stdev)]
        dataset = np.concatenate(draws)

        # Sorted values as a single-column 2-D array.
        dataset = np.sort(dataset).reshape((self.n, 1))
        self.tree = dcl.construct_tree(dataset, k, self.gamma)
Exemplo n.º 7
0
## Allocate the data matrix and the group-membership vector. The builtin
#  `float` / `int` replace the `np.float` / `np.int` aliases, which were removed
#  in NumPy 1.24; the resulting dtypes are the same.
X = np.zeros((n, p), dtype=float)
g = np.zeros((n, ), dtype=int)

## Row boundaries of each group: b[i] is the first row of group i.
b = np.cumsum((0, ) + tuple(membership))

for i, (size, mu, sigma) in enumerate(zip(membership, centers, sdev)):
    ix = range(b[i], b[i + 1])
    X[ix, :] = np.random.multivariate_normal(mu, sigma, size)
    g[ix] = i

## NOTE(review): sorting along axis 0 orders every column independently, so
#  rows of X no longer line up with the group labels in `g` -- confirm that
#  this is intended.
X = np.sort(X, axis=0)

## Estimate the level set tree.
k = int(0.02 * n)
gamma = int(0.05 * n)

tree = dcl.construct_tree(X, k, prune_threshold=gamma, verbose=True)
print(tree)

## Retrieve cluster assignments from the tree.
labels = tree.get_clusters(method='leaf')

## Labels returned from the `get_clusters` method match the index of the
#  highest density node to which an observation belongs. Because these labels
#  are usually non-consecutive, we can reindex to make many post-processing
#  steps more natural.
new_labels = dcl.utils.reindex_cluster_labels(labels)

## A single pre-formatted string prints identically under Python 2 and 3.
print("cluster counts: {}".format(np.bincount(new_labels[:, 1])))

## Plot the level set tree as a dendrogram. The plot function returns a tuple
#  containing 4 objects. The first item is a matplotlib figure, which can be
#  shown and saved.
Exemplo n.º 8
0
    # Split every comma-separated line into its fields; iterating the source
    # directly makes the separate `list(...)` copy unnecessary.
    stay_points = [s.strip('\n').split(',') for s in stay_points]
    size = len(stay_points)

    # Record `i` once per parsed row so `users_list` stays aligned with the
    # rows collected in `total_stay_points` (presumably `i` identifies the
    # current user in the enclosing loop -- confirm against the loop header).
    users_list.extend([i] * size)

    stay_points = np.asarray(stay_points, dtype='float32')
    total_stay_points.append(stay_points)
# Flatten the list of per-iteration arrays into one array of rows.
total_stay_points = [item for sublist in total_stay_points for item in sublist]
total_stay_points = np.asarray(total_stay_points)

# write total_stay_points.txt
# `with` closes (and therefore flushes) each output file as soon as it has
# been written, instead of leaving the handles open for the rest of the run.
with open("./Clustering/total_stay_points.txt", "w") as total_stay_points_file:
    for itr in total_stay_points:
        total_stay_points_file.write(str(itr) + "\n")

# clustering
tree = dcl.construct_tree(total_stay_points, k=50)
labels = tree.get_clusters()

# write the Label.txt
with open("./Clustering/Label.txt", "w") as labels_file:
    for itr in labels:
        labels_file.write(str(itr) + "\n")

# write TBHG to the TBHG.txt
tree_str = str(tree)
with open("./Clustering/TBHG.txt", "w") as tree_file:
    tree_file.write(tree_str)

# determine the specific user in the each low-level cluster
cluster_dict = {}

for entry in labels:
Exemplo n.º 9
0
## Group-membership vector. The builtin `int` replaces the `np.int` alias, which
#  was removed in NumPy 1.24; the resulting dtype is the same.
g = np.zeros((n,), dtype=int)

## Row boundaries of each group: b[i] is the first row of group i.
b = np.cumsum((0,) + tuple(membership))

for i, (size, mu, sigma) in enumerate(zip(membership, centers, sdev)):
    ix = range(b[i], b[i + 1])
    X[ix, :] = np.random.multivariate_normal(mu, sigma, size)
    g[ix] = i

## NOTE(review): sorting along axis 0 orders every column independently, so
#  rows of X no longer line up with the group labels in `g` -- confirm that
#  this is intended.
X = np.sort(X, axis=0)


## Estimate the level set tree.
k = int(0.02 * n)
gamma = int(0.05 * n)

tree = dcl.construct_tree(X, k, prune_threshold=gamma, verbose=True)
print(tree)


## Retrieve cluster assignments from the tree.
labels = tree.get_clusters(method="leaf")


## Labels returned from the `get_clusters` method match the index of the
#  highest density node to which an observation belongs. Because these labels
#  are usually non-consecutive, we can reindex to make many post-processing
#  steps more natural.
new_labels = dcl.utils.reindex_cluster_labels(labels)

## A single pre-formatted string prints identically under Python 2 and 3.
print("cluster counts: {}".format(np.bincount(new_labels[:, 1])))

Exemplo n.º 10
0

# Synthetic data: the output of `make_circles` plus a single `make_blobs`
# cluster, both generated with fixed random states.
circles = make_circles(n_samples=500, noise=0.06, factor=0.5, random_state=23)
blob = make_blobs(
    n_samples=100,
    centers=1,
    cluster_std=0.1,
    center_box=(-1.7, 1.7),
    random_state=19,
)

# Element 0 of each result holds the coordinates; stack them row-wise.
X = np.concatenate((circles[0], blob[0]), axis=0)
print("Dataset shape:", X.shape)

# Scatter plot of the raw points, drawn with the 'ggplot' style.
with plt.style.context('ggplot'):
    fig, ax = plt.subplots(figsize=(6, 4.5))
    first_col, second_col = X[:, 0], X[:, 1]
    ax.scatter(first_col, second_col, c='black', s=50, alpha=0.5)
    fig.show()

import debacl as dcl

# Build the level set tree and print it.
tree = dcl.construct_tree(X, k=20)
print(tree)

# `plot()` returns a sequence whose first item is shown here.
plot = tree.plot()
plot[0].show()

# Prune with a threshold of 60 and show the pruned tree's plot as well.
pruned_tree = tree.prune(60)
pruned_plot = pruned_tree.plot()
pruned_plot[0].show()

# Cluster labels taken from the pruned tree.
cluster_labels = pruned_tree.get_clusters()

print("Cluster labels shape:", cluster_labels.shape)