def __init__(self, data):
    """Build the cluster tree for ``data`` from a full agglomerative clustering.

    Fits ``hcluster`` on a copy of ``data`` and turns every merge listed in
    its ``children_`` into a ``Cluster`` node.  A node's ``range`` is the
    diameter of its points divided by the diameter of the whole data set.
    Every maximal zero-range subtree (presumably duplicated points) is
    collapsed into one leaf cluster, counts are converted to fractions of
    the number of rows, and each node receives a ``level`` (leaves are 1, a
    parent is one more than its deeper child).

    When the diameter of ``data`` is zero, a single cluster holding every
    row is created instead of a tree.

    Args:
        data: samples indexed as ``data[i, :]`` with ``data.shape[0]`` rows
            (assumed to be a 2-D numpy array -- TODO confirm with callers).

    Sets ``root``, ``max_level``, ``data``, ``clusters`` (cluster id ->
    Cluster), ``leaves`` (row index -> leaf Cluster) and the private
    ``__drange`` / ``__dcount`` attributes.
    """
    drange = self.__diameter(data)
    dcount = data.shape[0]

    # run clustering
    # NOTE(review): hcluster is presumably scikit-learn's
    # AgglomerativeClustering -- confirm against the file's imports.
    clusterer = hcluster(compute_full_tree=True)
    clusterer.fit(data.copy())
    hc_children = clusterer.children_

    # setup leaf clusters: one singleton cluster per data row
    clusters = {
        i: Cluster(i, None, None, data=np.array([data[i, :]]), count=1, range=0)
        for i in range(dcount)
    }
    leaves = {}

    # setup tree
    if drange != 0:
        minrange = 1
        for idx in range(hc_children.shape[0]):
            pair = hc_children[idx]
            left_child, right_child = clusters[pair[0]], clusters[pair[1]]
            # merge ids continue after the row indices
            cid = idx + dcount
            cluster = Cluster(cid, left_child, right_child)
            cluster.data = np.vstack((left_child.data, right_child.data))
            cluster.range = self.__diameter(cluster.data) / drange
            if cluster.range < minrange:
                minrange = cluster.range
            cluster.count = left_child.count + right_child.count
            left_child.parent = cluster
            right_child.parent = cluster
            clusters[cid] = cluster

        # Point every id at its topmost zero-range ancestor; that ancestor is
        # the "true" leaf for all rows below it.  Only existing keys are
        # reassigned here, so iterating the dict directly is safe.
        for cid in clusters:
            cluster = clusters[cid]
            leaf = (cluster.count == 1)
            while cluster.parent is not None and cluster.parent.range == 0:
                cluster = cluster.parent
            clusters[cid] = cluster
            if (cluster.range == 0 and cluster.parent is not None
                    and cluster.parent.range != 0):
                if cluster.items is None:
                    cluster.items = set()
                if leaf:
                    # NOTE(review): only original row ids are recorded as
                    # items, matching the degenerate branch below -- confirm.
                    leaves[cid] = cluster
                    cluster.items.add(cid)

        # clear "clusters" from original leaves
        # set true leaves children to empty
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live key view raises RuntimeError on Python 3.
        for cid in list(clusters):
            cluster = clusters[cid]
            if cluster.id != cid:
                del clusters[cid]
            elif cluster.range == 0:
                cluster.left, cluster.right = None, None

        # Give leaves a non-zero range and turn counts into fractions.
        for cluster in clusters.values():
            if cluster.range == 0:
                cluster.range = minrange if minrange < 0.1 else 1e-4
            cluster.count = cluster.count * 1.0 / dcount
    else:
        # Degenerate data (zero diameter): one cluster owns every row.
        big_cluster = Cluster(dcount, None, None, data=np.array(data[0, :]),
                              count=dcount, range=1e-4)
        big_cluster.items = set(range(dcount))
        for i in range(dcount):
            leaves[i] = big_cluster
        clusters = {dcount: big_cluster}

    # setup leaf levels
    for lcluster in leaves.values():
        lcluster.level = 1

    # compute all tree levels: climb from each leaf for as long as both
    # children of the current node already have a level
    computed = {lcluster.id for lcluster in leaves.values()}
    for lid in leaves:
        cluster = leaves[lid]
        while (cluster is not None
               and (cluster.right is None or cluster.right.id in computed)
               and (cluster.left is None or cluster.left.id in computed)):
            if cluster.id not in computed:
                cluster.level = max(cluster.right.level, cluster.left.level) + 1
                computed.add(cluster.id)
            cluster = cluster.parent
    del computed

    # set tree state variables: the root is the parentless ancestor of row 0
    test_cl = leaves[0]
    while test_cl.parent is not None:
        test_cl = test_cl.parent
    self.root = test_cl
    self.max_level = test_cl.level
    self.data = data
    self.clusters = clusters
    self.leaves = leaves
    self.__drange = drange
    self.__dcount = dcount
def __init__(self, data):
    """Build the cluster tree for ``data`` from a full agglomerative clustering.

    Fits ``hcluster`` on a copy of ``data`` and turns every merge listed in
    its ``children_`` into a ``Cluster`` node.  A node's ``range`` is the
    diameter of its points divided by the diameter of the whole data set.
    Every maximal zero-range subtree (presumably duplicated points) is
    collapsed into one leaf cluster, counts are converted to fractions of
    the number of rows, and each node receives a ``level`` (leaves are 1, a
    parent is one more than its deeper child).

    When the diameter of ``data`` is zero, a single cluster holding every
    row is created instead of a tree.

    Args:
        data: samples indexed as ``data[i, :]`` with ``data.shape[0]`` rows
            (assumed to be a 2-D numpy array -- TODO confirm with callers).

    Sets ``root``, ``max_level``, ``data``, ``clusters`` (cluster id ->
    Cluster), ``leaves`` (row index -> leaf Cluster) and the private
    ``__drange`` / ``__dcount`` attributes.
    """
    drange = self.__diameter(data)
    dcount = data.shape[0]

    # run clustering
    # NOTE(review): hcluster is presumably scikit-learn's
    # AgglomerativeClustering -- confirm against the file's imports.
    clusterer = hcluster(compute_full_tree=True)
    clusterer.fit(data.copy())
    hc_children = clusterer.children_

    # setup leaf clusters: one singleton cluster per data row
    clusters = {
        i: Cluster(i, None, None, data=np.array([data[i, :]]), count=1, range=0)
        for i in range(dcount)
    }
    leaves = {}

    # setup tree
    if drange != 0:
        minrange = 1
        for idx in range(hc_children.shape[0]):
            pair = hc_children[idx]
            left_child, right_child = clusters[pair[0]], clusters[pair[1]]
            # merge ids continue after the row indices
            cid = idx + dcount
            cluster = Cluster(cid, left_child, right_child)
            cluster.data = np.vstack((left_child.data, right_child.data))
            cluster.range = self.__diameter(cluster.data) / drange
            if cluster.range < minrange:
                minrange = cluster.range
            cluster.count = left_child.count + right_child.count
            left_child.parent = cluster
            right_child.parent = cluster
            clusters[cid] = cluster

        # Point every id at its topmost zero-range ancestor; that ancestor is
        # the "true" leaf for all rows below it.  Only existing keys are
        # reassigned here, so iterating the dict directly is safe.
        for cid in clusters:
            cluster = clusters[cid]
            leaf = (cluster.count == 1)
            while cluster.parent is not None and cluster.parent.range == 0:
                cluster = cluster.parent
            clusters[cid] = cluster
            if (cluster.range == 0 and cluster.parent is not None
                    and cluster.parent.range != 0):
                if cluster.items is None:
                    cluster.items = set()
                if leaf:
                    # NOTE(review): only original row ids are recorded as
                    # items, matching the degenerate branch below -- confirm.
                    leaves[cid] = cluster
                    cluster.items.add(cid)

        # clear "clusters" from original leaves
        # set true leaves children to empty
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live key view raises RuntimeError on Python 3.
        for cid in list(clusters):
            cluster = clusters[cid]
            if cluster.id != cid:
                del clusters[cid]
            elif cluster.range == 0:
                cluster.left, cluster.right = None, None

        # Give leaves a non-zero range and turn counts into fractions.
        for cluster in clusters.values():
            if cluster.range == 0:
                cluster.range = minrange if minrange < 0.1 else 1e-4
            cluster.count = cluster.count * 1.0 / dcount
    else:
        # Degenerate data (zero diameter): one cluster owns every row.
        big_cluster = Cluster(dcount, None, None, data=np.array(data[0, :]),
                              count=dcount, range=1e-4)
        big_cluster.items = set(range(dcount))
        for i in range(dcount):
            leaves[i] = big_cluster
        clusters = {dcount: big_cluster}

    # setup leaf levels
    for lcluster in leaves.values():
        lcluster.level = 1

    # compute all tree levels: climb from each leaf for as long as both
    # children of the current node already have a level
    computed = {lcluster.id for lcluster in leaves.values()}
    for lid in leaves:
        cluster = leaves[lid]
        while (cluster is not None
               and (cluster.right is None or cluster.right.id in computed)
               and (cluster.left is None or cluster.left.id in computed)):
            if cluster.id not in computed:
                cluster.level = max(cluster.right.level, cluster.left.level) + 1
                computed.add(cluster.id)
            cluster = cluster.parent
    del computed

    # set tree state variables: the root is the parentless ancestor of row 0
    test_cl = leaves[0]
    while test_cl.parent is not None:
        test_cl = test_cl.parent
    self.root = test_cl
    self.max_level = test_cl.level
    self.data = data
    self.clusters = clusters
    self.leaves = leaves
    self.__drange = drange
    self.__dcount = dcount
def __init__(self, data):
    """Build the cluster tree for one-dimensional ``data``.

    The values are sorted, reshaped to a single column and clustered with
    ``hcluster`` using complete linkage.  Each cluster stores only its
    ``(min, max)`` value pair as ``data``; its ``range`` is that span
    divided by the span of the whole data set.  Children are ordered so the
    left child holds the smaller values.  Every maximal zero-range subtree
    is collapsed into one leaf cluster, counts are converted to fractions
    of the number of values, and each node receives a ``level`` (leaves are
    1, a parent is one more than its deeper child).

    When all values are equal, a single cluster holding every value is
    created instead of a tree.

    Args:
        data: 1-d numpy array of values (must be non-empty, since the first
            and last sorted values are read).

    Sets ``data`` (the sorted column), ``root``, ``max_level``,
    ``clusters`` (cluster id -> Cluster), ``leaves`` (sorted position ->
    leaf Cluster) and the private ``__dindex`` (sorting permutation),
    ``__findex`` (its argsort), ``__drange`` and ``__dcount`` attributes.
    """
    # NOTE: assumes data is 1-d numpy array
    pos = data.argsort()
    data = data[pos]
    data = data.reshape(data.shape[0], 1)
    drange = data[-1, 0] - data[0, 0]
    dcount = data.shape[0]

    # run clustering
    # NOTE(review): hcluster is presumably scikit-learn's
    # AgglomerativeClustering -- confirm against the file's imports.
    clusterer = hcluster(compute_full_tree=True, linkage='complete')
    clusterer.fit(data)
    hc_children = clusterer.children_

    # setup leaf clusters: one singleton (value, value) span per position
    clusters = {
        i: Cluster(i, None, None, data=(data[i, 0], data[i, 0]), count=1, range=0)
        for i in range(dcount)
    }
    leaves = {}

    # setup tree
    if drange != 0:
        minrange = 1
        for idx in range(hc_children.shape[0]):
            pair = hc_children[idx]
            lc, rc = clusters[pair[0]], clusters[pair[1]]
            # keep the child with the smaller values on the left
            if lc.data[-1] < rc.data[0]:
                left_child, right_child = lc, rc
            else:
                left_child, right_child = rc, lc
            # merge ids continue after the leaf positions
            cid = idx + dcount
            cluster = Cluster(cid, left_child, right_child)
            cluster.data = (min(left_child.data), max(right_child.data))
            cluster.range = (cluster.data[-1] - cluster.data[0]) / drange
            if cluster.range < minrange:
                minrange = cluster.range
            cluster.count = left_child.count + right_child.count
            left_child.parent = cluster
            right_child.parent = cluster
            clusters[cid] = cluster

        # Point every id at its topmost zero-range ancestor; that ancestor is
        # the "true" leaf for all values below it.  Only existing keys are
        # reassigned here, so iterating the dict directly is safe.
        for cid in clusters:
            cluster = clusters[cid]
            leaf = (cluster.count == 1)
            while cluster.parent is not None and cluster.parent.range == 0:
                cluster = cluster.parent
            clusters[cid] = cluster
            if (cluster.range == 0 and cluster.parent is not None
                    and cluster.parent.range != 0):
                if cluster.items is None:
                    cluster.items = set()
                if leaf:
                    # NOTE(review): only original positions are recorded as
                    # items, matching the degenerate branch below -- confirm.
                    leaves[cid] = cluster
                    cluster.items.add(cid)

        # Drop ids that were redirected to an ancestor and detach the
        # children of the surviving zero-range (leaf) clusters.
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live key view raises RuntimeError on Python 3.
        for cid in list(clusters):
            cluster = clusters[cid]
            if cluster.id != cid:
                del clusters[cid]
            elif cluster.range == 0:
                cluster.left, cluster.right = None, None

        # Give leaves a non-zero range and turn counts into fractions.
        for cluster in clusters.values():
            if cluster.range == 0:
                cluster.range = minrange if minrange < 0.1 else 1e-4
            cluster.count = cluster.count * 1.0 / dcount
    else:
        # Degenerate data (all values equal): one cluster owns everything.
        big_cluster = Cluster(dcount, None, None,
                              data=(data[0, 0], data[0, 0]),
                              count=dcount, range=1e-4)
        big_cluster.items = set(range(dcount))
        for i in range(dcount):
            leaves[i] = big_cluster
        clusters = {dcount: big_cluster}

    # setup leaf levels
    for lcluster in leaves.values():
        lcluster.level = 1

    # compute all tree levels: climb from each leaf for as long as the
    # current node is a zero-span leaf or both its children have a level
    computed = {lcluster.id for lcluster in leaves.values()}
    for lid in leaves:
        cluster = leaves[lid]
        while (cluster is not None
               and (cluster.data[0] == cluster.data[1]
                    or (cluster.right.id in computed
                        and cluster.left.id in computed))):
            if cluster.id not in computed:
                cluster.level = max(cluster.right.level, cluster.left.level) + 1
                computed.add(cluster.id)
            cluster = cluster.parent
    del computed

    # set tree state variables; ``clusters`` is never empty here, and the
    # root is the cluster with the largest id
    self.data = data
    self.root = clusters[max(clusters)]
    self.max_level = max(cluster.level for cluster in clusters.values())
    self.clusters = clusters
    self.leaves = leaves
    self.__dindex = pos
    self.__findex = np.argsort(pos)
    self.__drange = drange
    self.__dcount = dcount
def __init__(self, data):
    """Build the cluster tree for one-dimensional ``data``.

    The values are sorted, reshaped to a single column and clustered with
    ``hcluster`` using complete linkage.  Each cluster stores only its
    ``(min, max)`` value pair as ``data``; its ``range`` is that span
    divided by the span of the whole data set.  Children are ordered so the
    left child holds the smaller values.  Every maximal zero-range subtree
    is collapsed into one leaf cluster, counts are converted to fractions
    of the number of values, and each node receives a ``level`` (leaves are
    1, a parent is one more than its deeper child).

    When all values are equal, a single cluster holding every value is
    created instead of a tree.

    Args:
        data: 1-d numpy array of values (must be non-empty, since the first
            and last sorted values are read).

    Sets ``data`` (the sorted column), ``root``, ``max_level``,
    ``clusters`` (cluster id -> Cluster), ``leaves`` (sorted position ->
    leaf Cluster) and the private ``__dindex`` (sorting permutation),
    ``__findex`` (its argsort), ``__drange`` and ``__dcount`` attributes.
    """
    # NOTE: assumes data is 1-d numpy array
    pos = data.argsort()
    data = data[pos]
    data = data.reshape(data.shape[0], 1)
    drange = data[-1, 0] - data[0, 0]
    dcount = data.shape[0]

    # run clustering
    # NOTE(review): hcluster is presumably scikit-learn's
    # AgglomerativeClustering -- confirm against the file's imports.
    clusterer = hcluster(compute_full_tree=True, linkage='complete')
    clusterer.fit(data)
    hc_children = clusterer.children_

    # setup leaf clusters: one singleton (value, value) span per position
    clusters = {
        i: Cluster(i, None, None, data=(data[i, 0], data[i, 0]), count=1, range=0)
        for i in range(dcount)
    }
    leaves = {}

    # setup tree
    if drange != 0:
        minrange = 1
        for idx in range(hc_children.shape[0]):
            pair = hc_children[idx]
            lc, rc = clusters[pair[0]], clusters[pair[1]]
            # keep the child with the smaller values on the left
            if lc.data[-1] < rc.data[0]:
                left_child, right_child = lc, rc
            else:
                left_child, right_child = rc, lc
            # merge ids continue after the leaf positions
            cid = idx + dcount
            cluster = Cluster(cid, left_child, right_child)
            cluster.data = (min(left_child.data), max(right_child.data))
            cluster.range = (cluster.data[-1] - cluster.data[0]) / drange
            if cluster.range < minrange:
                minrange = cluster.range
            cluster.count = left_child.count + right_child.count
            left_child.parent = cluster
            right_child.parent = cluster
            clusters[cid] = cluster

        # Point every id at its topmost zero-range ancestor; that ancestor is
        # the "true" leaf for all values below it.  Only existing keys are
        # reassigned here, so iterating the dict directly is safe.
        for cid in clusters:
            cluster = clusters[cid]
            leaf = (cluster.count == 1)
            while cluster.parent is not None and cluster.parent.range == 0:
                cluster = cluster.parent
            clusters[cid] = cluster
            if (cluster.range == 0 and cluster.parent is not None
                    and cluster.parent.range != 0):
                if cluster.items is None:
                    cluster.items = set()
                if leaf:
                    # NOTE(review): only original positions are recorded as
                    # items, matching the degenerate branch below -- confirm.
                    leaves[cid] = cluster
                    cluster.items.add(cid)

        # Drop ids that were redirected to an ancestor and detach the
        # children of the surviving zero-range (leaf) clusters.
        # Iterate over a snapshot of the keys: deleting entries while
        # iterating the live key view raises RuntimeError on Python 3.
        for cid in list(clusters):
            cluster = clusters[cid]
            if cluster.id != cid:
                del clusters[cid]
            elif cluster.range == 0:
                cluster.left, cluster.right = None, None

        # Give leaves a non-zero range and turn counts into fractions.
        for cluster in clusters.values():
            if cluster.range == 0:
                cluster.range = minrange if minrange < 0.1 else 1e-4
            cluster.count = cluster.count * 1.0 / dcount
    else:
        # Degenerate data (all values equal): one cluster owns everything.
        big_cluster = Cluster(dcount, None, None,
                              data=(data[0, 0], data[0, 0]),
                              count=dcount, range=1e-4)
        big_cluster.items = set(range(dcount))
        for i in range(dcount):
            leaves[i] = big_cluster
        clusters = {dcount: big_cluster}

    # setup leaf levels
    for lcluster in leaves.values():
        lcluster.level = 1

    # compute all tree levels: climb from each leaf for as long as the
    # current node is a zero-span leaf or both its children have a level
    computed = {lcluster.id for lcluster in leaves.values()}
    for lid in leaves:
        cluster = leaves[lid]
        while (cluster is not None
               and (cluster.data[0] == cluster.data[1]
                    or (cluster.right.id in computed
                        and cluster.left.id in computed))):
            if cluster.id not in computed:
                cluster.level = max(cluster.right.level, cluster.left.level) + 1
                computed.add(cluster.id)
            cluster = cluster.parent
    del computed

    # set tree state variables; ``clusters`` is never empty here, and the
    # root is the cluster with the largest id
    self.data = data
    self.root = clusters[max(clusters)]
    self.max_level = max(cluster.level for cluster in clusters.values())
    self.clusters = clusters
    self.leaves = leaves
    self.__dindex = pos
    self.__findex = np.argsort(pos)
    self.__drange = drange
    self.__dcount = dcount