def test_call_and_closest_pair(self, PointSet):
    """Calling the structure must agree with ``closest_pair`` and brute force.

    ``PointSet`` is the collection of points supplied to the test; the
    structure is built from it directly.
    """
    fast_pair = FastPair().build(PointSet)
    fast = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()

    # ``fp()`` is compared for exact equality with ``fp.closest_pair()``.
    assert fast_pair() == fast

    # Distances are compared with a small tolerance, the pairs exactly.
    gap = abs(fast[0] - brute[0])
    assert gap < 1e-8
    assert fast[1] == brute[1]
def test_call_and_closest_pair(self):
    """Calling the structure must agree with ``closest_pair`` and brute force."""
    points = PointSet()
    fast_pair = FastPair().build(points)
    fast = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()

    # ``fp()`` is compared for exact equality with ``fp.closest_pair()``.
    assert fast_pair() == fast

    # Distances are compared with a small tolerance, the pairs exactly.
    gap = abs(fast[0] - brute[0])
    assert gap < 1e-8
    assert fast[1] == brute[1]
def test_update_point_less_points(self, PointSet):
    """Update a point after adding only nine points (structure uninitialized)."""
    points = PointSet
    fast_pair = FastPair()
    for point in points[:9]:
        fast_pair += point
    assert fast_pair.initialized is False

    # Swap the first point for a fresh random one of the same dimension.
    first = points[0]
    replacement = rand_tuple(len(first))
    fast_pair._update_point(first, replacement)

    # NOTE(review): nine points were added above, yet a length of 1 is
    # expected after the update -- confirm this is the intended behavior.
    assert len(fast_pair) == 1
def test_all_closest_pairs(self):
    """Closest pair must match both brute-force references.

    The divide-and-conquer variant is not exercised here; the original
    author noted its ordering may differ.
    """
    points = PointSet()
    fast_pair = FastPair().build(points)
    fast = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()  # Ordering should be the same

    assert abs(fast[0] - brute[0]) < 1e-8
    assert fast[1] == brute[1]  # Tuple comparison

    # Independent exhaustive search over every unordered pair of points.
    exhaustive = (
        (fast_pair.dist(a, b), (a, b)) for a, b in combinations(points, r=2)
    )
    reference = min(exhaustive, key=itemgetter(0))

    assert abs(fast[0] - reference[0]) < 1e-8
    # Sort both pairs so the comparison ignores the order within the pair.
    assert sorted(fast[1]) == sorted(reference[1])
def test_sub(self, PointSet):
    """Removing a point shrinks the structure and refreshes neighbor info."""
    points = PointSet
    fast_pair = FastPair().build(points)

    # Neighbor of the last point, recorded before that point is removed.
    before = fast_pair._find_neighbor(points[-1])
    fast_pair -= points[-1]
    after = fast_pair._find_neighbor(before["neigh"])

    assert after["neigh"] != points[-1]
    # This is risky, because it might legitimately be the same...?
    assert before["dist"] != after["dist"]
    assert len(fast_pair) == len(points) - 1

    # Removing a point that was never added is expected to raise ValueError.
    unknown = rand_tuple(len(points[0]))
    with pytest.raises(ValueError):
        fast_pair -= unknown
def test_call_and_closest_pair_min_points(self, image_array):
    """Closest-pair queries must work while the structure is uninitialized.

    The points come from ``image_array`` and are compared with
    ``image_distance``; the test expects six of them in total.
    """
    fast_pair = FastPair(dist=image_distance)
    for image in image_array:
        fast_pair += image

    assert fast_pair.initialized is False
    assert len(fast_pair) == 6

    fast = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()
    assert fast_pair() == fast
    assert abs(fast[0] - brute[0]) < 1e-8
    assert fast[1] == brute[1]
def test_sub(self):
    """Removing a point shrinks the structure and refreshes neighbor info."""
    points = PointSet()
    fast_pair = FastPair().build(points)

    # Neighbor of the last point, recorded before that point is removed.
    before = fast_pair._find_neighbor(points[-1])
    fast_pair -= points[-1]
    after = fast_pair._find_neighbor(before["neigh"])

    assert after["neigh"] != points[-1]
    # This is risky, because it might legitimately be the same...?
    assert before["dist"] != after["dist"]
    assert len(fast_pair) == len(points) - 1

    # Removing a point that was never added is expected to raise ValueError.
    unknown = rand_tuple(len(points[0]))
    with pytest.raises(ValueError):
        fast_pair -= unknown
def test_all_closest_pairs(self, PointSet):
    """Closest pair must match both brute-force references.

    The divide-and-conquer variant is not exercised here; the original
    author noted its ordering may differ.
    """
    points = PointSet
    fast_pair = FastPair().build(points)
    fast = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()  # Ordering should be the same

    assert abs(fast[0] - brute[0]) < 1e-8
    assert fast[1] == brute[1]  # Tuple comparison

    # Independent exhaustive search over every unordered pair of points.
    exhaustive = (
        (fast_pair.dist(a, b), (a, b)) for a, b in combinations(points, r=2)
    )
    reference = min(exhaustive, key=itemgetter(0))

    assert abs(fast[0] - reference[0]) < 1e-8
    # Sort both pairs so the comparison ignores the order within the pair.
    assert sorted(fast[1]) == sorted(reference[1])
def test_update_point(self, PointSet):
    """Replace one point and check membership, size and the new neighbor.

    The brute-force reference is computed over the points that are still in
    the structure after the update. The replaced point ``old`` is excluded:
    it has been removed, so it can never be reported as a neighbor, and
    including it made the comparison fail whenever ``old`` happened to be
    the closest point to ``new``.
    """
    points = PointSet
    fast_pair = FastPair().build(points)
    assert len(fast_pair) == len(points)

    old = points[0]  # Just grab the first point...
    new = rand_tuple(len(old))
    fast_pair._update_point(old, new)

    assert old not in fast_pair
    assert new in fast_pair
    assert len(fast_pair) == len(points)  # Size shouldn't change

    # Exhaustive nearest-neighbor search for `new` among the surviving points.
    candidates = [
        (fast_pair.dist(new, other), other)
        for other in points
        if other != old
    ]
    expected = min(candidates, key=itemgetter(0))
    neigh = fast_pair.neighbors[new]

    # Previously computed but never checked; assert the stored neighbor.
    assert abs(expected[0] - neigh["dist"]) < 1e-8
    assert expected[1] == neigh["neigh"]
def test_find_neighbor_and_sdist(self):
    """``_find_neighbor`` and ``sdist`` must agree with a brute-force scan."""
    points = PointSet()
    fast_pair = FastPair().build(points)
    query = rand_tuple(len(points[0]))

    found = fast_pair._find_neighbor(query)  # Abusing find_neighbor!
    direct = fast_pair.dist(query, found["neigh"])
    assert abs(direct - found["dist"]) < 1e-8
    assert len(fast_pair) == len(points)  # Make sure we didn't add a point...

    # Reference: distance from the query to every original point.
    scan = [(fast_pair.dist(query, other), other) for other in points]
    nearest = min(scan, key=itemgetter(0))
    assert abs(nearest[0] - found["dist"]) < 1e-8
    assert nearest[1] == found["neigh"]

    # The smallest entry produced by ``sdist`` must describe the same point.
    nearest = min(fast_pair.sdist(query), key=itemgetter(0))
    assert abs(found["dist"] - nearest[0]) < 1e-8
    assert found["neigh"] == nearest[1]
def test_init(self):
    """A freshly constructed FastPair has default settings and no data."""
    fresh = FastPair()

    # Default configuration.
    assert fresh.min_points == 10
    assert isinstance(fresh.dist, FunctionType)

    # Nothing stored and not yet initialized.
    assert fresh.initialized is False
    assert len(fresh.points) == 0
    assert len(fresh.neighbors) == 0
def test_update_point(self):
    """Replace one point and check membership, size and the new neighbor.

    The brute-force reference is computed over the points that are still in
    the structure after the update. The replaced point ``old`` is excluded:
    it has been removed (asserted below), so it can never be reported as a
    neighbor. Scanning it anyway made this test fail whenever ``old``
    happened to be the closest point to ``new``.
    """
    points = PointSet()
    fast_pair = FastPair().build(points)
    assert len(fast_pair) == len(points)

    old = points[0]  # Just grab the first point...
    new = rand_tuple(len(old))
    fast_pair._update_point(old, new)

    assert old not in fast_pair
    assert new in fast_pair
    assert len(fast_pair) == len(points)  # Size shouldn't change

    # Exhaustive nearest-neighbor search for `new` among the surviving points.
    candidates = [
        (fast_pair.dist(new, other), other)
        for other in points
        if other != old
    ]
    expected = min(candidates, key=itemgetter(0))
    neigh = fast_pair.neighbors[new]

    assert abs(expected[0] - neigh["dist"]) < 1e-8
    assert expected[1] == neigh["neigh"]
def __init__(self, kmax=100, dist=kernel_dist(gaussian), centroid_factory=KernelCentroid):
    """Initialize an empty sketch backed by a new `FastPair` structure.

    Parameters
    ----------
    kmax : int, default=100
        The maximum number of cluster centroids to store (i.e., size of
        memory). This parameter controls the 'scale' of the desired
        solution, such that larger values of `kmax` will lead to a higher
        resolution cluster solution.
    dist : callable, default=kernel_dist(gaussian)
        Distance function passed on to the internal `FastPair` as its
        `dist` argument. The default is evaluated once, when this function
        is defined.
    centroid_factory : callable, default=KernelCentroid
        Stored on the instance unchanged; presumably used to wrap incoming
        points as centroids -- confirm against the rest of the class.
    """
    self.kmax = kmax
    self.npoints = 0  # Counter starts at zero
    self.centroid_factory = centroid_factory
    # NOTE(review): the positional 10 is presumably FastPair's `min_points`;
    # confirm against FastPair's signature.
    self.fastpair = FastPair(10, dist=dist)
def test_add(self, PointSet):
    """Adding points one by one initializes the structure only later on.

    After the first nine points the structure must still be uninitialized;
    once every remaining point has been added it must be initialized.
    """
    points = PointSet
    fast_pair = FastPair()
    head, tail = points[:9], points[9:]

    for point in head:
        fast_pair += point
    assert fast_pair.initialized is False
    assert len(fast_pair) == 9

    for point in tail:
        fast_pair += point
    assert fast_pair.initialized is True
def test_cluster(self):
    """Merge closest pairs until one point is left, checked by brute force.

    Every round performs the merge twice: once through the FastPair
    structure and once on the plain point list using an exhaustive search.
    Both must pick the same pair and stay in sync.
    """
    points = PointSet()
    fast_pair = FastPair().build(points)
    rounds = len(fast_pair) - 1

    for _ in range(rounds):
        # Version one: let FastPair pick the pair and replace it by the merge.
        fp_dist, (a, b) = fast_pair.closest_pair()
        merged = interact(a, b)
        fast_pair -= b  # Drop b
        fast_pair -= a
        fast_pair += merged

        # Reference: exhaustive search on the list. Order gets reversed here...
        ref_dist, (e, f) = min(
            (
                (fast_pair.dist(p, q), (p, q))
                for p, q in combinations(points, r=2)
            ),
            key=itemgetter(0),
        )
        ref_merged = interact(e, f)

        assert abs(ref_dist - fp_dist) < 1e-8
        assert (a == e or b == e) and (b == f or a == f)
        assert merged == ref_merged

        # Apply the same merge to the reference list.
        points.remove(e)
        points.remove(f)
        points.append(ref_merged)
        assert contains_same(fast_pair.points, points)

    assert len(fast_pair.points) == len(points) == 1
def test_cluster(self, PointSet):
    """Merge closest pairs until one point is left, checked by brute force.

    Every round performs the merge twice: once through the FastPair
    structure and once on the supplied point list using an exhaustive
    search. Both must pick the same pair and stay in sync. Note that the
    supplied ``PointSet`` list is modified in place.
    """
    points = PointSet
    fast_pair = FastPair().build(points)
    rounds = len(fast_pair) - 1

    for _ in range(rounds):
        # Version one: let FastPair pick the pair and replace it by the merge.
        fp_dist, (a, b) = fast_pair.closest_pair()
        merged = interact(a, b)
        fast_pair -= b  # Drop b
        fast_pair -= a
        fast_pair += merged

        # Reference: exhaustive search on the list. Order gets reversed here...
        ref_dist, (e, f) = min(
            (
                (fast_pair.dist(p, q), (p, q))
                for p, q in combinations(points, r=2)
            ),
            key=itemgetter(0),
        )
        ref_merged = interact(e, f)

        assert abs(ref_dist - fp_dist) < 1e-8
        assert (a == e or b == e) and (b == f or a == f)
        assert merged == ref_merged

        # Apply the same merge to the reference list.
        points.remove(e)
        points.remove(f)
        points.append(ref_merged)
        assert contains_same(fast_pair.points, points)

    assert len(fast_pair.points) == len(points) == 1
def test_iter(self, PointSet):
    """Exercise iteration and item get/set on a built FastPair.

    Checks that iterating yields a known point and that indexing by a point
    gives an object whose ``neigh`` is a known point, then pokes item access
    with a key that is not one of the supplied points.
    """
    ps = PointSet
    fp = FastPair().build(ps)
    assert fp.min_points == 10
    assert isinstance(fp.dist, FunctionType)
    my_iter = iter(fp)
    assert next(my_iter) in set(ps)
    assert fp[ps[0]].neigh in set(ps)
    # NOTE(review): a KeyError is tolerated here but not required -- the test
    # also passes if the lookup succeeds. Presumably (2, 3, 4) is meant to be
    # a missing key; confirm and consider asserting the error explicitly.
    try:
        myitem = fp[(2, 3, 4)]
    except KeyError as err:
        print(err)
    # Item assignment using the current neighbor of the first point as value.
    fp[ps[0]] = fp[ps[0]].neigh
    # NOTE(review): the right-hand side reads fp[ps[0]] again after the
    # assignment above, so a KeyError caught here may come from that lookup
    # rather than from the assignment itself -- verify which one is intended.
    try:
        fp[(2, 3, 4)] = fp[ps[0]].neigh
    except KeyError as err:
        print(err)
def test_find_neighbor_and_sdist(self, PointSet):
    """``_find_neighbor`` and ``sdist`` must agree with a brute-force scan."""
    points = PointSet
    fast_pair = FastPair().build(points)
    query = rand_tuple(len(points[0]))

    found = fast_pair._find_neighbor(query)  # Abusing find_neighbor!
    direct = fast_pair.dist(query, found["neigh"])
    assert abs(direct - found["dist"]) < 1e-8
    assert len(fast_pair) == len(points)  # Make sure we didn't add a point...

    # Reference: distance from the query to every supplied point.
    scan = [(fast_pair.dist(query, other), other) for other in points]
    nearest = min(scan, key=itemgetter(0))
    assert abs(nearest[0] - found["dist"]) < 1e-8
    assert nearest[1] == found["neigh"]

    # The smallest entry produced by ``sdist`` must describe the same point.
    nearest = min(fast_pair.sdist(query), key=itemgetter(0))
    assert abs(found["dist"] - nearest[0]) < 1e-8
    assert found["neigh"] == nearest[1]
def test_merge_closest(self):
    """Merge closest pairs down to one point and compare with a fixed result.

    The random generator is seeded so that the final surviving point can be
    compared against hard-coded coordinates.
    """
    # This needs to be 'fleshed' out more... lots of things to test here
    random.seed(1234)
    points = PointSet(d=4)
    fast_pair = FastPair().build(points)

    remaining = len(points)
    while remaining >= 2:
        _, (a, b) = fast_pair.closest_pair()
        merged = interact(a, b)
        fast_pair -= b  # Drop b
        fast_pair._update_point(a, merged)  # ...and replace a by the merge
        remaining -= 1

    assert len(fast_pair) == 1
    assert remaining == 1

    expected = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(fast_pair.points[0], expected)

    # Should have < 2 points now...
    with pytest.raises(ValueError):
        fast_pair.closest_pair()
def test_merge_closest(self):
    """Merge closest pairs down to one point and compare with a fixed result.

    The random generator is seeded before fifty 4-dimensional points are
    drawn, so the final surviving point can be compared against hard-coded
    coordinates.
    """
    # This needs to be 'fleshed' out more... lots of things to test here
    random.seed(1234)
    points = [rand_tuple(4) for _ in range(50)]
    fast_pair = FastPair().build(points)

    remaining = len(points)
    while remaining >= 2:
        _, (a, b) = fast_pair.closest_pair()
        merged = interact(a, b)
        fast_pair -= b  # Drop b
        fast_pair._update_point(a, merged)  # ...and replace a by the merge
        remaining -= 1

    assert len(fast_pair) == 1
    assert remaining == 1

    expected = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(fast_pair.points[0], expected)

    # Should have < 2 points now...
    with pytest.raises(ValueError):
        fast_pair.closest_pair()
def test_len(self):
    """``len`` is zero before building and matches the input afterwards."""
    points = PointSet()
    fast_pair = FastPair()

    size_before = len(fast_pair)
    assert size_before == 0

    fast_pair.build(points)
    assert len(fast_pair) == len(points)
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.

    Centroid objects produced by `centroid_factory` are expected to offer
    `add`, `merge`, `size` and `center`, which are the members used below.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the internal `FastPair` structure.
        centroid_factory : callable, default=KernelCentroid
            Called with every incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0
        self.centroid_factory = centroid_factory
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and returned, which is what makes
        ``sketch += p`` work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest existing centroid towards `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        # At least two centroids are needed for a closest pair to exist.
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point from `points` in order and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of all clusters with a
        positive size. When no cluster has a positive size (including an
        empty sketch) there is nothing over threshold and an empty list is
        returned instead of dividing by zero.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids, e.g. for plotting."""
        return [c.center for c in self.fastpair]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.

    Centroid objects produced by `centroid_factory` are expected to offer
    `add`, `merge`, `size` and `center`, which are the members used below.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the internal `FastPair` structure.
        centroid_factory : callable, default=KernelCentroid
            Called with every incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0
        self.centroid_factory = centroid_factory
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and returned, which is what makes
        ``sketch += p`` work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest existing centroid towards `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        # At least two centroids are needed for a closest pair to exist.
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point from `points` in order and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of all clusters with a
        positive size. When no cluster has a positive size (including an
        empty sketch) there is nothing over threshold and an empty list is
        returned instead of dividing by zero.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids, e.g. for plotting."""
        return [c.center for c in self.fastpair]
def test_build(self, PointSet):
    """Building from a point collection stores every point and initializes."""
    fast_pair = FastPair().build(PointSet)
    expected_size = len(PointSet)

    # One stored point and one neighbor entry per supplied point.
    assert len(fast_pair) == expected_size
    assert len(fast_pair.neighbors) == expected_size
    assert fast_pair.initialized is True
def test_len(self, PointSet):
    """``len`` is zero before building and matches the input afterwards."""
    points = PointSet
    fast_pair = FastPair()

    size_before = len(fast_pair)
    assert size_before == 0

    fast_pair.build(points)
    assert len(fast_pair) == len(points)
def test_contains(self, PointSet):
    """Membership is false before building and true afterwards."""
    points = PointSet
    fast_pair = FastPair()

    # Nothing has been stored yet.
    assert points[0] not in fast_pair

    fast_pair.build(points)
    assert points[0] in fast_pair
def test_contains(self):
    """Membership is false before building and true afterwards."""
    points = PointSet()
    fast_pair = FastPair()

    # Nothing has been stored yet.
    assert points[0] not in fast_pair

    fast_pair.build(points)
    assert points[0] in fast_pair