Exemplo n.º 1
0
 def test_update_point_less_points(self, PointSet):
     """Update a point while the FastPair is still uninitialized.

     Only the first nine points of the fixture are added, which the test
     asserts is not enough for the structure to initialize itself.
     """
     points = PointSet
     fp = FastPair()
     # Insert points one by one rather than through `build`.
     for pt in points[:9]:
         fp += pt
     assert fp.initialized is False
     first = points[0]  # Any stored point would do; take the first one
     replacement = rand_tuple(len(first))
     fp._update_point(first, replacement)
     assert len(fp) == 1
Exemplo n.º 2
0
 def test_merge_closest(self):
     """Merge the closest pair repeatedly until a single point is left.

     The RNG is seeded so the final surviving point can be compared with a
     hard-coded expected value.
     """
     # TODO: flesh this out more... lots of things to test here
     random.seed(1234)
     point_set = PointSet(d=4)
     fp = FastPair().build(point_set)
     remaining = len(point_set)
     while remaining > 1:
         _, (keep, drop) = fp.closest_pair()
         merged = interact(keep, drop)
         fp -= drop  # Remove one member of the pair...
         fp._update_point(keep, merged)  # ...and replace the other
         remaining -= 1
     assert len(fp) == 1
     assert remaining == 1
     expected = (0.69903599809571437, 0.52457534006594131,
                 0.7614753848101149, 0.37011695654655385)
     assert all_close(fp.points[0], expected)
     # Fewer than two points remain, so no closest pair exists any more
     with pytest.raises(ValueError):
         fp.closest_pair()
Exemplo n.º 3
0
 def test_update_point(self, PointSet):
     """Replace one point and check the neighbor stored for its replacement.

     After `_update_point(old, new)` the structure must no longer contain
     `old`, must contain `new`, must keep its size, and the neighbor
     recorded for `new` must agree with a brute-force search.
     """
     ps = PointSet
     fp = FastPair().build(ps)
     assert len(fp) == len(ps)
     old = ps[0]  # Just grab the first point...
     new = rand_tuple(len(old))
     fp._update_point(old, new)
     assert old not in fp
     assert new in fp
     assert len(fp) == len(ps)  # Size shouldn't change
     # Brute-force nearest neighbor of `new`. `old` is skipped because it
     # was just removed from `fp` (asserted above) and so can never be the
     # stored neighbor; including it made the comparison fail whenever `old`
     # happened to be the closest point to `new`.
     candidates = [(fp.dist(new, q), q) for q in ps if q != old]
     best_dist, best_point = min(candidates, key=itemgetter(0))
     neigh = fp.neighbors[new]
     # NOTE(review): assumes neighbor records are indexable by "dist" and
     # "neigh" -- confirm against the FastPair version under test.
     assert abs(best_dist - neigh["dist"]) < 1e-8
     assert best_point == neigh["neigh"]
Exemplo n.º 4
0
 def test_merge_closest(self):
     """Merge the closest pair repeatedly until a single point is left.

     The RNG is seeded so the final surviving point can be compared with a
     hard-coded expected value.
     """
     # TODO: flesh this out more... lots of things to test here
     random.seed(1234)
     point_set = [rand_tuple(4) for _ in range(50)]
     fp = FastPair().build(point_set)
     remaining = len(point_set)
     while remaining > 1:
         _, (keep, drop) = fp.closest_pair()
         merged = interact(keep, drop)
         fp -= drop  # Remove one member of the pair...
         fp._update_point(keep, merged)  # ...and replace the other
         remaining -= 1
     assert len(fp) == 1
     assert remaining == 1
     expected = (
         0.69903599809571437,
         0.52457534006594131,
         0.7614753848101149,
         0.37011695654655385,
     )
     assert all_close(fp.points[0], expected)
     # Fewer than two points remain, so no closest pair exists any more
     with pytest.raises(ValueError):
         fp.closest_pair()
Exemplo n.º 5
0
 def test_update_point(self):
     """Replace one point and check the neighbor stored for its replacement.

     After `_update_point(old, new)` the structure must no longer contain
     `old`, must contain `new`, must keep its size, and the neighbor
     recorded for `new` must agree with a brute-force search.
     """
     ps = PointSet()
     fp = FastPair().build(ps)
     assert len(fp) == len(ps)
     old = ps[0]  # Just grab the first point...
     new = rand_tuple(len(old))
     fp._update_point(old, new)
     assert old not in fp
     assert new in fp
     assert len(fp) == len(ps)  # Size shouldn't change
     # Brute-force nearest neighbor of `new`. `old` is skipped because it
     # was just removed from `fp` (asserted above) and so can never be the
     # stored neighbor; including it made the comparison fail whenever `old`
     # happened to be the closest point to `new`.
     candidates = [(fp.dist(new, q), q) for q in ps if q != old]
     best_dist, best_point = min(candidates, key=itemgetter(0))
     neigh = fp.neighbors[new]
     assert abs(best_dist - neigh["dist"]) < 1e-8
     assert best_point == neigh["neigh"]
Exemplo n.º 6
0
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """
    def __init__(self,
                 kmax=100,
                 dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
            The default is built once, when the class is defined.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points fed into the sketch so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's minimum
        # point count before it initializes itself -- confirm.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` work without a separate `__iadd__`.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest stored centroid towards centroid `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            # `sdist` items are ranked by element 0 and the centroid is taken
            # from element 1 (presumably (distance, centroid) pairs).
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points`, in order, and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the clusters whose size
        is positive. An empty list is returned when no cluster has a positive
        size (including an empty sketch), instead of dividing by zero.
        """
        clusters = list(self)
        sizes = [x.size for x in clusters if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in clusters if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids, as a list (for plotting)."""
        return [c.center for c in self.fastpair]
Exemplo n.º 7
0
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """
    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
            The default is built once, when the class is defined.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points fed into the sketch so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's minimum
        # point count before it initializes itself -- confirm.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` work without a separate `__iadd__`.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest stored centroid towards centroid `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            # `sdist` items are ranked by element 0 and the centroid is taken
            # from element 1 (presumably (distance, centroid) pairs).
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points`, in order, and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the clusters whose size
        is positive. An empty list is returned when no cluster has a positive
        size (including an empty sketch), instead of dividing by zero.
        """
        clusters = list(self)
        sizes = [x.size for x in clusters if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes)/len(sizes)) * p
        return [x for x in clusters if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids, as a list (for plotting)."""
        return [c.center for c in self.fastpair]