def test_entropy(self):
     """Check some entropy properties: zero for empty or single-value
        buffers, positive but bounded for diverse random data."""
     buffer = NumericRingBuffer(10000)
     # Empty buffer: entropy is zero by definition.
     self.assertEqual(buffer.shannon_entropy(), 0)
     buffer.append(10)
     # A single value carries no information either.
     self.assertEqual(buffer.shannon_entropy(), 0)
     import random
     # Seed the RNG so the test is deterministic and reproducible.
     random.seed(0)
     for i in range(10000):
         buffer.append(random.randint(0, 1000))
     self.assertTrue(buffer.shannon_entropy(2) != 0)
     self.assertTrue(buffer.shannon_entropy(10) != 0)
     # log2 of the alphabet size bounds the entropy from above.
     self.assertTrue(buffer.shannon_entropy(2) < math.log(1000, 2))
 def test_probability(self):
     """Empirical probability p_x of a value over the buffer contents."""
     ring = NumericRingBuffer(10)
     # Nothing observed yet: any value has probability 0.
     self.assertEqual(ring.p_x(0), 0)
     ring.append(42)
     self.assertEqual(ring.p_x(42), 1)
     ring.append(1234567890)
     # Two distinct values: each accounts for half the observations.
     self.assertEqual(ring.p_x(42), 0.5)
 def test_mean(self):
     """Arithmetic mean of the buffered values."""
     ring = NumericRingBuffer(10)
     self.assertEqual(ring.mean(), 0)       # empty buffer
     ring.append(42)
     self.assertEqual(ring.mean(), 42.0)    # single value
     ring.append(5)
     self.assertEqual(ring.mean(), 23.5)    # (42 + 5) / 2
    def __init__(self, output_folder):
        """Constructor. 'counter' is the number of values we have added
           so far.

        Args:
            output_folder: directory receiving 'anomalies.dat' (the
                classification result) and 'original-serie.dat' (the
                raw input series).
        """

        self.values = NumericRingBuffer(BUFFER_SIZE)
        self.nodes = []
        self.counter = 0
        # Text mode ('w'), not binary ('wb'): the class writes str
        # objects to these files, which a binary stream rejects under
        # Python 3 (TypeError).
        self.output = open(output_folder + '/anomalies.dat', 'w')
        self.orig = open(output_folder + '/original-serie.dat', 'w')
 def test_distribution(self):
     """Percentage of stored values at or below a given threshold."""
     ring = NumericRingBuffer(100)
     # Fill with 100, 99, ..., 1 (same sequence as appending 100 - i).
     for value in range(100, 0, -1):
         ring.append(value)
     self.assertEqual(ring.percentage(0), 1)
     self.assertEqual(ring.percentage(90), 91)
     self.assertEqual(ring.percentage(100), 100)
     # Thresholds above the maximum are capped at 100%.
     self.assertEqual(ring.percentage(101), 100)
 def test_variance(self):
     """An empty buffer has zero variance."""
     ring = NumericRingBuffer(10)
     self.assertEqual(ring.variance(), 0)
 def test_expected_value(self):
     """Expected value (probability-weighted mean) of the buffer."""
     ring = NumericRingBuffer(10)
     self.assertEqual(ring.expected_value(), 0)      # empty buffer
     ring.append(42)
     self.assertEqual(ring.expected_value(), 42)     # single value
     ring.append(21)
     self.assertEqual(ring.expected_value(), 31.5)   # (42 + 21) / 2
     ring.append(21)
     self.assertEqual(ring.expected_value(), 28)     # (42 + 21 + 21) / 3
class HierarchicalClassifier(object):
    """Classify a numeric series and flag anomalies.

    Every value appended through add() is summarized by a feature vector
    (mean, Shannon entropy, variance, expected value of the ring buffer
    so far).  find_anomalies() clusters those vectors hierarchically and
    labels the tree's two top-level branches as two classes, written to
    'anomalies.dat'.
    """

    def __init__(self, output_folder):
        """Constructor. 'counter' is the number of values we have added
           so far.

        Args:
            output_folder: directory receiving 'anomalies.dat' (the
                classification result) and 'original-serie.dat' (the
                raw input series).
        """

        self.values = NumericRingBuffer(BUFFER_SIZE)
        self.nodes = []
        self.counter = 0
        # Text mode ('w'), not binary ('wb'): add() and find_anomalies()
        # write str objects, which a binary stream rejects under
        # Python 3 (TypeError).
        self.output = open(output_folder + '/anomalies.dat', 'w')
        self.orig = open(output_folder + '/original-serie.dat', 'w')

    def __del__(self):
        """Destructor. Properly close opened file descriptors."""

        self.output.close()
        self.orig.close()

    def add(self, value):
        """Add a new value. We store the current number of the value in the
           map of metadata in 'n' key."""

        self.values.append(value)
        self.counter += 1
        # A lone value yields degenerate statistics (zero entropy and
        # variance), so nodes are only built from the second value on.
        if self.values.size > 1:
            self.orig.write(str(value) + '\n')
            vector = [
                self.values.mean(),
                self.values.shannon_entropy(),
                self.values.variance(),
                self.values.expected_value(),
            ]
            metadata = {'n': self.counter, 'v': value}
            self.nodes.append(ClusterNode(vec=vector, meta=metadata))

    def build_set_rec(self, tree, marker):
        """Fill an array recursively from given tree.

        Returns a list of (sample number, marker) pairs for every
        original node (id > 0) found under 'tree'."""

        if not tree:
            return []
        current = []
        # Merged clusters carry negative ids (see hcluster); only
        # positive ids correspond to original samples.
        if tree.id > 0:
            current = [(tree.meta['n'], marker)]
        return current + self.build_set_rec(tree.left, marker) \
            + self.build_set_rec(tree.right, marker)

    def build_sets(self, tree):
        """Build two classes from the given tree: the left branch is
           labeled 0, the right branch 1."""

        return self.build_set_rec(tree.left, 0) \
            + self.build_set_rec(tree.right, 1)

    def find_anomalies(self):
        """Try to find anomalies according to what we have seen so far.

        Writes one '<sample number> <class>' line per sample, ordered by
        sample number, to the anomalies file."""

        tree = self.hcluster(self.nodes, squared_euclidian)
        sets = self.build_sets(tree)
        sets = sorted(sets, key=lambda elt: elt[0])
        for elt in sets:
            self.output.write(str(int(elt[0])) + ' ' + str(elt[1]) + '\n')

    def hcluster(self, nodes, distance=euclidian):
        """Classify list of elements.
           Principle: each row starts within its individual cluster, then
           the matrix is processed to find closest rows until each row
           fits in a global hierarchical tree.

        Args:
           nodes:     non-empty array of ClusterNode's
           distance:  function computing distance between 2 vectors

        Raises:
           ValueError: if 'nodes' is empty."""

        # Fail loudly instead of hitting an obscure IndexError below.
        if not nodes:
            raise ValueError('cannot cluster an empty list of nodes')

        distances = {}  # cache of (v, w) distances
        currentclustid = -1

        # clusters are initially just the individual rows
        clust = [ClusterNode(vec=array(node.vec), id=i, meta=node.meta)
                 for i, node in enumerate(nodes)]

        while len(clust) > 1:

            print('%d remaining clusters' % len(clust))
            lowestpair = (0, 1)
            closest = distance(clust[0].vec, clust[1].vec)

            # loop through every pair looking for the smallest distance
            # v_id and w_id are made local variables to avoid slow lookup
            # several times. The try/except statement is preferred as well
            # for performance issues (compared to `key not in distances`)
            for i in range(len(clust)):
                for j in range(i + 1, len(clust)):
                    v_id = clust[i].id
                    w_id = clust[j].id
                    try:
                        d = distances[(v_id, w_id)]
                    except KeyError:
                        # compute once, cache, and reuse the local value
                        d = distances[(v_id, w_id)] = \
                            distance(clust[i].vec, clust[j].vec)
                    if d < closest:
                        closest = d
                        lowestpair = (i, j)

            # calculate the average of the two clusters
            merged_vector = merge_vectors(clust[lowestpair[0]].vec,
                                          clust[lowestpair[1]].vec)

            # create the new cluster
            newcluster = ClusterNode(array(merged_vector),
                                     left=clust[lowestpair[0]],
                                     right=clust[lowestpair[1]],
                                     distance=closest,
                                     id=currentclustid)

            # cluster ids that weren't in the original set are negative
            currentclustid -= 1
            # delete the higher index first so the lower one stays valid
            del clust[lowestpair[1]]
            del clust[lowestpair[0]]
            clust.append(newcluster)

        return clust[0]