Пример #1
0
    def _test_concat(self):
        """
        Tests concatenating K datasets into a new Dampr
        """
        # NOTE(review): the leading underscore keeps unittest from collecting
        # this method; rename it to ``test_concat`` to enable the test.
        word1 = Dampr.memory("abcdefg")

        # Run the dataset returned by ``concat``.  The original discarded the
        # return value and ran ``word1`` itself.
        # NOTE(review): assumes ``concat`` returns the combined dataset, as the
        # docstring's "into a new Dampr" suggests -- confirm.
        combined = word1.concat(Dampr.memory("hijklmn"))

        results = sorted(combined.run())
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(results, list('abcdefghijklmn'))
Пример #2
0
    def intersect(self, keys, min_match=None):
        """
        Reads the indexed lines that match at least `min_match` of `keys`.

        keys: a single key, or a list/tuple of keys, to look up in the
            `key_index` table.
        min_match: how many of the keys must be indexed at a line's offset.
            None (the default) requires all of them; a float is treated as a
            fraction of len(keys) and truncated to an int; any other value is
            placed in the query as given.

        Returns the Dampr pipeline that flat-maps every path produced by
        `read_paths` through the index lookup, yielding the line found at each
        matching offset, in ascending offset order within a file.
        """
        if not isinstance(keys, (list, tuple)):
            keys = [keys]

        if min_match is None:
            min_match = len(keys)

        if isinstance(min_match, float):
            min_match = int(min_match * len(keys))

        paths = read_paths(self.path, self.suffix)

        def sql_literal(key):
            # Standard SQL string literal: single quotes, with any embedded
            # single quote doubled.  The original wrapped keys in double
            # quotes, which SQL reads as identifiers: a key spelled like a
            # column (``key``, ``offset``) then resolves to that column, and a
            # key containing a double quote breaks (or injects into) the
            # statement.
            return u"'{}'".format(u'{}'.format(key).replace(u"'", u"''"))

        # Literals are inlined rather than bound as parameters so that large
        # key lists are not capped by the database's bound-parameter limit.
        str_keys = u','.join(sql_literal(key) for key in keys)
        query = u"""
            select offset from
            (select offset, count(*) as c
                from key_index
                where key in ({})
                group by offset) where c >= {}
            order by offset asc""".format(str_keys, min_match)

        def read_db(fname):
            db = self.open_db(fname)

            cur = db.cursor()
            cur.execute(query)
            with codecs.open(fname, encoding='utf-8') as f:
                for (offset,) in cur:
                    # Jump to each stored offset and return the line there.
                    f.seek(offset)
                    yield f.readline()

        return Dampr.memory(paths).flat_map(read_db)
Пример #3
0
    def test_len(self):
        """
        Tests the number of items in a collection.
        """
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(self.items.len().read(), [10])
        # An empty collection is expected to report a single count of zero.
        self.assertEqual(Dampr.memory([]).len().read(), [0])
Пример #4
0
    def test_stream_blocks(self):
        """
        Tests stream blocks

        Finds the two most frequent letters of a word with `partition_map`
        and `partition_reduce`, using plain functions over iterators.
        """
        import heapq

        def map_topk(it):
            # Min-heap of (count, symbol) capped at two entries: popping the
            # smallest whenever it grows past two leaves the largest counts.
            heap = []
            for symbol, count in it:
                heapq.heappush(heap, (count, symbol))
                if len(heap) > 2:
                    heapq.heappop(heap)

            # Emit every surviving entry under the same key, 1.
            return ((1, x) for x in heap)

        def reduce_topk(it):
            # Flatten the per-key value iterators into one stream of
            # (count, symbol) pairs, then keep the two largest.
            counts = (v for _, vit in it for v in vit)
            for count, symbol in heapq.nlargest(2, counts):
                yield symbol, count

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        letter_counts = word.flat_map(lambda w: list(w)).count()

        topk = letter_counts \
                .partition_map(map_topk) \
                .partition_reduce(reduce_topk)

        results = sorted(topk.run())
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(results, [('a', 4), ('i', 7)])
Пример #5
0
 def test_disjoint(self):
     """
     Tests that joining two datasets yields nothing when no key matches.

     `items2` is keyed by the negation of 0..9 while `self.items` is keyed
     by its own values, so the join is expected to be empty.
     NOTE(review): assumes the values in `self.items` are all positive.
     """
     items2 = Dampr.memory(list(range(10))) \
             .group_by(lambda x: -x)
     output = self.items.group_by(lambda x: x) \
             .join(items2) \
             .run()
     output = [v for k, v in output]
     # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
     # was removed in Python 3.12.
     self.assertEqual([], output)
Пример #6
0
    def test_repartition(self):
        """
        Tests joining against a dataset that was grouped and then reduced.

        `items2` is keyed by the negation of 0..9 and each group is summed
        before the join; `self.items` is keyed by its own values, so the join
        is expected to produce no output.
        NOTE(review): assumes the values in `self.items` are all positive.
        """
        items2 = Dampr.memory(list(range(10))) \
                .group_by(lambda x: -x) \
                    .reduce(lambda k, vs: sum(vs))

        output = self.items.group_by(lambda x: x) \
                .join(items2) \
                .run()

        output = [v for k, v in output]
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual([], output)
Пример #7
0
    def test_reduce_join(self):
        """
        Tests reducing both sides of a join into a single value per key.

        Both datasets are keyed by parity; the reducer merges the left and
        right groups of each key into one sorted list.
        """
        items2 = Dampr.memory(list(range(10)))
        res = self.items \
                .group_by(lambda x: x % 2) \
                .join(items2.group_by(lambda x: x % 2)) \
                    .reduce(lambda l, r: sorted(itertools.chain(l, r))) \
                .run()

        output = list(res)
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual((0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]), output[0])
        self.assertEqual((1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]), output[1])
Пример #8
0
    def test_top_k(self):
        """
        Tests getting the top k items
        """

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        # Rank the (letter, count) pairs by their count and keep five of them.
        topk = word.flat_map(lambda w: list(w)).count() \
                .topk(5, lambda x: x[1])

        # Sorted so the comparison does not depend on the order of ``run()``.
        results = sorted(topk.run())
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(results, [('a', 4), ('c', 3), ('i', 7), ('l', 3),
                                   ('s', 3)])
Пример #9
0
    def test_left_join(self):
        """
        Tests using a left join to drop items that appear in another dataset.

        Every left item is paired with the (possibly empty) list of right
        items sharing its key; only left items whose right list is empty are
        kept.
        """
        to_remove = Dampr.memory(list(range(10, 13)))

        output = self.items.group_by(lambda x: x) \
                .join(to_remove.group_by(lambda x: x)) \
                    .left_reduce(lambda l, r: (list(l), list(r))) \
                .filter(lambda llrs: len(llrs[1][1]) == 0) \
                .map(lambda llrs: llrs[1][0][0]) \
                .sort_by(lambda x: x) \
                .run()

        output = list(output)
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(list(range(13, 20)), output)
Пример #10
0
    def test_blocks(self):
        """
        Tests Custom Blocks

        Finds the two most frequent letters of a word with a custom
        `BlockMapper` / `BlockReducer` pair.
        """
        import heapq

        class TopKMapper(BlockMapper):
            def __init__(self, k):
                self.k = k

            def start(self):
                self.heap = []

            def add(self, _k, lc):
                # Min-heap of (count, letter) capped at k entries: popping
                # the smallest on overflow leaves the k largest counts.
                heapq.heappush(self.heap, (lc[1], lc[0]))
                if len(self.heap) > self.k:
                    heapq.heappop(self.heap)

                # Nothing is emitted per record; results come from finish().
                return iter([])

            def finish(self):
                # Emit every surviving entry under the same key, 1.
                for cl in self.heap:
                    yield 1, cl

        class TopKReducer(BlockReducer):
            def __init__(self, k):
                self.k = k

            def start(self):
                pass

            def add(self, k, it):
                for count, letter in heapq.nlargest(self.k, it):
                    yield letter, (letter, count)

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        letter_counts = word.flat_map(lambda w: list(w)).count()

        topk = letter_counts \
                .custom_mapper(TopKMapper(2)) \
                .custom_reducer(TopKReducer(2))

        results = sorted(topk.run())
        # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, which
        # was removed in Python 3.12.
        self.assertEqual(results, [('a', 4), ('i', 7)])
Пример #11
0
    def union(self, keys):
        """
        Reads the indexed lines that match at least one of `keys`.

        keys: a single key, or a list/tuple of keys, to look up in the
            `key_index` table.

        Returns the Dampr pipeline that flat-maps every path produced by
        `read_paths` through the index lookup, yielding the line found at each
        distinct matching offset, in ascending offset order within a file.
        """
        if not isinstance(keys, (list, tuple)):
            keys = [keys]

        paths = read_paths(self.path, self.suffix)

        def sql_literal(key):
            # Standard SQL string literal: single quotes, with any embedded
            # single quote doubled.  The original wrapped keys in double
            # quotes, which SQL reads as identifiers: a key spelled like a
            # column (``key``, ``offset``) then resolves to that column, and a
            # key containing a double quote breaks (or injects into) the
            # statement.
            return u"'{}'".format(u'{}'.format(key).replace(u"'", u"''"))

        # Unicode templates keep non-ASCII keys from failing to format on
        # Python 2.  Literals are inlined rather than bound as parameters so
        # that large key lists are not capped by the database's
        # bound-parameter limit.
        query = u"""select distinct offset from key_index
            where key in ({}) order by offset asc""".format(
                u','.join(sql_literal(key) for key in keys))

        def read_db(fname):
            db = self.open_db(fname)

            cur = db.cursor()
            cur.execute(query)
            with codecs.open(fname, encoding='utf-8') as f:
                for (offset,) in cur:
                    # Jump to each stored offset and return the line there.
                    f.seek(offset)
                    yield f.readline()

        return Dampr.memory(paths).flat_map(read_db)
Пример #12
0
    def build(self, key_f, force=False):
        """
        Builds the key index for the files found under `self.path`.

        key_f: called with each line of a file; every key it produces is
            stored together with the offset of that line, measured as the
            total UTF-8 encoded length of the lines before it.
        force: when true every file is indexed; otherwise files for which
            `self.exists` is true are skipped.

        Returns whatever reading the folded per-file key counts produces.
        """
        paths = sorted(read_paths(self.path, False))

        def needs_indexing(fname):
            return force or not self.exists(fname)

        def index_file(fname):
            logging.debug("Indexing %s", fname)
            db = self.create_db(fname)

            def key_offsets():
                position = 0
                with codecs.open(fname, encoding='utf-8') as handle:
                    # readline() hands back an empty string at end of file.
                    for line in iter(handle.readline, u''):
                        for key in key_f(line):
                            yield key, position

                        # Advance by encoded bytes, not by characters.
                        position += len(line.encode('utf-8'))

            cursor = db.cursor()
            cursor.executemany("INSERT INTO key_index values (?, ?)",
                               key_offsets())
            db.commit()

            # The lookup index is created only after the bulk insert.
            cursor.execute("create index key_idx on key_index (key)")
            db.commit()

            cursor.execute("select count(*) from key_index")
            (total,) = cursor.fetchone()
            logging.debug("Keys indexed for %s: %s", fname, total)

            return total

        pending = Dampr.memory(paths).filter(needs_indexing)
        per_file_counts = pending.map(index_file)
        # Fold every per-file count under one constant key to sum them.
        summed = per_file_counts.fold_by(key=lambda x: 1,
                                         binop=lambda x, y: x + y)
        return summed.read(name="indexing")
0
 def setUp(self):
     """Creates the shared fixture: the integers 10..19, built with partitions=2."""
     self.items = Dampr.memory(list(range(10, 20)), partitions=2)