def _test_concat(self):
    """
    Tests concatenating K datasets into a new Dampr.

    The method name does not start with ``test``, so default unittest
    discovery will not collect it.
    """
    word1 = Dampr.memory("abcdefg")
    # NOTE(review): the value returned by concat() is discarded and the
    # assertion below runs against word1 -- confirm whether concat() mutates
    # in place or returns a new Dampr before re-enabling this test.
    word1.concat(Dampr.memory("hijklmn"))
    results = sorted(word1.run())
    self.assertEqual(results, list('abcdefghijklmn'))
def intersect(self, keys, min_match=None):
    """
    Builds a Dampr over the lines indexed under at least `min_match` of `keys`.

    keys: a single key, or a list/tuple of keys.  A lone key is wrapped in
        a list.
    min_match: minimum number of matching ``key_index`` rows a line offset
        needs in order to be returned.  ``None`` (the default) means
        ``len(keys)``; a float is treated as a fraction of ``len(keys)``
        and truncated with ``int()``.

    Returns ``Dampr.memory(paths).flat_map(read_db)``: for every path from
    ``read_paths`` the index is queried and the line read at each matching
    offset is yielded, in ascending offset order per file.
    """
    if not isinstance(keys, (list, tuple)):
        keys = [keys]

    if min_match is None:
        min_match = len(keys)

    if isinstance(min_match, float):
        min_match = int(min_match * len(keys))

    paths = read_paths(self.path, self.suffix)

    # Keys are bound as parameters rather than formatted into the SQL text.
    # The previous form wrapped each key in double quotes, which SQLite
    # parses as an identifier first (so a key such as "key" or "offset"
    # resolved to a column) and which broke on keys containing a quote.
    # Keys are stringified first so they are compared as text, exactly as
    # the quoted literals were.
    # NOTE: SQLite limits the number of bound parameters per statement
    # (SQLITE_MAX_VARIABLE_NUMBER).
    placeholders = u','.join(u'?' for _ in keys)
    params = [u'{}'.format(key) for key in keys]
    params.append(min_match)

    query = u"""
        select offset
        from (select offset, count(*) as c
              from key_index
              where key in ({})
              group by offset)
        where c >= ?
        order by offset asc""".format(placeholders)

    def read_db(fname):
        db = self.open_db(fname)
        cur = db.cursor()
        cur.execute(query, params)
        with codecs.open(fname, encoding='utf-8') as f:
            for (offset,) in cur:
                # Jump to the stored offset and emit the line found there.
                f.seek(offset)
                yield f.readline()

    return Dampr.memory(paths).flat_map(read_db)
def test_len(self):
    """
    Tests the number of items in a collection.

    Checks both the shared ``self.items`` fixture and an empty collection.
    """
    self.assertEqual(self.items.len().read(), [10])
    self.assertEqual(Dampr.memory([]).len().read(), [0])
def test_stream_blocks(self):
    """
    Tests stream blocks.

    Counts the letters of a word, then uses partition_map/partition_reduce
    with a size-2 heap to keep the two most frequent letters.
    """
    import heapq

    def map_topk(it):
        # Keep only the two largest (count, symbol) pairs seen so far.
        heap = []
        for symbol, count in it:
            heapq.heappush(heap, (count, symbol))
            if len(heap) > 2:
                heapq.heappop(heap)

        # Emit every survivor under the same key, 1.
        return ((1, x) for x in heap)

    def reduce_topk(it):
        # Flatten the grouped values, then pick the overall top two.
        counts = (v for k, vit in it for v in vit)
        for count, symbol in heapq.nlargest(2, counts):
            yield symbol, count

    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    letter_counts = word.flat_map(lambda w: list(w)).count()
    topk = letter_counts \
        .partition_map(map_topk) \
        .partition_reduce(reduce_topk)

    results = sorted(topk.run())
    self.assertEqual(results, [('a', 4), ('i', 7)])
def test_disjoint(self):
    """
    Joins ``self.items`` keyed by value against 0..9 keyed by negated value
    and expects the join to produce no output.
    """
    items2 = Dampr.memory(list(range(10))) \
        .group_by(lambda x: -x)

    output = self.items.group_by(lambda x: x) \
        .join(items2) \
        .run()

    output = [v for k, v in output]
    self.assertEqual([], output)
def test_repartition(self):
    """
    Joins ``self.items`` keyed by value against a dataset that was grouped
    by negated value and reduced (summed per key) before the join, and
    expects the join to produce no output.
    """
    items2 = Dampr.memory(list(range(10))) \
        .group_by(lambda x: -x) \
        .reduce(lambda k, vs: sum(vs))

    output = self.items.group_by(lambda x: x) \
        .join(items2) \
        .run()

    output = [v for k, v in output]
    self.assertEqual([], output)
def test_reduce_join(self):
    """
    Joins two datasets grouped by parity and reduces each joined group to
    the sorted concatenation of both sides.
    """
    items2 = Dampr.memory(list(range(10)))

    res = self.items \
        .group_by(lambda x: x % 2) \
        .join(items2.group_by(lambda x: x % 2)) \
        .reduce(lambda l, r: sorted(itertools.chain(l, r))) \
        .run()

    output = list(res)
    self.assertEqual((0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]), output[0])
    self.assertEqual((1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]), output[1])
def test_top_k(self):
    """
    Tests getting the top k items.

    Counts the letters of a word and asks for the 5 entries with the
    largest count (the second element of each pair).
    """
    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    topk = word.flat_map(lambda w: list(w)).count() \
        .topk(5, lambda x: x[1])

    results = sorted(topk.run())
    self.assertEqual(
        results,
        [('a', 4), ('c', 3), ('i', 7), ('l', 3), ('s', 3)])
def test_left_join(self):
    """
    Uses a left join to keep only the values of ``self.items`` that have no
    counterpart in ``to_remove`` (10, 11, 12), and expects 13..19 back in
    sorted order.
    """
    to_remove = Dampr.memory(list(range(10, 13)))

    output = self.items.group_by(lambda x: x) \
        .join(to_remove.group_by(lambda x: x)) \
        .left_reduce(lambda l, r: (list(l), list(r))) \
        .filter(lambda llrs: len(llrs[1][1]) == 0) \
        .map(lambda llrs: llrs[1][0][0]) \
        .sort_by(lambda x: x) \
        .run()

    output = list(output)
    self.assertEqual(list(range(13, 20)), output)
def test_blocks(self):
    """
    Tests Custom Blocks.

    Counts the letters of a word, then uses a custom BlockMapper and
    BlockReducer to keep the two most frequent letters.
    """
    import heapq

    class TopKMapper(BlockMapper):
        def __init__(self, k):
            self.k = k

        def start(self):
            self.heap = []

        def add(self, _k, lc):
            # Store as (count, letter) so the heap orders by count, and
            # never hold more than k entries.
            heapq.heappush(self.heap, (lc[1], lc[0]))
            if len(self.heap) > self.k:
                heapq.heappop(self.heap)

            # Nothing is emitted per record; output happens in finish().
            return iter([])

        def finish(self):
            # Emit every retained entry under the same key, 1.
            for cl in self.heap:
                yield 1, cl

    class TopKReducer(BlockReducer):
        def __init__(self, k):
            self.k = k

        def start(self):
            pass

        def add(self, k, it):
            for count, letter in heapq.nlargest(self.k, it):
                yield letter, (letter, count)

    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    letter_counts = word.flat_map(lambda w: list(w)).count()
    topk = letter_counts \
        .custom_mapper(TopKMapper(2)) \
        .custom_reducer(TopKReducer(2))

    results = sorted(topk.run())
    self.assertEqual(results, [('a', 4), ('i', 7)])
def union(self, keys):
    """
    Builds a Dampr over the lines indexed under any of `keys`.

    keys: a single key, or a list/tuple of keys.  A lone key is wrapped in
        a list.

    Returns ``Dampr.memory(paths).flat_map(read_db)``: for every path from
    ``read_paths`` the index is queried for the distinct offsets matching
    any key, and the line read at each offset is yielded in ascending
    offset order per file.
    """
    if not isinstance(keys, (list, tuple)):
        keys = [keys]

    paths = read_paths(self.path, self.suffix)

    # Keys are bound as parameters rather than formatted into the SQL text.
    # The previous form wrapped each key in double quotes, which SQLite
    # parses as an identifier first (so a key such as "key" or "offset"
    # resolved to a column) and which broke on keys containing a quote.
    # Keys are stringified (as unicode) so they are compared as text,
    # exactly as the quoted literals were.
    # NOTE: SQLite limits the number of bound parameters per statement
    # (SQLITE_MAX_VARIABLE_NUMBER).
    placeholders = u','.join(u'?' for _ in keys)
    params = [u'{}'.format(key) for key in keys]

    query = u"""select distinct offset
        from key_index
        where key in ({})
        order by offset asc""".format(placeholders)

    def read_db(fname):
        db = self.open_db(fname)
        cur = db.cursor()
        cur.execute(query, params)
        with codecs.open(fname, encoding='utf-8') as f:
            for (offset,) in cur:
                # Jump to the stored offset and emit the line found there.
                f.seek(offset)
                yield f.readline()

    return Dampr.memory(paths).flat_map(read_db)
def build(self, key_f, force=False):
    """
    Indexes the files returned by ``read_paths(self.path, False)`` and
    returns the result of the Dampr job that sums the per-file key counts.

    key_f: called with each line of a file; every key it yields is stored
        in ``key_index`` together with the byte offset at which that line
        starts.
    force: when true, a file is indexed even if ``self.exists(fname)`` is
        true; otherwise such files are filtered out.
    """
    def keyed_offsets(fname):
        # Yields (key, offset) for every key of every line, where offset
        # is the UTF-8 byte position at which the line begins.
        position = 0
        with codecs.open(fname, encoding='utf-8') as handle:
            while True:
                line = handle.readline()
                if not line:
                    return

                for key in key_f(line):
                    yield key, position

                position += len(line.encode('utf-8'))

    def index_file(fname):
        # Populates the index for one file and reports how many rows it holds.
        logging.debug("Indexing %s", fname)
        db = self.create_db(fname)

        cursor = db.cursor()
        cursor.executemany("INSERT INTO key_index values (?, ?)",
                           keyed_offsets(fname))
        db.commit()

        # The lookup index is created only after the bulk insert.
        cursor.execute("create index key_idx on key_index (key)")
        db.commit()

        cursor.execute("select count(*) from key_index")
        (total,) = cursor.fetchone()
        logging.debug("Keys indexed for %s: %s", fname, total)
        return total

    def needs_indexing(fname):
        return force or not self.exists(fname)

    paths = sorted(read_paths(self.path, False))

    pending = Dampr.memory(paths).filter(needs_indexing)
    counts = pending.map(index_file)
    # Every count is folded under the single key 1, giving one grand total.
    summed = counts.fold_by(key=lambda _count: 1,
                            binop=lambda left, right: left + right)
    return summed.read(name="indexing")
def setUp(self):
    """
    Creates the shared ``self.items`` fixture: the integers 10..19 loaded
    through ``Dampr.memory`` with ``partitions=2``.
    """
    self.items = Dampr.memory(list(range(10, 20)), partitions=2)