def __iter__(self):
    """Return an iterator over the in-memory items plus any archived runs.

    When ``self.archives`` is empty the result is just an iterator over the
    ``(key, value)`` pairs of ``self.combined``.  Otherwise the in-memory
    pairs are sorted by key, appended to ``self.archives`` as one more run,
    and every run is passed either to ``HeapOnKey(...).merge`` wrapped in a
    ``GroupByNestedIter`` (when ``rddconf.is_groupby`` and
    ``rddconf.iter_group`` are both set) or to ``heap_merged`` together
    with ``self._get_merge_function()``.
    """
    if not self.archives:
        return six.iteritems(self.combined)

    # On Python 3 ``items()`` returns a view that has no ``.sort()``, so the
    # sorted list is built with ``sorted`` instead of sorting in place.
    in_memory = sorted(six.iteritems(self.combined), key=itemgetter(0))

    # NOTE(review): this appends to self.archives on every call, so a second
    # iteration would add the in-memory run again -- confirm single use.
    self.archives.append(iter(in_memory))
    iters = [iter(archive) for archive in self.archives]

    if self.rddconf.is_groupby and self.rddconf.iter_group:
        heap = HeapOnKey(key=lambda x: x[0], min_heap=True)
        return GroupByNestedIter(heap.merge(iters), "")
    return heap_merged(iters, self._get_merge_function())
def test_merge(self):
    """Merging key-sorted chunks with ``HeapOnKey`` must equal a full sort.

    A shuffled list of ``(index, value)`` pairs is cut into chunks of 13,
    each chunk is sorted by value, and the merged output is compared with
    the whole list sorted by value.
    """
    total = 100
    chunk_size = 13

    def by_value(pair):
        return pair[1]

    values = list(range(total))
    random.shuffle(values)
    pairs = list(enumerate(values))

    # Consecutive slices of ``chunk_size`` pairs, each sorted by value.
    chunks = [
        sorted(pairs[start:start + chunk_size], key=by_value)
        for start in range(0, len(pairs), chunk_size)
    ]

    merger = HeapOnKey(key=by_value, min_heap=True)
    merged = list(merger.merge(chunks))

    expected = sorted(pairs, key=by_value)
    assert merged == expected
def _merge_sorted(self, iters):
    """Merge ``iters`` on their first field and wrap them for grouped iteration.

    The iterators are handed to ``HeapOnKey.merge`` (keyed on ``item[0]``,
    ``min_heap=True``); the merged stream is returned inside a
    ``GroupByNestedIter`` constructed with ``self.call_site``.
    """
    def first_field(item):
        return item[0]

    merger = HeapOnKey(key=first_field, min_heap=True)
    merged = merger.merge(iters)
    return GroupByNestedIter(merged, self.call_site)
def _merge_sorted(self, iters):
    """Merge ``iters`` on their first field and aggregate the merged stream.

    The iterators go through ``HeapOnKey.merge`` (keyed on ``item[0]``,
    ``min_heap=True``) and the result is passed to
    ``self.aggregator.aggregate_sorted``, whose return value is returned.
    """
    key_of = lambda item: item[0]
    merged_stream = HeapOnKey(key=key_of, min_heap=True).merge(iters)
    return self.aggregator.aggregate_sorted(merged_stream)
def merge(self, iters):
    """Store the grouped merge of ``iters`` in ``self.combined``.

    The iterators are merged with ``HeapOnKey.merge`` (keyed on
    ``item[0]``, ``min_heap=True``) and wrapped in a ``GroupByNestedIter``
    constructed with ``self.rdd_name``.  Nothing is returned.
    """
    def first_field(item):
        return item[0]

    merger = HeapOnKey(key=first_field, min_heap=True)
    merged_stream = merger.merge(iters)
    self.combined = GroupByNestedIter(merged_stream, self.rdd_name)
def merge(self, iters):
    """Store the aggregated merge of ``iters`` in ``self.combined``.

    The iterators are merged with ``HeapOnKey.merge`` (keyed on
    ``item[0]``, ``min_heap=True``) and the merged stream is passed to
    ``self.aggregator.aggregate_sorted``.  Nothing is returned.
    """
    key_of = lambda item: item[0]
    merger = HeapOnKey(key=key_of, min_heap=True)
    merged_stream = merger.merge(iters)
    self.combined = self.aggregator.aggregate_sorted(merged_stream)