Пример #1
0
    def groupBy(self, keys, *fields, **kw):
        """Group rows by ``keys`` and aggregate ``fields`` within each group.

        ``keys`` is a single key expression or a list/tuple of them.  Each
        positional ``fields`` entry is an aggregate whose output column name
        comes from ``self._create_field_name``; each remaining keyword
        argument is an aggregate whose output column name is the keyword.
        The optional keyword ``numSplits`` is removed from ``kw`` and passed
        through to ``combineByKey``.

        Returns whatever ``asTable`` returns for rows of the form
        ``key_tuple + aggregated_tuple`` with columns ``key_names + values``.

        An empty ``keys`` list (one global group) and a call without any
        aggregates (distinct keys only) are both supported.

        NOTE(review): the key and reducer expressions are compiled with
        ``eval``; they must never be built from untrusted input.
        """
        numSplits = kw.pop('numSplits', None)

        def tuple_lambda(params, parts):
            # Source of a lambda returning a tuple of ``parts``.  With no
            # parts, '(%s,)' would render as '(,)', which is a syntax error,
            # so the empty tuple is spelled '()' explicitly.
            body = '(%s,)' % ','.join(parts) if parts else '()'
            return 'lambda %s:%s' % (params, body)

        if not isinstance(keys, (list, tuple)):
            keys = [keys]
        key_names = [self._create_field_name(e) for e in keys]
        key_exprs = [self._create_expression(e) for e in keys]
        gen_key = eval(tuple_lambda('_v', key_exprs))

        # Output columns: positional aggregates first, then keyword ones.
        values = [self._create_field_name(e) for e in fields] + list(kw)
        # Register positional aggregates under their generated names so that
        # every output column can be looked up uniformly in ``kw``.
        kw.update(zip(values, fields))
        # Each reducer is indexed 0..3: create (from _v), merge (_x, _v),
        # combine (_x, _y) and final map (_x) expression sources.
        codes = [self._create_reducer(i, kw[n]) for i, n in enumerate(values)]
        creater = eval(tuple_lambda('_v', [c[0] for c in codes]))
        merger = eval(tuple_lambda('_x, _v', [c[1] for c in codes]))
        combiner = eval(tuple_lambda('_x, _y', [c[2] for c in codes]))
        mapper = eval(tuple_lambda('_x', [c[3] for c in codes]))

        agg = Aggregator(creater, merger, combiner)
        g = self.prev.map(lambda v: (gen_key(v), v)).combineByKey(agg, numSplits)
        return g.map(lambda k_v1: k_v1[0] + mapper(k_v1[1])).asTable(key_names + values, self.name)
Пример #2
0
 def combineByKey(self, createCombiner, mergeValue, mergeCombiner,
                  partitioner):
     """Return a ShuffledDStream over this stream for the given combiners.

     The three callables are bundled into an ``Aggregator`` in the order
     (createCombiner, mergeValue, mergeCombiner) and handed, together with
     ``partitioner``, to ``ShuffledDStream``.
     """
     return ShuffledDStream(
         self,
         Aggregator(createCombiner, mergeValue, mergeCombiner),
         partitioner,
     )