def group_by_every_record(pvalue, **options):
    """
    Shuffle ``pvalue`` with ``distribute_every()`` and return it as a PTable.

    The node behind ``pvalue`` is shuffled within its own scope, the first
    node of the resulting shuffle group is marked with ``distribute_every()``,
    and the outcome is wrapped in a ``PCollection`` nested inside a ``PTable``
    whose key serde is ``serde.StrSerde()``.

    Args:
        pvalue: object exposing ``pipeline()`` and ``node()``.
        **options: accepted, but not read anywhere in this function.

    Returns:
        ptable.PTable: table wrapping the shuffled PCollection.
    """
    owner_pipeline = pvalue.pipeline()
    source_node = pvalue.node()

    # Shuffle only this node, inside the scope it already lives in.
    shuffle_group = source_node.plan().shuffle(source_node.scope(), [source_node])

    # NOTE(review): judging by the name, distribute_every() presumably places
    # every record in its own group -- confirm against the logical plan API.
    distributed_node = shuffle_group.node(0).distribute_every()

    from bigflow import serde
    str_key_serde = serde.StrSerde()

    records = pcollection.PCollection(distributed_node, owner_pipeline)
    return ptable.PTable(records, key_serde=str_key_serde)
def test_optional_join(self):
    """
    Check schema.join, left_join, right_join and full_join on 'websites'.

    ``sp1`` holds the raw records for 'a' and 'b'. ``sp2`` holds 'a' and 'c'
    with ``clicknum`` replaced by a per-group count. Every expected pair is
    ordered as (record from sp2, record from sp1); the side without a match
    is expected to have all fields set to None.
    """
    def row(websites, clicknum):
        # Build one expected record.
        return {'clicknum': clicknum, 'websites': websites}

    def count_clicks(record):
        # Keep the grouping key and replace clicknum with its count.
        return {'websites': record['websites'],
                'clicknum': record['clicknum'].count()}

    # Right-hand input: fields declared together with explicit serdes.
    sp1 = self._pipeline.parallelize([('a', 1), ('b', 2)])
    sp1 = sp1.apply(
        schema.tuple_to_dict,
        [('websites', serde.StrSerde()), ('clicknum', serde.IntSerde())])

    # Left-hand input: group by website, aggregate, then flatten back.
    sp2 = self._pipeline.parallelize([('a', 2), ('c', 3)])
    sp2 = sp2.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
    sp2 = sp2.apply(schema.group_by, ['websites'])
    sp2 = sp2.apply_values(schema.agg, count_clicks)
    sp2 = sp2.apply(schema.flatten)

    # Inner join: only 'a' is present on both sides.
    jsp = sp2.apply(schema.join, sp1, fields=['websites'])
    expect = [(row('a', 1), row('a', 1))]
    self.passertEqual(expect, jsp)

    # Left join: 'c' from sp2 is kept and paired with an all-None record.
    jsp = sp2.apply(schema.left_join, sp1, fields=['websites'])
    expect = [
        (row('a', 1), row('a', 1)),
        (row('c', 1), row(None, None)),
    ]
    self.passertEqual(expect, jsp)

    # Right join: 'b' from sp1 is kept and paired with an all-None record.
    jsp = sp2.apply(schema.right_join, sp1, fields=['websites'])
    expect = [
        (row(None, None), row('b', 2)),
        (row('a', 1), row('a', 1)),
    ]
    self.passertEqual(expect, jsp)

    # Full join: unmatched records from both sides are kept.
    jsp = sp2.apply(schema.full_join, sp1, fields=['websites'])
    expect = [
        (row(None, None), row('b', 2)),
        (row('a', 1), row('a', 1)),
        (row('c', 1), row(None, None)),
    ]
    self.passertEqual(expect, jsp)