示例#1
0
def group_by_every_record(pvalue, **options):
    """
    Group by every record.

    Shuffles the node behind ``pvalue`` on its own, calls
    ``distribute_every()`` on the first shuffle node, and wraps the result
    as a PTable whose key serde is ``serde.StrSerde``.

    Args:
      pvalue: object exposing ``pipeline()`` and ``node()``.
      **options: accepted but not used inside this function.

    Returns:
      ptable.PTable: built over a ``pcollection.PCollection`` that wraps the
      shuffled node and the pipeline of ``pvalue``.
    """

    owning_pipeline = pvalue.pipeline()
    source_node = pvalue.node()

    # The shuffle is created in the source node's scope and contains only
    # that node, so index 0 is the shuffle node derived from it.
    shuffle_group = source_node.plan().shuffle(source_node.scope(), [source_node])
    distributed_node = shuffle_group.node(0).distribute_every()

    from bigflow import serde
    table_key_serde = serde.StrSerde()
    return ptable.PTable(
        pcollection.PCollection(distributed_node, owning_pipeline),
        key_serde=table_key_serde,
    )
示例#2
0
    def test_optional_join(self):
        """
        Check schema.join / left_join / right_join / full_join on 'websites'.

        The collection the join is applied on holds rows ('a', 2) and
        ('c', 3), grouped by 'websites' with 'clicknum' replaced by its
        count(). The collection passed as the join argument holds rows
        ('a', 1) and ('b', 2), converted to dicts with explicit serdes.
        Each expected item is a pair (row from the first collection, row
        from the second); a side without a match has None in every field.
        """
        # Second operand of every join: dict rows with explicit field serdes.
        plain_rows = self._pipeline.parallelize([('a', 1), ('b', 2)])
        sp1 = plain_rows.apply(
            schema.tuple_to_dict,
            [('websites', serde.StrSerde()), ('clicknum', serde.IntSerde())])

        # First operand: one row per 'websites' value carrying its count.
        counted_rows = self._pipeline.parallelize([('a', 2), ('c', 3)])
        counted_rows = counted_rows.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        counted_rows = counted_rows.apply(schema.group_by, ['websites'])
        counted_rows = counted_rows.apply_values(
            schema.agg,
            lambda record: {'websites': record['websites'],
                            'clicknum': record['clicknum'].count()})
        sp2 = counted_rows.apply(schema.flatten)

        # join: only 'a' exists on both sides.
        inner_joined = sp2.apply(schema.join, sp1, fields=['websites'])
        inner_expected = [
            ({'clicknum': 1, 'websites': 'a'}, {'clicknum': 1, 'websites': 'a'}),
        ]
        self.passertEqual(inner_expected, inner_joined)

        # left_join: 'c' has no partner in sp1.
        left_joined = sp2.apply(schema.left_join, sp1, fields=['websites'])
        left_expected = [
            ({'clicknum': 1, 'websites': 'a'}, {'clicknum': 1, 'websites': 'a'}),
            ({'clicknum': 1, 'websites': 'c'}, {'clicknum': None, 'websites': None}),
        ]
        self.passertEqual(left_expected, left_joined)

        # right_join: 'b' has no partner in sp2.
        right_joined = sp2.apply(schema.right_join, sp1, fields=['websites'])
        right_expected = [
            ({'clicknum': None, 'websites': None}, {'clicknum': 2, 'websites': 'b'}),
            ({'clicknum': 1, 'websites': 'a'}, {'clicknum': 1, 'websites': 'a'}),
        ]
        self.passertEqual(right_expected, right_joined)

        # full_join: rows from both sides, matched or not.
        full_joined = sp2.apply(schema.full_join, sp1, fields=['websites'])
        full_expected = [
            ({'clicknum': None, 'websites': None}, {'clicknum': 2, 'websites': 'b'}),
            ({'clicknum': 1, 'websites': 'a'}, {'clicknum': 1, 'websites': 'a'}),
            ({'clicknum': 1, 'websites': 'c'}, {'clicknum': None, 'websites': None}),
        ]
        self.passertEqual(full_expected, full_joined)