示例#1
0
    def test_key_value_serde(self):
        """Key serde is taken from element 0, value serde from element 1."""
        cases = [
            (int, serde._key_serde, [int, str]),
            (str, serde._key_serde, (str, int)),
            (int, serde._value_serde, (str, int)),
            (int, serde._value_serde, [str, int]),
        ]
        for expected, extract, spec in cases:
            self.serde_eq(expected, extract(serde.of(spec), None))
示例#2
0
    def test_key_value_serde(self):
        """_key_serde/_value_serde pick the first/second element serde."""
        def key_of(spec):
            return serde._key_serde(serde.of(spec), None)

        def value_of(spec):
            return serde._value_serde(serde.of(spec), None)

        self.serde_eq(int, key_of([int, str]))
        self.serde_eq(str, key_of((str, int)))
        self.serde_eq(int, value_of((str, int)))
        self.serde_eq(int, value_of([str, int]))
示例#3
0
def cogroup(*pcollections, **kargs):
    """Cogroup the given PCollections on their keys, returning a PTable.

    Args:
      *pcollections: one or more PCollection inputs.
      **kargs: options — 'key_serde' (serde for the shuffle key),
        'value_serdes' (per-input value serdes), 'concurrency'.

    Returns:
      PTable: a tuple-backed PTable with one grouped node per input.

    Raises:
      ValueError: if no inputs are given or any input is not a PCollection.
    """
    from bigflow import serde

    if not pcollections:
        raise ValueError("No argument")
    # Validate before calling any methods on the inputs, so a wrong type
    # surfaces as ValueError rather than an AttributeError.
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")

    pipeline = pcollections[0].pipeline()
    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(pcollections[0].serde(),
                                     pipeline.default_objector())

    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [serde._value_serde(p.serde(),
                                           pipeline.default_objector())
                        for p in pcollections]

    def _make_shuffle(node, value_serde):
        # Match records by key, then strip the key to keep only the value.
        return pcollection.PCollection(node.match_by(KeyReader(None, key_serde)), pipeline)\
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = pcollections[0].node().plan()
    scope = pcollections[0].node().scope()

    # Materialize to a list: `nodes` is both iterated (concurrency estimate)
    # and indexed below; a lazy map() iterator (Python 3) would be exhausted
    # after the first pass and has no len().
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)

    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        concurrency = sum(node.size()
                          for node in nodes) / pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)

    results = [_make_shuffle(shuffle.node(i), value_serdes[i])
               for i in range(len(nodes))]

    return ptable.PTable(tuple(results), key_serde=key_serde)
示例#4
0
def cogroup(*pcollections, **kargs):
    """Cogroup the given PCollections on their keys, returning a PTable.

    Args:
      *pcollections: one or more PCollection inputs.
      **kargs: options — 'key_serde', 'value_serdes', 'concurrency'.

    Returns:
      PTable: a tuple-backed PTable with one grouped node per input.

    Raises:
      ValueError: if no inputs are given or any input is not a PCollection.
    """
    from bigflow import serde

    if not pcollections:
        raise ValueError("No argument")
    # Type-check up front so bad inputs fail with ValueError instead of an
    # AttributeError from the method calls below.
    if not all(isinstance(p, pcollection.PCollection) for p in pcollections):
        raise ValueError("cogroup only applied on PCollections")

    pipeline = pcollections[0].pipeline()
    key_serde = kargs.get('key_serde', None)
    if key_serde is None:
        key_serde = serde._key_serde(pcollections[0].serde(), pipeline.default_objector())

    value_serdes = kargs.get('value_serdes', None)
    if value_serdes is None:
        value_serdes = [serde._value_serde(p.serde(), pipeline.default_objector())
                        for p in pcollections]

    def _make_shuffle(node, value_serde):
        # Match records by key, then drop the key and keep only the value.
        return pcollection.PCollection(node.match_by(KeyReader(None, key_serde)), pipeline)\
            .map(entity.ExtractValueFn(), serde=value_serde, scale=0.8)

    plan = pcollections[0].node().plan()
    scope = pcollections[0].node().scope()

    # Use a list, not map(): `nodes` is iterated for the concurrency estimate
    # and then indexed, which a one-shot Python 3 map() iterator cannot do.
    nodes = [p.node() for p in pcollections]
    shuffle = plan.shuffle(scope, nodes)

    if 'concurrency' in kargs:
        shuffle.with_concurrency(kargs['concurrency'])
    elif pipeline.estimate_concurrency:
        concurrency = sum(node.size() for node in nodes) / pipeline.size_per_concurrency
        shuffle.with_concurrency(concurrency)

    results = [_make_shuffle(shuffle.node(i), value_serdes[i])
               for i in range(len(nodes))]

    return ptable.PTable(tuple(results), key_serde=key_serde)
示例#5
0
def group_by_key(pcollection, **options):
    """Group the input PCollection with the default key/value extraction
    functions, returning a PTable representing the grouping.

    Args:
      pcollection (PCollection): input PCollection
      **options: configurable options

    Returns:
      PTable: grouping result

    >>> _p = _pipeline.parallelize([("A", 4), ("A", 3), ("B", 2), ("A", 1)])
    >>> transforms.group_by_key(_p).get()
    {"A": [4, 3, 1], "B": [2]}

    """
    import bigflow.transform_impls.functions
    from bigflow import serde

    pipeline = pcollection.pipeline()
    objector = pipeline.default_objector()
    input_serde = pcollection.serde()

    # Fill in serde defaults only when the caller has not supplied them.
    if options.get('key_serde') is None:
        options['key_serde'] = serde._key_serde(input_serde, objector)
    if options.get('value_serde') is None:
        options['value_serde'] = serde._value_serde(input_serde, objector)

    # key_extractor=None selects the default key extractor.
    return group_by(pcollection,
                    key_extractor=None,
                    value_extractor=bigflow.core.entity.ExtractValueFn(),
                    **options)
示例#6
0
def group_by_key(pcollection, **options):
    """Group the input PCollection using default key/value extractors and
    return a PTable that represents the grouping.

    Args:
      pcollection (PCollection): input PCollection
      **options: configurable options

    Returns:
      PTable: grouping result

    >>> _p = _pipeline.parallelize([("A", 4), ("A", 3), ("B", 2), ("A", 1)])
    >>> transforms.group_by_key(_p).get()
    {"A": [4, 3, 1], "B": [2]}

    """
    import bigflow.transform_impls.functions
    from bigflow import serde

    pipeline = pcollection.pipeline()

    def _derived(extract):
        # Derive a default serde from the input's serde.
        return extract(pcollection.serde(), pipeline.default_objector())

    key_serde = options.get('key_serde', None)
    value_serde = options.get('value_serde', None)

    options['key_serde'] = (key_serde if key_serde is not None
                            else _derived(serde._key_serde))
    options['value_serde'] = (value_serde if value_serde is not None
                              else _derived(serde._value_serde))

    # Passing key_extractor=None makes group_by use its default extractor.
    return group_by(pcollection,
                    key_extractor=None,
                    value_extractor=bigflow.core.entity.ExtractValueFn(),
                    **options)