Пример #1
0
def group_by(pcollection, fields, **options):
    """
    Group a SchemaPCollection by the given fields.

    Args:
        pcollection (SchemaPCollection): input dataset. A SchemaPCollection is a
            structured PCollection with named fields and can be used like a
            PCollection whose elements are dicts.
        fields (Iterable): the grouping fields. For dict-style records a str is
            stripped of all whitespace and split on ","; a list/tuple is used
            as-is. For tuple-style records (tuple serde) ``fields`` is used
            as-is to index into every record.
        **options: forwarded unchanged to ``pcollection.group_by``.

    Returns:
        The result of ``pcollection.group_by`` with every value wrapped as a
        SchemaPCollection. For dict-style records each key is a dict holding
        the grouping fields; for tuple-style records each key is a tuple.
        Each value contains all the columns.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>       .apply_values(transforms.first)
        >>>     .apply(schema.flatten)
        >>> print analytics.get()
        Output:
        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1}, {'website': 'c', 'clicknum': 1},
         {'website': 'd', 'clicknum': 1}]

    """
    pcollection = _transform_schema_pcollection(pcollection)

    # Imported lazily inside the function, as everywhere else in this module.
    from bigflow import schema_pcollection

    def _as_schema(values):
        """ Internal helper: wrap one group's values as a SchemaPCollection. """
        return schema_pcollection.SchemaPCollection(values)

    if _is_tuple_serde(pcollection.serde()):
        # Tuple-style records: the key is the tuple of the selected positions.
        element_serdes = _origin_serde(pcollection.serde())
        tuple_key_serde = serde.of(
            tuple([element_serdes[index] for index in fields]))

        def _tuple_key(record):
            """ Internal helper: pick the grouping positions of a tuple. """
            return tuple(record[index] for index in fields)

        grouped = pcollection.group_by(
            _tuple_key, key_serde=tuple_key_serde, **options)
        return grouped.apply_values(_as_schema)

    # Dict-style records: a comma separated string is normalised to a list.
    if isinstance(fields, str):
        compact = ''.join(fields.split())
        fields = compact.split(',')

    def _dict_key(record):
        """ Internal helper: pick the grouping fields of a dict. """
        return {name: record[name] for name in fields}

    dict_key_serde = _get_serde_of_fields(
        pcollection.serde(), fields,
        pcollection.pipeline().default_objector())
    grouped = pcollection.group_by(
        _dict_key, key_serde=dict_key_serde, **options)
    return grouped.apply_values(_as_schema)
Пример #2
0
def _ret_dict_handler(ret_dict, record_val):
    """
    Internal helper: turn the dict returned by a user function into a
    SchemaPCollection.

    Values of ``ret_dict`` that are ``ptype.PType`` instances are combined
    with ``transforms.cartesian``; every other value is treated as a constant
    that is attached to each output record.

    Args:
        ret_dict (dict): output field name -> PType or plain Python value.
        record_val: dataset that is mapped over when ``ret_dict`` holds only
            plain values.

    Returns:
        The SchemaPCollection built from ``ret_dict``.
    """
    dataset_schema = []   # (field name, serde) for every PType value
    datasets = []
    constant_schema = []  # (field name, python type) for every plain value
    constants = []

    for name, item in ret_dict.items():
        if isinstance(item, ptype.PType):
            dataset_schema.append((name, item.serde()))
            datasets.append(item)
        else:
            constant_schema.append((name, type(item)))
            constants.append(item)

    if constants and not datasets:
        # Only plain values: emit the same constant dict for every record.
        from bigflow import schema_pcollection
        constant_names = tuple(pair[0] for pair in constant_schema)
        return schema_pcollection.SchemaPCollection(
            record_val.apply(
                transforms.map,
                lambda record: dict(zip(constant_names, constants)),
                serde=of(dict(constant_schema))))

    product = transforms.cartesian(*datasets)
    if constants:
        # Mixed case: append the constants after the cartesian product columns.
        trailing = tuple(constants)
        dataset_schema.extend(constant_schema)
        product = product.apply(transforms.map,
                                lambda record: record + trailing)
    return tuple_to_dict(product, dataset_schema)
Пример #3
0
def tuple_to_dict(pcollection, fields):
    """
    Convert a PCollection of tuples into a SchemaPCollection.

    Args:
        pcollection (PCollection): input PCollection whose elements are tuples.
        fields (list): one entry per tuple position. An entry is either a
            tuple ``(name, serde)`` giving the field name and the serde used
            for it, or a bare field name, in which case ``serde._`` is used
            (per the original documentation this means the value may be any
            marshal-serializable type). ``fields`` is first passed through
            ``_str_to_list``.

    Returns:
        SchemaPCollection: a structured PCollection with named fields
        (FieldsDict) that supports all ordinary PCollection operations.

    """
    fields = _str_to_list(fields)
    from bigflow import schema_pcollection

    names = []
    serde_by_name = {}
    for spec in fields:
        if isinstance(spec, tuple):
            name, field_serde = spec[0], spec[1]
        else:
            name, field_serde = spec, serde._
        names.append(name)
        serde_by_name[name] = field_serde

    # Pair every tuple position with its field name, in declaration order.
    as_dicts = pcollection.map(lambda record: dict(zip(names, record)),
                               serde=of(serde_by_name))
    return schema_pcollection.SchemaPCollection(as_dicts)
Пример #4
0
def agg(pcollection, agg_fn, *args):
    """
    Aggregate all the data of a PCollection.

    Args:
        pcollection (SchemaPCollection): input dataset. A SchemaPCollection is a
            structured PCollection with named fields and can be used like a
            PCollection whose elements are dicts.
        agg_fn (callable): describes how the fields are aggregated. It receives
            one dict whose keys are all field names and whose values are
            PCollections holding all data of that field. It must return a dict
            whose keys are the output fields and whose values are distributed
            datasets (PCollection or PObject). The datasets of the output
            fields are combined by cartesian product into the result.
            For tuple-style input (tuple serde) the columns are passed to
            ``agg_fn`` as positional arguments instead.
        *args (object): extra arguments passed on to ``agg_fn``.

    Returns:
        SchemaPCollection: every element is a dict; the elements are the
        cartesian product of the datasets returned by ``agg_fn``, labelled
        with the output field names.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>     .apply_values(schema.agg, lambda cols: {
        >>>         'max_click_num': cols['clicknum'].max(),
        >>>         'sum_click_num': cols['clicknum'].sum(),
        >>>         'avg_click_num': cols['clicknum'].sum() / cols['clicknum'].count()
        >>>     }).apply(schema.flatten)
        >>> print analytics.get()
        Output:
        [{'sum_click_num': 7, 'website': 'a', 'avg_click_num': 1, 'max_click_num': 3},
         {'sum_click_num': 6, 'website': 'c', 'avg_click_num': 2, 'max_click_num': 3},
         {'sum_click_num': 5, 'website': 'b', 'avg_click_num': 1, 'max_click_num': 2},
         {'sum_click_num': 1, 'website': 'd', 'avg_click_num': 1, 'max_click_num': 1}]

    """
    pcollection = _transform_schema_pcollection(pcollection)

    if _is_tuple_serde(pcollection.serde()):
        # Tuple-style records: columns and extra args are passed positionally.
        from bigflow import schema_pcollection
        call_args = list(pcollection.apply(_select_cols_tp)) + list(args)
        outcome = agg_fn(*call_args)
        return schema_pcollection.SchemaPCollection(
            _ret_tuple_handler(outcome, call_args[0]))

    # Dict-style records: _select_cols yields one entry per field followed by
    # one trailing entry that is handed to _ret_dict_handler as record_val.
    agg_fields = pcollection._get_fields()
    selected = pcollection.apply(_select_cols, agg_fields)
    whole_records = selected[-1]
    columns = dict(zip(agg_fields, selected[:-1]))
    return _ret_dict_handler(agg_fn(columns, *args), whole_records)
Пример #5
0
def full_join(*pcollections, **options):
    """
    Full-join several SchemaPCollections on the given fields.

    The inputs are joined with ``transforms.full_join``. As the example below
    shows, a side that has no matching record is represented by a dict whose
    values are all None.

    Args:
        *pcollections (SchemaPCollection): the SchemaPCollections to join; at
            least two are required.
        **options: configuration.
            fields (str/tuple/list): the join fields, given as a list, a tuple
                or a comma separated string.
            merge (bool): defaults to False. When true the joined result is
                additionally passed through ``_merge_result_after_join``.

    Returns:
        SchemaPCollection: the join result.

    Raises:
        ValueError: if fewer than two pcollections are given.

    Examples:
        >>> from bigflow import base, schema, transforms
        >>> p = base.Pipeline.create('local')
        >>> p1 = p.parallelize([('a', 2), ('e', 4), ('c', 6)])
        >>> sp1 = p1.apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> p2 = p.parallelize([('a', 9), ('b', 8), ('d', 7)])
        >>> sp2 = p2.apply(schema.tuple_to_dict, ['websites', 'click'])
        >>> csp = sp1.apply(schema.full_join, sp2, fields=['websites'])
        >>> csp.get()
        Output:
        [({'clicknum': 4, 'websites': 'e'}, {'click': None, 'websites': None}),
         ({'clicknum': None, 'websites': None}, {'click': 8, 'websites': 'b'}),
         ({'clicknum': None, 'websites': None}, {'click': 7, 'websites': 'd'}),
         ({'clicknum': 2, 'websites': 'a'}, {'click': 9, 'websites': 'a'}),
         ({'clicknum': 6, 'websites': 'c'}, {'click': None, 'websites': None})]

    """
    if len(pcollections) <= 1:
        raise ValueError("require at least 2 pcollections")

    join_fields = options.get('fields', None)
    from bigflow import schema_pcollection

    joined = transforms.full_join(*_check_set_args(join_fields, pcollections))
    placeholders = _get_none_dict(pcollections)
    filled = joined.apply(transforms.map,
                          lambda tp: _none_to_dict(tp, placeholders),
                          serde=_value_serde(joined.serde()))
    result = schema_pcollection.SchemaPCollection(filled)

    # By default (merge=False) the joined result is returned unmerged.
    if options.get("merge", False):
        return _merge_result_after_join(result, "full_join")
    return result
Пример #6
0
 def _trans_to_sp(*records):
     """ Internal helper: wrap each argument in a SchemaPCollection and return them as a tuple. """
     from bigflow import schema_pcollection
     wrap = schema_pcollection.SchemaPCollection
     return tuple(map(wrap, records))
Пример #7
0
def flatten(ptype):
    """
    Attach the key(s) to every value of a PTable and flatten the result into
    a SchemaPCollection.

    Args:
        ptype (PTable): input dataset; must be a PTable whose keys and values
            are dicts.

    Returns:
        SchemaPCollection: contains all fields of the keys and of the value.
        When a key and the value share a field name, the value wins.

    Raises:
        ValueError: if ``ptype`` is not a PTable.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     }).apply(schema.group_by, ['website'])
        >>>     .apply(schema.flatten)
        >>> print analytics.get()
        Output:
        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1}, {'website': 'c', 'clicknum': 1},
         {'website': 'b', 'clicknum': 2}, {'website': 'c', 'clicknum': 2}, {'website': 'a', 'clicknum': 3},
         {'website': 'c', 'clicknum': 3}, {'website': 'a', 'clicknum': 2}, {'website': 'b', 'clicknum': 2},
         {'website': 'a', 'clicknum': 1}, {'website': 'd', 'clicknum': 1}]
    """
    def _fold_levels(nested, depth):
        """ Internal helper: merge (k1, (k2, ... value)) into a single dict. """
        merged = {}
        rest = nested
        for _ in xrange(depth):
            merged.update(rest[0])
            rest = rest[1]
        # The innermost element is the value; applied last so it wins.
        merged.update(rest)
        return merged

    def _combine_serdes(serde_list, fallback=None):
        """ Internal helper: merge field serdes, or return fallback if any serde is not a FieldsDictSerde. """
        field_serdes = {}
        for item in serde_list:
            origin = serde.origin(item)
            if not isinstance(origin, FieldsDictSerde):
                return fallback
            field_serdes.update(origin._fields_to_types.iteritems())
        return of(field_serdes)

    if not isinstance(ptype, ptable.PTable):
        raise ValueError("flatten should only be applied on PTable")

    # One key per nesting level, plus the outermost key.
    level = ptype.nested_level() + 1
    from bigflow import schema_pcollection
    flat = ptype.flatten()
    record_serde = _combine_serdes(ptype.key_serdes() + [ptype.serde()],
                                   ptype.pipeline().default_objector())
    return schema_pcollection.SchemaPCollection(
        flat.map(lambda kv: _fold_levels(kv, level), serde=record_serde))
Пример #8
0
def select(pcollection, select_fn, *args):
    """
    Pick some fields of every record and transform them.

    Args:
        pcollection (SchemaPCollection): input dataset. A SchemaPCollection is a
            structured PCollection with named fields and can be used like a
            PCollection whose elements are dicts.
        select_fn: what to do with every record. One of:

            * callable: receives one dict whose keys are all field names and
              whose values are PObjects holding that field of the current
              record. It must return a dict whose keys are the output fields
              and whose values are distributed datasets (PCollection or
              PObject). The datasets of the output fields are combined by
              cartesian product. For tuple-style input (tuple serde) the
              columns are passed as positional arguments instead.
            * dict: maps a field name to a callable (applied to that column),
              to a tuple (applied as ``column.apply(*the_tuple)``), or to any
              other value, which is used directly. Only the listed fields are
              returned. ``*args`` is not forwarded in this form.
            * list/tuple/str: field names to project; every record is mapped
              to a dict of just these fields using ``record.get``.
        *args (object): extra arguments for the transformation.

    Returns:
        SchemaPCollection: every element is a dict. Conceptually ``select_fn``
        is applied to every input record, the datasets it returns are combined
        by cartesian product, and the results of all records are concatenated.
        For the list/tuple/str form the result of ``pcollection.map`` is
        returned directly.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>>     .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>>     .apply(schema.select, lambda cols: {
        >>>         'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>         'clicknum': cols['clicknum']
        >>>     })
        >>> print analytics.get()
        Output:
        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1}, {'website': 'c', 'clicknum': 1},
         {'website': 'b', 'clicknum': 2}, {'website': 'c', 'clicknum': 2}, {'website': 'a', 'clicknum': 3},
         {'website': 'c', 'clicknum': 3}, {'website': 'a', 'clicknum': 2}, {'website': 'b', 'clicknum': 2},
         {'website': 'a', 'clicknum': 1}, {'website': 'd', 'clicknum': 1}]

        >>> from bigflow import schema
        >>> from bigflow import transforms
        >>> from bigflow import base
        >>> pl = base.Pipeline.create("local")
        >>> raw_data = [["xiaoming", "school_1", 12, 150, 90], ]
        >>> data = pl.parallelize(raw_data)
        >>>         .apply(schema.tuple_to_dict,
        >>>             [("name", str),
        >>>              ("school", str),
        >>>              ("age", int),
        >>>              ("height", int),
        >>>              ("weight", int)])
        >>> fields = {
        >>>     # reuse a bigflow transform: pass a tuple of (transform, user function)
        >>>     "name": (transforms.map, lambda name: "My name is " + name),
        >>>     # provide a plain transformation function
        >>>     "school": lambda school: "My school is " + school,
        >>> }
        >>> output = schema.select(data, fields)
        >>> print output.get()
        Output:
        [{'school': 'My school is school_1', 'name': 'My name is xiaoming'}]
    """

    # Form 1: plain projection by field name.
    if isinstance(select_fn, (list, tuple, str)):
        wanted = _str_to_list(select_fn)
        return pcollection.map(
            lambda record: dict((name, record.get(name)) for name in wanted),
            serde=of(dict((name, serde._) for name in wanted)))

    pcollection = _transform_schema_pcollection(pcollection)

    # Form 2: per-field transforms; rewritten into a callable and re-dispatched.
    if isinstance(select_fn, dict):

        def _apply_per_field(cols, *args):
            """ only return columns which user select """
            picked = {}
            for name, transform in select_fn.items():
                if callable(transform):
                    picked[name] = cols[name].apply(transform)
                elif isinstance(transform, tuple):
                    picked[name] = cols[name].apply(*transform)
                else:
                    picked[name] = transform
            return picked

        return select(pcollection, _apply_per_field)

    def _one_group_per_record():
        """ Internal helper: put every record in its own group and keep it as a single value. """
        per_record = pcollection.apply(
            group_by_every_record.group_by_every_record)
        return per_record.apply_values(transforms.first)

    # Form 3a: callable on tuple-style records.
    if _is_tuple_serde(pcollection.serde()):

        def _run_on_tuple(*val):
            """ Internal helper: call select_fn with the columns as positional arguments. """
            return _ret_tuple_handler(select_fn(*val), val[0])

        from bigflow import schema_pcollection
        columns = _one_group_per_record().apply_values(_select_cols_tp)
        return schema_pcollection.SchemaPCollection(
            columns.apply_values(_run_on_tuple, *args).flatten_values())

    # Form 3b: callable on dict-style records.
    select_fields = pcollection._get_fields()

    def _run_on_dict(*val):
        """ Internal helper: call select_fn with a dict of columns plus any extra arguments. """
        # val = one column per field, then the whole record, then extra args.
        field_count = len(select_fields)
        whole_record = val[field_count]
        named_columns = dict(zip(select_fields, val[:field_count]))
        extras = val[field_count + 1:]
        return _ret_dict_handler(select_fn(named_columns, *extras),
                                 whole_record)

    from bigflow import schema_pcollection
    columns = _one_group_per_record().apply_values(_select_cols,
                                                   select_fields)
    return schema_pcollection.SchemaPCollection(
        columns.apply_values(_run_on_dict, *args).flatten_values())
Пример #9
0
def _transform_schema_pcollection(pcollection):
    """ Internal helper: wrap the given pcollection in a SchemaPCollection. """
    from bigflow import schema_pcollection
    wrapper_cls = schema_pcollection.SchemaPCollection
    return wrapper_cls(pcollection)
Пример #10
0
    def transform_from_node(self, load_node, pipeline):
        """
        Internal API: build the dataset for a load node according to
        ``self.fields``.

        The parent class result is split into typed columns with
        ``entity.SplitStringToTypes`` using ``self.sep``:

        * ``self.fields`` is a list (a tuple is first converted to a list and
          stored back on ``self``): every entry is a ``(name, type)`` tuple, a
          bare type, or a field name (treated as ``str``). Types must be one
          of int/str/float. If the first entry is a bare type the tuple
          dataset is returned as-is, otherwise it is converted with
          ``schema.tuple_to_dict``.
        * ``self.fields`` is an int: that many ``str`` columns are produced,
          overflow is always ignored, and the result is wrapped in a
          SchemaPCollection.

        Raises:
            ValueError: if ``self.fields`` is None, has an unsupported type,
                or contains an unsupported entry.
        """
        from bigflow import schema
        if self.fields is None:
            raise ValueError('''columns is necessary,(1) columns(list),
                each item in columns is string, SchemaPCollection's element
                is dict, (2) columns(int),SchemaPCollection's element is tuple. eg.
                columns=3 or columns=[(xx, int), (yy, str)] or columns=[xx, yy],
                (3) columns(list), each item in columns is base type in [int, float, str]'''
                             )

        if isinstance(self.fields, tuple):
            self.fields = list(self.fields)

        skip_overflow = self.ignore_overflow
        skip_illegal_line = self.ignore_illegal_line

        if isinstance(self.fields, list):
            # Resolve the python type of every declared column.
            column_types = []
            for column in self.fields:
                if isinstance(column, tuple):
                    if column[1] not in [int, str, float]:
                        raise ValueError(
                            '''columns is list(field name or data type),
                                             data type(int/str/float)''')
                    column_types.append(column[1])
                elif column in [int, str, float]:
                    column_types.append(column)
                elif isinstance(column, str):
                    column_types.append(str)
                else:
                    raise ValueError(
                        '''columns is list(field name or data type),
                                         data type(int/str/float)''')

            loaded = super(SchemaTextFile, self)\
                .transform_from_node(load_node, pipeline)
            splitter = entity.SplitStringToTypes(self.sep,
                                                 column_types,
                                                 skip_overflow,
                                                 skip_illegal_line)
            rows = loaded.flat_map(splitter,
                                   serde=serde.of(tuple(column_types)))
            # Bare types carry no field names, so the rows stay tuples.
            if self.fields[0] in [int, float, str]:
                return rows
            return rows.apply(schema.tuple_to_dict, self.fields)

        if isinstance(self.fields, int):
            from bigflow import schema_pcollection
            loaded = super(SchemaTextFile, self)\
                .transform_from_node(load_node, pipeline)
            splitter = entity.SplitStringToTypes(self.sep,
                                                 [str] * self.fields,
                                                 True,
                                                 skip_illegal_line)
            str_serdes = tuple(
                serde.StrSerde() for index in xrange(self.fields))
            return schema_pcollection.SchemaPCollection(
                loaded.flat_map(splitter, serde=serde.of(str_serdes)))

        raise ValueError("columns is list(field name),or int(row number)")