def test_key_value_serde(self):
    """Check that _key_serde/_value_serde pick the correct element serde."""
    pair_list = serde.of([int, str])
    pair_tuple = serde.of((str, int))
    self.serde_eq(int, serde._key_serde(pair_list, None))
    self.serde_eq(str, serde._key_serde(pair_tuple, None))
    self.serde_eq(int, serde._value_serde(pair_tuple, None))
    self.serde_eq(int, serde._value_serde(serde.of([str, int]), None))
def _ret_tuple_handler(ret_tuple, record_val):
    """Build a PType from a mixed tuple of PTypes and plain python values.

    Splits *ret_tuple* into PType elements and constant elements, then:
      - both kinds present: cartesian product of the PTypes with the
        constants appended to every record;
      - constants only: map every record of *record_val* to the constant
        tuple;
      - PTypes only: plain cartesian product.
    """
    const_serdes, const_values = [], []
    pvalue_serdes, pvalues = [], []
    for elem in ret_tuple:
        if isinstance(elem, ptype.PType):
            pvalue_serdes.append(elem.serde())
            pvalues.append(elem)
        else:
            const_serdes.append(type(elem))
            const_values.append(elem)
    if pvalues and const_values:
        combined_serdes = pvalue_serdes + const_serdes
        return transforms.cartesian(*pvalues) \
            .apply(transforms.map,
                   lambda record: record + tuple(const_values),
                   serde=serde.of(tuple(combined_serdes)))
    if const_values:
        return record_val.apply(transforms.map,
                                lambda record: tuple(const_values),
                                serde=serde.of(tuple(const_serdes)))
    return transforms.cartesian(*pvalues)
def test_intersection(self):
    """Verify diff() reports elements whose counts differ between sides."""
    left = self._pipeline.parallelize([1, 2, 3, 1, 4]) \
        .map(lambda x: x, serde=serde.of(int))
    right = self._pipeline.parallelize([1, 2, 1, 2, 3]) \
        .map(lambda x: x, serde=serde.of(int))
    result = left.diff(right)
    self.assertEqual(str(serde.of((int, (int, int)))), str(result.serde()))
    self.assertItemsEqual([(2, (1, 2)), (4, (1, 0))], result.get())
def _sort_str(pvalue, reverse=False):
    """Sort a PCollection of str ascending, or descending when *reverse*.

    Descending order is obtained by serializing each string with every
    byte complemented (255 - ord), so an ascending byte-wise sort of the
    serialized form yields reverse string order; deserialize applies the
    same involution to restore the original string.
    """
    class ReverseStrSerde(serde.Serde):
        """Serde whose serialized image sorts in reverse string order."""
        def serialize(self, obj):
            """Complement every byte; ascending byte order == reverse order."""
            return ''.join(chr(255 - ord(ch)) for ch in obj)
        def deserialize(self, buf):
            """Inverse of serialize (the byte complement is its own inverse)."""
            return ''.join(chr(255 - ord(ch)) for ch in buf)

    def _serde_to_string(serde):
        # Serialize the serde object itself via the entity proto message —
        # presumably consumed by the execution backend as config; note the
        # parameter deliberately shadows the `serde` module here.
        return entity.Entity.of(entity.Entity.objector, serde) \
            .to_proto_message().SerializeToString()

    str_serde = None
    if not reverse:
        str_serde = serde.of(str)
    else:
        str_serde = serde.Optional(ReverseStrSerde())

    class _StrSortKeyReader(object):
        """Key reader whose sort key is the record itself."""
        def __init__(self, serde):
            self.objector = _serde_to_string(serde)
            self.read_key = lambda x: x

    class SetValueNoneProcessor(entity.Processor):
        """Processor applied before restoring the key (no extra config)."""
        def __init__(self):
            super(SetValueNoneProcessor, self).__init__()

    class SetKeyToValueProcessor(entity.Processor):
        """Processor configured with the key serde to rebuild the record."""
        def __init__(self, serde):
            super(SetKeyToValueProcessor, self).__init__()
            self.set_config(_serde_to_string(serde))

    key_reader_obj = _StrSortKeyReader(str_serde)
    # Shuffle into a sorted scope keyed by the record, drop/restore the
    # value around the sort, then leave the scope as a flat PCollection.
    result_node = pvalue.node()._plan.shuffle(pvalue.node()._scope, from_nodes=[pvalue.node()]) \
        .sort() \
        .node(0).match_by(key_reader_obj, entity.Entity.key_reader) \
        .set_debug_info("Sort: " + repr(key_reader_obj)) \
        .process_by(SetValueNoneProcessor())\
        .as_type(serde.of(str)) \
        .set_effective_key_num(0) \
        .ignore_group() \
        .input(0).allow_partial_processing().done() \
        .process_by(SetKeyToValueProcessor(str_serde)) \
        .as_type(serde.of(str)) \
        .leave_scope()
    return bigflow.pcollection.PCollection(result_node, pvalue.pipeline())
def _optional_sd(sd_val):
    """Wrap every field serde of each tuple element in serde.Optional."""
    wrapped = []
    for elem_sd in sd_val.get_args():
        if isinstance(elem_sd, serde.Optional):
            elem_sd = elem_sd.origin_serde()
        optional_fields = dict(
            (name, serde.Optional(serde.of(tp)))
            for name, tp in elem_sd.get_fields_to_types().items())
        wrapped.append(of(optional_fields))
    return serde.of(tuple(wrapped))
def diff(a, b):
    """Implementation of transforms.diff().

    Keeps each element together with its (count_in_a, count_in_b) pair
    whenever the two counts differ.
    """
    if utils.is_infinite(a) or utils.is_infinite(b):
        raise ValueError("diff not supported infinite PType")

    def _keep_if_counts_differ(left, right):
        """Pair the per-group counts and drop groups with equal counts."""
        return left.count() \
            .flat_map(lambda c1, c2: [(c1, c2)], right.count(),
                      serde=serde.of((int, int))) \
            .filter(lambda tp: tp[0] != tp[1])

    a = a.map(lambda x: (x, None),
              serde=serde.tuple_of(a.serde(), serde.of(int)))
    b = b.map(lambda x: (x, None),
              serde=serde.tuple_of(b.serde(), serde.of(int)))
    return a.cogroup(b).apply_values(_keep_if_counts_differ).flatten()
def group_by(pcollection, fields, **options):
    """Group a SchemaPCollection by the given fields.

    Args:
        pcollection (SchemaPCollection): input SchemaPCollection (a
            structured PCollection with named fields, usable as a
            PCollection whose elements are dicts).
        fields (Iterable): if *fields* is a str it is split on "," and the
            resulting field names form the grouping key; if it is a
            list/tuple, its entries are used as the grouping fields
            directly.

    Returns:
        SchemaPCollection: each key is a dict made of the grouped fields;
        each value is a PCollection containing all columns.

    Examples:
        >>> from bigflow import base, schema
        >>> p = base.Pipeline.create('local')
        >>> analytics = p.parallelize([('a,b,c', 1), ('b,c', 2), ('a,c', 3), ('a,b', 2), ('a,d', 1)])
        >>> .apply(schema.tuple_to_dict, ['websites', 'clicknum'])
        >>> .apply(schema.select, lambda cols: {
        >>>     'website': cols['websites'].flat_map(lambda line: line.split(',')),
        >>>     'clicknum': cols['clicknum']
        >>> }).apply(schema.group_by, ['website'])
        >>> .apply_values(transforms.first)
        >>> .apply(schema.flatten)
        >>> print analytics.get()
        Output:
        [{'website': 'a', 'clicknum': 1}, {'website': 'b', 'clicknum': 1},
         {'website': 'c', 'clicknum': 1}, {'website': 'd', 'clicknum': 1}]
    """
    pcollection = _transform_schema_pcollection(pcollection)
    if _is_tuple_serde(pcollection.serde()):
        # Tuple-schema path: fields index positions of the element tuple.
        def _select_fields(tup, fields):
            """Internal: project the grouping fields out of one tuple."""
            return tuple(tup[field] for field in fields)
        from bigflow import schema_pcollection
        return pcollection.group_by(lambda record: _select_fields(record, fields),
                                    key_serde=serde.of(tuple(_origin_serde(pcollection.serde())[field] for field in fields)),
                                    **options)\
            .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))

    # Dict-schema path: fields are names; a comma-separated str is split
    # (all whitespace stripped first).
    def _select_fields(dct, fields):
        """Internal: project the grouping fields out of one dict."""
        return dict((field, dct[field]) for field in fields)
    if isinstance(fields, str):
        fields = ''.join(fields.split()).split(',')
    from bigflow import schema_pcollection
    return pcollection.group_by(lambda record: _select_fields(record, fields),
                                key_serde=_get_serde_of_fields(pcollection.serde(), fields, pcollection.pipeline().default_objector()),
                                **options) \
        .apply_values(lambda record: schema_pcollection.SchemaPCollection(record))
def check(self, sd, value):
    """Round-trip *value* through serde *sd* and queue the pair for later.

    Asserts the serde round-trip reproduces *value*; pairs whose value
    also survives a marshal round-trip are appended to
    self._checking_condition, others are skipped with a notice.
    """
    sd = serde.of(sd)
    self.assertEqual(value, sd.deserialize(str(sd.serialize(value))))
    import marshal
    import sys
    try:
        assert value == marshal.loads(marshal.dumps(value))
    # Narrowed from a bare `except:`, which would also swallow
    # SystemExit/KeyboardInterrupt; marshal raises ValueError for
    # unsupported values and the assert raises AssertionError.
    except Exception:
        print >>sys.stderr, 'skip an unsupported serde', str(sd)
    else:
        self._checking_condition.append((sd, value))
def as_schema(self, fields):
    """Return a SchemaPCollection described by *fields*.

    Args:
        fields: tuple, list, or dict describing the schema.
            * tuple/list of basic python types or serdes: elements are
              tuples; a TupleSerde is attached to the PCollection.
            * list of strings: field names; each field uses the pipeline's
              default objector and the elements must be dicts sharing
              exactly those keys.
            * dict mapping field name to type, e.g. {"name": str,
              "age": int}: elements must be dicts with the same keys.

    Returns:
        PCollection: the converted PCollection.

    Examples:
        >>> data = self._pipeline.parallelize([("xiaoming", "PKU", 20)])
        >>> d1 = data.as_schema((str, str, int))
        >>> d2 = data.as_schema([str, str, int])
        >>> print d1.get()
        [('xiaoming', 'PKU', 20)]
        >>> print d2.get()
        [('xiaoming', 'PKU', 20)]
        >>> data = self._pipeline.parallelize([{"name": "xiaoming", "school": "PKU", "age": 20}])
        >>> d5 = data.as_schema({"name": str, "school": str, "age": int})
        >>> print d5.get()
        [{'age': 20, 'name': 'xiaoming', 'school': 'PKU'}]
    """
    from bigflow import schema
    from bigflow import serde
    if isinstance(fields, (tuple, list)):
        if not fields:
            raise ValueError("the number of elems in fields is zero.")
        if isinstance(fields[0], str):
            named = {name: self._pipeline.default_objector() for name in fields}
            return self.map(lambda x: x, serde=schema.of(named))
        return self.map(lambda x: x, serde=serde.of(tuple(fields)))
    if isinstance(fields, dict):
        return self.map(lambda x: x, serde=schema.of(fields))
    raise ValueError("fields type only accept {`tuple`, `list`, `dict`}.")
def test_output_sort(self):
    """Write sorted (asc and desc) partitioned text output and verify order."""
    self.setConfig(spark_conf={
        "spark.default.parallelism": "1",
    })
    lines = self._pipeline.parallelize([5, 1, 2, 0, 3, 4]) \
        .map(lambda x: str(x), serde=serde.of(str))
    out1_path = self.generate_tmp_path() + '/output-1/'
    out2_path = self.generate_tmp_path() + '/output-2/'
    self._pipeline.write(
        lines,
        output.TextFile(out1_path)
            .sort()
            .partition(n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.write(
        lines,
        output.TextFile(out2_path)
            .sort(reverse=True)
            .partition(n=2, partition_fn=lambda x, n: int(x) % n))
    self._pipeline.run()

    def _concat(path):
        """Read one part file and concatenate its lines in file order."""
        return self._pipeline.read(input.TextFile(path)) \
            .accumulate('', lambda x, y: x + y)

    parts = [_concat(out1_path + '/part-00000'),
             _concat(out1_path + '/part-00001'),
             _concat(out2_path + '/part-00000'),
             _concat(out2_path + '/part-00001')]
    for part in parts:
        part.cache()
    self.assertEqual('024', parts[0].get())
    self.assertEqual('135', parts[1].get())
    self.assertEqual('420', parts[2].get())
    self.assertEqual('531', parts[3].get())
def end_serde_test(self):
    """Round-trip every queued (serde, value) pair through both the C++
    and the python KV serialize/deserialize paths and assert equality.

    Fix: the per-iteration lambdas execute during pipeline.run(), after
    the loop has finished, so free variables (i, sd1, sd2, kv_val) must
    be bound as default arguments — otherwise every deferred call would
    see only the *last* iteration's values.
    """
    import sys
    from bigflow.core import entity
    logger.info(str(self._checking_condition))
    values = map(lambda condition: condition[1], self._checking_condition)
    # 避免map结点超过32个(Hadoop的限制):
    # keep the number of map nodes under Hadoop's limit of 32.
    p_values = self._pipeline.parallelize([values])
    out = []
    for (i, (sd, value)) in enumerate(self._checking_condition):
        sd1 = serde.of(int)
        sd2 = sd
        cpp_deserialize_fn = entity.KVDeserializeFn(sd1, sd2)
        cpp_serialize_fn = entity.KVSerializeFn(sd1, sd2)
        # Bind sd1/sd2 eagerly via default args (late-binding fix).
        python_deserialize_fn = lambda kv, sd1=sd1, sd2=sd2: (
            sd1.deserialize(kv[0]), sd2.deserialize(kv[1]))
        python_serialize_fn = lambda kv, sd1=sd1, sd2=sd2: (
            sd1.serialize(kv[0]), sd2.serialize(kv[1]))
        serialize_fns = [cpp_serialize_fn, python_serialize_fn]
        deserialize_fns = [cpp_deserialize_fn, python_deserialize_fn]
        kv_val = (1, value)

        def _assert_eq_val(v, expected=kv_val):
            assert v == expected
        for serialize_fn in serialize_fns:
            for deserialize_fn in deserialize_fns:
                # i bound as a default arg for the same late-binding reason.
                out.append(p_values.map(lambda x, i=i: (1, x[i]))
                           .map(serialize_fn)
                           .map(deserialize_fn)
                           .map(_assert_eq_val))
    if out:
        transforms.union(*out).cache()
    else:
        print >> sys.stderr, "SKIP a test!!!"
    self._pipeline.run()
def serde_eq(self, expect, real):
    """Assert that two serde specifications normalize to the same serde."""
    expected_repr = str(serde.of(expect))
    actual_repr = str(serde.of(real))
    self.assertEqual(expected_repr, actual_repr)
def transform_from_node(self, load_node, pipeline):
    """Internal interface: build a (Schema)PCollection from the loaded
    text node, splitting each line on self.sep and converting columns
    according to self.fields.
    """
    from bigflow import schema
    if self.fields is None:
        raise ValueError('''columns is necessary,(1) columns(list), each item in columns is string, SchemaPCollection's element is dict, (2) columns(int),SchemaPCollection's element is tuple. eg. columns=3 or columns=[(xx, int), (yy, str)] or columns=[xx, yy], (3) columns(list), each item in columns is base type in [int, float, str]''' )
    if isinstance(self.fields, tuple):
        self.fields = list(self.fields)
    fields_type = []
    ignore_overflow = self.ignore_overflow
    ignore_illegal_line = self.ignore_illegal_line
    if isinstance(self.fields, list):
        def get_fields_type(fields):
            """Internal helper: map each declared column to its python type.

            Accepts (name, type) tuples, bare types (int/str/float), or
            bare field-name strings (which default to str).
            """
            fields_type = []
            for field in fields:
                if isinstance(field, tuple):
                    if field[1] in [int, str, float]:
                        fields_type.append(field[1])
                    else:
                        raise ValueError(
                            '''columns is list(field name or data type), data type(int/str/float)''')
                elif field in [int, str, float]:
                    fields_type.append(field)
                elif isinstance(field, str):
                    # A bare field name: the column stays a string.
                    fields_type.append(str)
                else:
                    raise ValueError(
                        '''columns is list(field name or data type), data type(int/str/float)''')
            return fields_type
        fields_type = get_fields_type(self.fields)
        ret = super(SchemaTextFile, self)\
            .transform_from_node(load_node, pipeline)\
            .flat_map(entity.SplitStringToTypes(self.sep, fields_type,
                                                ignore_overflow, ignore_illegal_line),
                      serde=serde.of(tuple(fields_type)))
        if self.fields[0] in [int, float, str]:
            # Bare-types spec: elements stay tuples.
            return ret
        else:
            # Named columns: convert each tuple into a field dict.
            ret = ret.apply(schema.tuple_to_dict, self.fields)
        return ret
    elif isinstance(self.fields, int):
        # Column count only: every column is read as str.
        from bigflow import schema_pcollection
        return schema_pcollection.SchemaPCollection(super(SchemaTextFile, self)
            .transform_from_node(load_node, pipeline)\
            .flat_map(entity.SplitStringToTypes(self.sep,
                                                [str for _ in xrange(self.fields)],
                                                True, ignore_illegal_line),
                      serde=serde.of(tuple(serde.StrSerde() for index in xrange(self.fields)))))
    else:
        raise ValueError("columns is list(field name),or int(row number)")
def test_get_tuple_serde_of_fields(self):
    """_get_tuple_serde_of_fields picks field serdes in the requested order."""
    dict_serde = schema.FieldsDictSerde({'id': int, 'name': str, 'age': int})
    actual = schema._get_tuple_serde_of_fields(dict_serde, ['id', 'name'])
    self.assertEqual(str(serde.of((int, str))), str(actual))
def _get_tuple_serde(self, fields):
    """Internal: tuple serde built from the declared types of *fields*, in order."""
    member_types = tuple(self._fields_to_types[name] for name in fields)
    return serde.of(member_types)
def get_serde_of_field(sd, field, default=None):
    """Return the serde of *field* inside a FieldsDictSerde, else *default*."""
    if not isinstance(sd, FieldsDictSerde):
        return default
    position = sd._fields.index(field)
    return serde.of(serde.origin(sd._tuple_serde)[position])
def test_sort_str(self):
    """sort() on stringified digits yields them in ascending order."""
    numbers = self._pipeline.parallelize([4, 5, 1, 2, 3, 0])
    as_text = numbers.map(lambda x: str(x), serde=serde.of(str))
    joined = as_text.sort().accumulate('', lambda x, y: x + y)
    self.assertEqual('012345', joined.get())
def filter_count_ne(a, b):
    """Keep the (count_a, count_b) pair only when the two counts differ."""
    paired = a.count().flat_map(lambda c1, c2: [(c1, c2)], b.count(),
                                serde=serde.of((int, int)))
    return paired.filter(lambda tp: tp[0] != tp[1])
def __right_join_in_every_group(*pcollections, **options):
    """Right join = left join on the reversed inputs, records re-reversed."""
    result_serde = serde.of(tuple(p.serde() for p in pcollections))
    joined = __left_join_in_every_group(*pcollections[::-1], **options)
    return joined.map(lambda x: x[::-1], serde=result_serde)
def __init__(self, fields_to_types):
    """Normalize the field spec and precompute the underlying tuple serde.

    Accepts either a dict mapping field name -> type/serde, or an
    iterable of field names, each of which then gets the default serde
    ``serde._``. Fields are kept in sorted-name order.
    """
    if not isinstance(fields_to_types, dict):
        names = list(fields_to_types)
        fields_to_types = dict(zip(names, [serde._] * len(names)))
    self._fields_to_types = fields_to_types
    self._fields = sorted(fields_to_types.keys())
    self._tuple_serde = serde.of(
        tuple(fields_to_types[name] for name in self._fields))
def _get_serde_of_field(sd, field, dft=None):
    """Internal: resolve the serde of one field of a FieldsDictSerde.

    Falls back to *dft* when *sd* is not a FieldsDictSerde.
    """
    if isinstance(sd, FieldsDictSerde):
        idx = sd._fields.index(field)
        return serde.of(serde.origin(sd._tuple_serde)[idx])
    return dft
def __right_join_in_every_group(*pcollections, **options):
    """Implement right join by reversing the operands of the left join."""
    serdes = serde.of(tuple([p.serde() for p in pcollections]))
    reversed_join = __left_join_in_every_group(*pcollections[::-1], **options)
    return reversed_join.map(lambda record: record[::-1], serde=serdes)
def _get_tuple_serde_of_fields(sd, fields, dft=None):
    """Internal: tuple serde made of each requested field's serde."""
    member_serdes = [_get_serde_of_field(sd, field, dft) for field in fields]
    return serde.of(tuple(member_serdes))
def _get_col(index):
    """Project column *index* out of every record of the enclosing *val*."""
    col_serde = serde.of(_origin_serde(val.serde())[index])
    return val.map(lambda record: record[index], serde=col_serde)
def pipe(pvalue, command, **options):
    """Transform pipe implementation: stream records through an external
    *command*.

    :param pvalue: PType
    :return: PCollection
    """
    if utils.is_infinite(pvalue):
        raise ValueError("pipe not supported infinite PType")
    if isinstance(pvalue, ptable.PTable):
        def merge_value(pvalue):
            """Recursively turn nested PTable values into list PObjects."""
            if isinstance(pvalue, ptable.PTable):
                return pvalue.apply_values(merge_value)
            else:
                return pvalue.apply(transforms.to_list_pobject)
        def merge_kv(tp, level):
            """Flatten the nested (key, (key, ... value)) tuple into a flat list."""
            kvs=[]
            for i in xrange(level):
                kvs.append(tp[0])
                tp = tp[1]
            kvs.append(tp)
            return kvs
        level = pvalue.nested_level() + 1
        transformed = pvalue.apply(merge_value).flatten() \
            .apply(transforms.map, lambda kv: merge_kv(kv, level),
                   serde=serde.of(pvalue.key_serdes() + [pvalue.serde()]))
        # Each nesting level contributes one key column, plus the value.
        options['input_fields_num'] = level + 1
        options['is_nested_ptype'] = True
    else:
        transformed = pvalue
    # Output records are strings; multiple output fields become a str tuple.
    output_fields_num = options.get('output_fields_num', 1)
    if output_fields_num == 1:
        options['serde'] = serde.StrSerde()
    else:
        serdes = [serde.StrSerde()] * output_fields_num
        options['serde'] = serde.TupleSerde(*serdes)
    scale = options.get('scale', 1.0)
    size = options.get('output_size', None)
    memory = options.get('memory_limit', -1)
    cpu = options.get('cpu_limit', -1)
    result_node = transformed.node() \
        .process_by(entity.PipeProcessor(command, **options)) \
        .as_type(options['serde']) \
        .set_debug_info("Pipe: " + repr(command)) \
        .ignore_group() \
        .set_effective_key_num(0) \
        .input(-1).allow_partial_processing().done() \
        .set_size(size, scale) \
        .set_memory(memory) \
        .set_cpu(cpu)
    return pcollection.PCollection(result_node, transformed.pipeline())
def is_empty(pcollection):
    """Return a bool PObject that is True iff *pcollection* has no element."""
    first = pcollection.take(1)
    return first.count().map(lambda n: n == 0, serde=serde.of(bool))
def serde_equal(self, expect, real):
    """Assert that the two serde specs render to the same representation."""
    left, right = (str(serde.of(spec)) for spec in (expect, real))
    self.assertEqual(left, right)
def _get_tuple_serde_of_fields(sd, fields, dft=None):
    """Internal: build a tuple serde from the per-field serdes of *sd*."""
    return serde.of(tuple(
        _get_serde_of_field(sd, name, dft) for name in fields))
def test_get_serde_of_fields(self):
    """Field-level and multi-field serde lookups on a FieldsDictSerde."""
    dict_serde = fields.FieldsDictSerde({'id': int, 'name': str, 'age': int})
    self.assertEqual(str(serde.of(str)),
                     str(fields.get_serde_of_field(dict_serde, 'name')))
    self.assertEqual(str(fields.of({'id': int, 'name': str})),
                     str(fields.get_serde_of_fields(dict_serde, ['id', 'name'])))