def _key(self, ensure_keep_group=False):
    """Internal helper: build (and cache) a PObject holding this node's group key.

    If ``ensure_keep_group`` is True, at least one record per group is sent to
    the reduce side so that every group is preserved.  Otherwise we rely on
    other nodes to keep the group alive (take_num == 0 emits nothing extra).

    Args:
        ensure_keep_group (bool): whether to emit one record per group to keep
            empty-looking groups from being dropped.

    Returns:
        pobject.PObject: a PObject wrapping the extracted-key node.
    """
    value = self._value()
    # _value() may return a tuple; the first element is the payload we key on.
    value = value[0] if isinstance(value, tuple) else value
    # 1 record per group keeps the group alive downstream; 0 emits nothing.
    take_num = 1 if ensure_keep_group else 0
    # Lazily build the key-extraction node only once and cache it.
    # NOTE: self.__key is name-mangled to the enclosing class.
    if self.__key is None:
        import bigflow.transforms
        from bigflow.core import entity
        key_serde = self.key_serdes()[0]
        # Wrapper that deserializes the raw key bytes back into a value.
        deserialize = entity.SerdeWrapper(key_serde, is_serialize=False)
        # Take up to take_num records per group (partial processing allowed),
        # then extract the last seen key and type it with the key serde.
        key_node = bigflow.transforms.flatten_values(value).node() \
            .process_by(entity.TakeProcessor(take_num)) \
            .as_type(value.serde()) \
            .set_debug_info("ExtractKeyPartial") \
            .input(0).allow_partial_processing().done() \
            .process_by(entity.GetLastKeyProcessor(deserialize)) \
            .as_type(key_serde) \
            .set_debug_info("ExtractKey")
        self.__key = pobject.PObject(key_node, self._pipeline)
    return self.__key
def transform_to_node(self, ptype):
    """Internal helper: turn ``ptype`` into a shuffled, serialized record node.

    Pipeline: shuffle the input node into a fresh scope, distribute it (by the
    configured partition function or by default), apply the registered
    transform actions, optionally set concurrency and sort order, serialize
    each element (via ``kv_serializer`` or the configured serde), and finally
    wrap everything into record form for output.

    Args:
        ptype (PType): the input PType whose node is to be transformed.

    Returns:
        LogicalPlan.Node: the resulting record-typed node.
    """
    from bigflow.core import entity
    from bigflow import pcollection
    node = ptype.node()
    plan = node.plan()
    objector = self.options.get('serde', ptype.pipeline().default_objector())
    shuffle_scope = plan.shuffle(plan.global_scope(), [node])
    node = shuffle_scope.node(0)
    if self.partition_fn is None:
        node = node.distribute_by_default()
    else:
        node = node.distribute_by(entity.Partitioner(self.partition_fn))
    pvalue = pcollection.PCollection(node, ptype.pipeline())
    # Only the actions themselves are needed; the keys are irrelevant here.
    for action in self.transform_actions.values():
        pvalue = action(pvalue)
    node = pvalue.node()
    if self.partition_number is not None:
        shuffle_scope.with_concurrency(self.partition_number)
    if self.key_reader_obj is not None:
        node = node.sort_by(self.key_reader_obj)
    if self.kv_serializer is not None:
        # Caller supplied its own (key, value) serializer.
        serialized = pcollection.PCollection(node, ptype.pipeline()).map(
            self.kv_serializer).node()
    else:
        # Serialize each element with the configured objector.
        serialize = entity.SerdeWrapper(objector, is_serialize=True)
        serialized = pcollection.PCollection(
            node, ptype.pipeline()).map(serialize).node()
    # Convert the serialized stream into records; effective key num 0 and
    # ignore_group flatten away the shuffle grouping for the sink.
    node = serialized.process_by(_ToRecord(self.kv_serializer)) \
        .as_type(record_objector.RecordObjector()) \
        .set_effective_key_num(0) \
        .input(0) \
        .done() \
        .ignore_group()
    return node
def str_to_idl(pcollection, **options):
    """Pack every record of the given PCollection into IDL format.

    The input records must be ``str``.

    Args:
        pcollection (PCollection): the input PCollection.
        **options: configurable options.
            log_type: IDL data type; ``log_text`` and ``log_bin`` are
                supported, defaulting to ``log_text``.

    Returns:
        PCollection: the packed PCollection.
    """
    from bigflow import serde
    from bigflow.core import entity

    packet_serde = serde.IdlPacketSerde(
        log_type=options.get("log_type", "log_text"))
    pack_fn = entity.SerdeWrapper(packet_serde, is_serialize=True)
    return pcollection.map(pack_fn, serde=serde.StrSerde())
def idl_to_str(pcollection, **options):
    """Unpack every IDL record of the given PCollection into a str.

    Records whose IDL packet type is Heartbeat or EOF are filtered out
    (the deserializer yields ``None`` for them).

    Args:
        pcollection (PCollection): the input PCollection.
        **options: configurable options.
            log_type: IDL data type; ``log_text`` and ``log_bin`` are
                supported, defaulting to ``log_text``.

    Returns:
        PCollection: the unpacked PCollection.
    """
    from bigflow import serde
    from bigflow.core import entity

    packet_serde = serde.IdlPacketSerde(
        log_type=options.get("log_type", "log_text"))
    unpack_fn = entity.SerdeWrapper(packet_serde, is_serialize=False)
    unpacked = pcollection.map(unpack_fn, serde=serde.StrSerde())
    return unpacked.filter(lambda x: x is not None)
def transform_from_node(self, load_node, pipeline):
    """Internal interface: turn a loaded record node into a PCollection/PTable.

    Decodes binary records into (key, value) string pairs, optionally applies
    a user kv_deserializer (or the configured serde), and wraps the result as
    a PTable when the 'partitioned' option is set, otherwise as a PCollection.

    Args:
        load_node: the loader's LogicalPlan node producing binary records.
        pipeline: the owning Pipeline.

    Returns:
        PCollection or PTable: the decoded output.
    """
    from bigflow import ptable
    if self.repeatedly:
        # Streaming-style source: re-read repeatedly; grouping is kept.
        transformed = load_node.repeatedly() \
            .process_by(_KVFromBinaryRecord()) \
            .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
            .set_effective_key_num(0) \
            .input(0).allow_partial_processing() \
            .done()
    else:
        # One-shot source: same decode chain, but the group is ignored.
        transformed = load_node \
            .process_by(_KVFromBinaryRecord()) \
            .as_type(serde.tuple_of(serde.StrSerde(), serde.StrSerde())) \
            .set_effective_key_num(0) \
            .ignore_group() \
            .input(0).allow_partial_processing() \
            .done()
    # Propagate the source size estimate to the decoded node.
    transformed.set_size(load_node.size())
    transformed = pcollection.PCollection(transformed, pipeline)
    tserde = self._options.get('serde', pipeline.default_objector())
    if self.kv_deserializer is not None:
        # Caller supplied its own (key, value) deserializer.
        transformed = transformed.map(self.kv_deserializer, serde=tserde)
    else:
        # Deserialize only the value part (apply_index=1) of each pair.
        is_serialize = False
        deserialize = entity.SerdeWrapper(tserde, is_serialize, 1)
        transformed = transformed.map(deserialize, serde=tserde)
    if self._options.get('partitioned'):
        # Keep the grouping: expose the data as a key-partitioned PTable.
        return ptable.PTable(transformed, key_serde=serde.StrSerde())
    # Flatten back out of the shuffle scope for a plain PCollection.
    return pcollection.PCollection(transformed.node().leave_scope(), pipeline)