def reduce_group(self, operator, types, combinable=False):
    """
    Applies a GroupReduce transformation.

    A GroupReduceFunction is invoked once per group of the DataSet, or once
    when applied on a non-grouped DataSet. The function may iterate over all
    elements it is handed and emit any number of output elements, including
    none.

    The keys of the first entry of the child chain are set on the operator as
    grouping keys, and the ``(field, order)`` pairs of the remaining entries
    are set as sort operations.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
        A plain Python function is wrapped in a GroupReduceFunction and used
        as its ``reduce`` method.
    :param types: The type of the resulting DataSet.
    :param combinable: Stored as the ``combine`` flag of the new operation.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = GroupReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    operator._set_grouping_keys(self._child_chain[0].keys)
    sort_ops = [(entry.field, entry.order) for entry in self._child_chain[1:]]
    operator._set_sort_ops(sort_ops)

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonGroupReduce"
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info

    # The operation gets its own copy flagged as the non-combining variant;
    # the instance passed in (or the wrapper) becomes the combine operator.
    reducer = copy.deepcopy(operator)
    reducer._combine = False
    info.operator = reducer

    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    info.types = types
    info.combine = combinable
    operator._combine = True
    info.combineop = operator

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce(self, operator):
    """
    Applies a Reduce transformation.

    A ReduceFunction is called consecutively, each call combining two
    elements into one new element of the same type, until only a single
    element remains; that element is the result of the transformation.

    The keys of the first entry of the child chain are set on the operator as
    grouping keys, and every entry of the child chain is registered with the
    environment before the new operation is.

    :param operator: The ReduceFunction that is applied on the DataSet. It is
        used as given (``_set_grouping_keys`` is called on it directly), so a
        plain Python function is not wrapped by this method.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    operator._set_grouping_keys(self._child_chain[0].keys)
    for chained in self._child_chain:
        self._env._sets.append(chained)

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonReduce"
    info.identifier = _Identifier.REDUCE
    info.parent = self._info

    # The operation gets its own copy flagged as the non-combining variant;
    # the instance passed in becomes the combine operator.
    reducer = copy.deepcopy(operator)
    reducer._combine = False
    info.operator = reducer

    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    info.combine = True
    operator._combine = True
    info.combineop = operator
    info.types = deduct_output_type(self._info)

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map(self, operator, types):
    """
    Applies a Map transformation on a DataSet.

    A MapFunction is invoked for each element of the DataSet, and each
    invocation returns exactly one element.

    :param operator: The MapFunction that is called for each element of the
        DataSet. A plain Python function is wrapped in a MapFunction and used
        as its ``map`` method.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = MapFunction()
        wrapper.map = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonMap"
    info.identifier = _Identifier.MAP
    info.parent = self._info
    info.operator = operator
    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    info.types = types

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce(self, operator):
    """
    Applies a Reduce transformation.

    The ReduceFunction is called consecutively; every call merges two
    elements into a single new element of the same type. This continues until
    one element is left, which is the result of the transformation.

    Before the new operation is built, the keys of the first child-chain
    entry are set on the operator as grouping keys and all child-chain
    entries are registered with the environment.

    :param operator: The ReduceFunction that is applied on the DataSet. No
        wrapping of plain Python functions happens here; the object must
        already offer ``_set_grouping_keys``.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    operator._set_grouping_keys(self._child_chain[0].keys)
    for link in self._child_chain:
        self._env._sets.append(link)

    node = OperationInfo()
    node_set = OperatorSet(self._env, node)
    node.name = "PythonReduce"
    node.identifier = _Identifier.REDUCE
    node.parent = self._info

    # A deep copy serves as the regular operator (combine flag off), while
    # the instance passed in is kept as the combine operator (flag on).
    plain_op = copy.deepcopy(operator)
    plain_op._combine = False
    node.operator = plain_op

    owner_module = str(inspect.getmodule(operator))
    owner_class = str(operator.__class__.__name__)
    node.meta = owner_module + "|" + owner_class
    node.combine = True
    operator._combine = True
    node.combineop = operator
    node.types = deduct_output_type(self._info)

    self._info.children.append(node)
    self._env._sets.append(node)
    return node_set
def reduce(self, operator):
    """
    Applies a Reduce transformation.

    A ReduceFunction is called consecutively, each call combining two
    elements into one new element of the same type, until only a single
    element remains; that element is the result of the transformation.

    ``_finalize`` is invoked first. The keys of the first child-chain entry
    are stored on the new operation as ``key1``, and the parent operation's
    parallelism is overwritten with that of the new operation.

    :param operator: The ReduceFunction that is applied on the DataSet. A
        plain Python function is wrapped in a ReduceFunction and used as its
        ``reduce`` method.
    :return: A ReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    if isinstance(operator, TYPES.FunctionType):
        wrapper = ReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonReduce"
    info.identifier = _Identifier.REDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    info.key1 = self._child_chain[0].keys

    self._info.parallelism = info.parallelism
    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce_group(self, operator, combinable=False):
    """
    Applies a GroupReduce transformation.

    A GroupReduceFunction is invoked once per group of the DataSet, or once
    when applied on a non-grouped DataSet. The function may iterate over all
    elements it is handed and emit any number of output elements, including
    none.

    ``_finalize`` is invoked first, and the keys of the first child-chain
    entry are stored on the new operation as ``key1``.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
        A plain Python function is wrapped in a GroupReduceFunction and used
        as its ``reduce`` method.
    :param combinable: Accepted but not used by this implementation.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    if isinstance(operator, TYPES.FunctionType):
        wrapper = GroupReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonGroupReduce"
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    info.key1 = self._child_chain[0].keys

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def filter(self, operator):
    """
    Applies a Filter transformation on a DataSet.

    A FilterFunction is invoked for each element of the DataSet; only the
    elements for which it returns true are retained, the others are filtered
    out.

    :param operator: The FilterFunction that is called for each element of
        the DataSet. A plain Python function is wrapped in a FilterFunction
        and used as its ``filter`` method.
    :return: A FilterOperator that represents the filtered DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = FilterFunction()
        wrapper.filter = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonFilter"
    info.identifier = _Identifier.FILTER
    info.parent = self._info
    info.operator = operator
    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    # The output type is derived from the parent operation.
    info.types = deduct_output_type(self._info)

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce_group(self, operator, combinable=False):
    """
    Applies a GroupReduce transformation.

    The GroupReduceFunction is called once for each group of the DataSet, or
    once when applied on a non-grouped DataSet. It can iterate over all
    elements it receives and emit any number of output elements, including
    none.

    ``_finalize`` is invoked first. The keys of the first child-chain entry
    are stored on the new operation as ``key1``, and the parent operation's
    parallelism is overwritten with that of the new operation.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
        A plain Python function is wrapped in a GroupReduceFunction and used
        as its ``reduce`` method.
    :param combinable: Accepted but not used by this implementation.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    self._finalize()
    if isinstance(operator, TYPES.FunctionType):
        wrapper = GroupReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    node = OperationInfo()
    node_set = OperatorSet(self._env, node)
    node.name = "PythonGroupReduce"
    node.identifier = _Identifier.GROUPREDUCE
    node.parent = self._info
    node.operator = operator
    node.types = _createArrayTypeInfo()
    node.key1 = self._child_chain[0].keys

    self._info.parallelism = node.parallelism
    self._info.children.append(node)
    self._env._sets.append(node)
    return node_set
def map_partition(self, operator):
    """
    Applies a MapPartition transformation on a DataSet.

    A MapPartitionFunction is invoked once per parallel partition of the
    DataSet and receives the entire partition through an iterator. Each
    invocation may return an arbitrary number of results. How many elements
    a single instance sees is non-deterministic and depends on the degree of
    parallelism of the operation.

    :param operator: The MapPartitionFunction that is called for each
        partition of the DataSet. A plain Python function is wrapped in a
        MapPartitionFunction and used as its ``map_partition`` method.
    :return: A MapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = MapPartitionFunction()
        wrapper.map_partition = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonMapPartition"
    info.identifier = _Identifier.MAPPARTITION
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def flat_map(self, operator, types):
    """
    Applies a FlatMap transformation on a DataSet.

    A FlatMapFunction is invoked for each element of the DataSet, and each
    invocation can return any number of elements, including none.

    :param operator: The FlatMapFunction that is called for each element of
        the DataSet. A plain Python function is wrapped in a FlatMapFunction
        and used as its ``flat_map`` method.
    :param types: The type of the resulting DataSet.
    :return: A FlatMapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = FlatMapFunction()
        wrapper.flat_map = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonFlatMap"
    info.identifier = _Identifier.FLATMAP
    info.parent = self._info
    info.operator = operator
    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    info.types = types

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def reduce_group(self, operator, types, combinable=False):
    """
    Applies a GroupReduce transformation.

    A GroupReduceFunction is invoked once per group of the DataSet, or once
    when applied on a non-grouped DataSet. The function may iterate over all
    elements it is handed and emit any number of output elements, including
    none.

    :param operator: The GroupReduceFunction that is applied on the DataSet.
        A plain Python function is wrapped in a GroupReduceFunction and used
        as its ``reduce`` method.
    :param types: The type of the resulting DataSet.
    :param combinable: Stored as the ``combine`` flag of the new operation.
    :return: A GroupReduceOperator that represents the reduced DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = GroupReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonGroupReduce"
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info

    # The operation gets its own copy flagged as the non-combining variant;
    # the instance passed in (or the wrapper) becomes the combine operator.
    reducer = copy.deepcopy(operator)
    reducer._combine = False
    info.operator = reducer

    module_repr = str(inspect.getmodule(operator))
    class_name = str(operator.__class__.__name__)
    info.meta = module_repr + "|" + class_name
    info.types = types
    info.combine = combinable
    operator._combine = True
    info.combineop = operator

    self._info.children.append(info)
    self._env._sets.append(info)
    return result
def map(self, operator, types):
    """
    Applies a Map transformation on a DataSet.

    The MapFunction is called for each element of the DataSet; every call
    returns exactly one element.

    :param operator: The MapFunction that is called for each element of the
        DataSet. When a plain Python function is given, it is installed as
        the ``map`` method of a new MapFunction.
    :param types: The type of the resulting DataSet.
    :return: A MapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = MapFunction()
        wrapper.map = operator
        operator = wrapper

    node = OperationInfo()
    node_set = OperatorSet(self._env, node)
    node.name = "PythonMap"
    node.identifier = _Identifier.MAP
    node.parent = self._info
    node.operator = operator
    owner_module = str(inspect.getmodule(operator))
    owner_class = str(operator.__class__.__name__)
    node.meta = owner_module + "|" + owner_class
    node.types = types

    self._info.children.append(node)
    self._env._sets.append(node)
    return node_set
def flat_map(self, operator, types):
    """
    Applies a FlatMap transformation on a DataSet.

    The FlatMapFunction is called for each element of the DataSet; every
    call can return any number of elements, including none.

    :param operator: The FlatMapFunction that is called for each element of
        the DataSet. When a plain Python function is given, it is installed
        as the ``flat_map`` method of a new FlatMapFunction.
    :param types: The type of the resulting DataSet.
    :return: A FlatMapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = FlatMapFunction()
        wrapper.flat_map = operator
        operator = wrapper

    node = OperationInfo()
    node_set = OperatorSet(self._env, node)
    node.name = "PythonFlatMap"
    node.identifier = _Identifier.FLATMAP
    node.parent = self._info
    node.operator = operator
    owner_module = str(inspect.getmodule(operator))
    owner_class = str(operator.__class__.__name__)
    node.meta = owner_module + "|" + owner_class
    node.types = types

    self._info.children.append(node)
    self._env._sets.append(node)
    return node_set
def filter(self, operator):
    """
    Applies a Filter transformation on a DataSet.

    The FilterFunction is called for each element of the DataSet. Elements
    for which it returns true are retained; elements for which it returns
    false are filtered.

    :param operator: The FilterFunction that is called for each element of
        the DataSet. When a plain Python function is given, it is installed
        as the ``filter`` method of a new FilterFunction.
    :return: A FilterOperator that represents the filtered DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = FilterFunction()
        wrapper.filter = operator
        operator = wrapper

    node = OperationInfo()
    node_set = OperatorSet(self._env, node)
    node.name = "PythonFilter"
    node.identifier = _Identifier.FILTER
    node.parent = self._info
    node.operator = operator
    owner_module = str(inspect.getmodule(operator))
    owner_class = str(operator.__class__.__name__)
    node.meta = owner_module + "|" + owner_class
    # The output type is derived from the parent operation.
    node.types = deduct_output_type(self._info)

    self._info.children.append(node)
    self._env._sets.append(node)
    return node_set
def _createProjector(env, info):
    """
    Creates a "Projector" map operation as a child of the given operation.

    The new operation uses a fresh MapFunction as its operator, is appended
    to ``info.children`` and is registered in ``env._sets``.

    :param env: The environment the new operation is registered with.
    :param info: The parent operation of the new operation.
    :return: A Projector wrapping the new operation.
    """
    node = OperationInfo()
    projector = Projector(env, node)
    node.name = "Projector"
    node.identifier = _Identifier.MAP
    node.operator = MapFunction()
    node.parent = info
    node.types = _createArrayTypeInfo()

    info.children.append(node)
    env._sets.append(node)
    return projector
def _reduce_group(self, operator, combinable=False):
    """
    Builds the operation description for a GroupReduce transformation.

    Unlike the public variant, the operation is only created and returned:
    it is neither appended to the parent's children nor registered with the
    environment here.

    :param operator: The GroupReduceFunction to apply. A plain Python
        function is wrapped in a GroupReduceFunction and used as its
        ``reduce`` method.
    :param combinable: Accepted but not used by this implementation.
    :return: The OperationInfo describing the GroupReduce operation.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = GroupReduceFunction()
        wrapper.reduce = operator
        operator = wrapper

    info = OperationInfo()
    info.name = "PythonGroupReduce"
    info.identifier = _Identifier.GROUPREDUCE
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()
    return info
def read_custom(self, path, filter, splits, format):
    """
    Creates a DataSet using a custom input format that is executed directly
    in the Python process.

    :param path: Stored on the source operation as ``path``.
    :param filter: Stored on the source operation as ``filter``.
    :param splits: Stored on the source operation as ``computeSplits``.
    :param format: The input format; a deep copy of it becomes the operator
        of the source operation.
    :return: A DataSet that represents the data produced by the source.
    """
    source = OperationInfo()
    data_set = DataSet(self, source)
    source.name = "PythonInputFormat"
    source.identifier = _Identifier.SOURCE_CUSTOM
    source.path = path
    source.filter = filter
    source.computeSplits = splits
    # The source keeps its own copy, independent of the caller's instance.
    source.operator = copy.deepcopy(format)
    source.types = _createArrayTypeInfo()

    self._sources.append(source)
    return data_set
def map(self, operator):
    """
    Applies a Map transformation on a DataSet.

    A MapFunction is invoked for each element of the DataSet, and each
    invocation returns exactly one element.

    :param operator: The MapFunction that is called for each element of the
        DataSet. A plain Python function is wrapped in a MapFunction and used
        as its ``map`` method.
    :return: A MapOperator that represents the transformed DataSet.
    """
    if isinstance(operator, TYPES.FunctionType):
        wrapper = MapFunction()
        wrapper.map = operator
        operator = wrapper

    info = OperationInfo()
    result = OperatorSet(self._env, info)
    info.name = "PythonMap"
    info.identifier = _Identifier.MAP
    info.parent = self._info
    info.operator = operator
    info.types = _createArrayTypeInfo()

    self._info.children.append(info)
    self._env._sets.append(info)
    return result