def equal_to(self, *fields):
    """
    Continues a Join transformation.

    Defines the Tuple fields of the second join DataSet that should be
    used as join keys.
    Note: Fields can only be selected as join keys on Tuple DataSets.

    :param fields: The indexes of the Tuple fields of the second join
                   DataSet that should be used as keys.
    :return: An incomplete Join Transformation.
    """
    head = fields[0]
    # Pick the key-extraction strategy: plain function, KeySelectorFunction,
    # or tuple-index based extraction.
    if isinstance(head, TYPES.FunctionType):
        extractor = lambda rec: (head(rec),)
    elif isinstance(head, KeySelectorFunction):
        extractor = lambda rec: (head.get_key(rec),)
    else:
        extractor = lambda rec: tuple([rec[pos] for pos in fields])
    # Prefix every record of the second input with its extracted key tuple.
    keyed_other = self._info.other_set.map(lambda rec: (extractor(rec), rec))
    keyed_other._info.types = _createKeyValueTypeInfo(len(fields))
    self._info.other = keyed_other._info
    self._info.other.parallelism = self._info.parallelism
    self._info.other.children.append(self._info)
    # Keys now refer to positions inside the prefixed key tuple.
    self._info.key2 = tuple(range(len(fields)))
    self._env._sets.append(self._info)
    return JoinOperator(self._env, self._info)
def equal_to(self, *fields):
    """
    Continues a Join transformation.

    Defines the Tuple fields of the second join DataSet that should be
    used as join keys.
    Note: Fields can only be selected as join keys on Tuple DataSets.

    :param fields: The indexes of the Tuple fields of the second join
                   DataSet that should be used as keys.
    :return: An incomplete Join Transformation.
    """
    first = fields[0]
    # Choose how to derive the key from a record of the second input.
    if isinstance(first, TYPES.FunctionType):
        def key_of(record):
            return (first(record),)
    elif isinstance(first, KeySelectorFunction):
        def key_of(record):
            return (first.get_key(record),)
    else:
        def key_of(record):
            return tuple([record[pos] for pos in fields])
    # Re-map the second input to (key-tuple, record) pairs.
    mapped = self._info.other_set.map(lambda record: (key_of(record), record))
    mapped._info.types = _createKeyValueTypeInfo(len(fields))
    other_info = mapped._info
    self._info.other = other_info
    other_info.parallelism = self._info.parallelism
    other_info.children.append(self._info)
    # After re-mapping, the join keys are the positions in the key tuple.
    self._info.key2 = tuple(i for i in range(len(fields)))
    self._env._sets.append(self._info)
    return JoinOperator(self._env, self._info)
def _finalize(self):
    """
    Rewrites a grouping/sorting chain into a single key-extractor map.

    Collects every tuple-index key used by the grouping and the sort
    operations, builds one extractor per key (index-based extractors
    first, then the grouping key-selector, then sort key-selectors),
    installs a map on the parent operation that prefixes each record
    with the extracted key tuple, and finally rewrites all grouping and
    sorting key references so they point into that key tuple.

    Fix: removed the dead local counter ``i`` that was incremented but
    never read, and replaced the manual append loop with a comprehension.
    """
    grouping = self._child_chain[0]
    sortings = self._child_chain[1:]
    # Set of used index keys to prevent duplicates and determine each
    # key's final position in the extracted key tuple.
    index_keys = set()
    if not isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        index_keys = index_keys.union(set(grouping.keys))
    index_sorts = []  # sorts using tuple indices
    ksl_sorts = []    # sorts using key-selector functions
    for s in sortings:
        if not isinstance(s.field, (TYPES.FunctionType, KeySelectorFunction)):
            index_keys.add(s.field)
            index_sorts.append(s)
        else:
            ksl_sorts.append(s)
    used_keys = sorted(index_keys)
    # All data gathered — construct the list of extractor lambdas.
    # ``k=key`` binds the current key as a default, avoiding Python's
    # late-binding closure pitfall.
    lambdas = [lambda x, k=key: x[k] for key in used_keys]
    if isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        lambdas.append(grouping.keys[0])
    for ksl_op in ksl_sorts:
        lambdas.append(ksl_op.field)
    grouping.parent.operator.map = lambda x: (tuple([l(x) for l in lambdas]), x)
    grouping.parent.types = _createKeyValueTypeInfo(len(lambdas))
    # Modify keys: index keys become positions in ``used_keys``;
    # key-selector keys occupy the slots after them, in order.
    ksl_offset = len(used_keys)
    if not isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        grouping.keys = tuple([used_keys.index(key) for key in grouping.keys])
    else:
        grouping.keys = (ksl_offset,)
        ksl_offset += 1
    for iop in index_sorts:
        iop.field = used_keys.index(iop.field)
    for kop in ksl_sorts:
        kop.field = ksl_offset
        ksl_offset += 1
def _distinct(self, fields):
    """
    Appends a DISTINCT operation keyed on the given fields.

    :param fields: The key fields on which duplicates are eliminated.
    :return: The DataSet representing the distinct operation.
    """
    self._info.types = _createKeyValueTypeInfo(len(fields))
    info = OperationInfo()
    distinct_set = DataSet(self._env, info)
    info.identifier = _Identifier.DISTINCT
    info.parent = self._info
    info.keys = fields
    # Register the new operation with this set and the environment.
    self._info.children.append(info)
    self._env._sets.append(info)
    return distinct_set
def _finalize(self):
    """
    Rewrites a grouping/sorting chain into a single key-extractor map.

    Gathers all tuple-index keys referenced by the grouping and the sort
    operations, builds one extractor per key (index extractors first,
    then the grouping key-selector, then the sort key-selectors), makes
    the parent operation map each record to (key-tuple, record), and
    rewrites the grouping/sorting key references to positions within
    that key tuple.

    Fix: removed the dead local counter ``i`` that was incremented but
    never read; the extractor list is now built with a comprehension.
    """
    grouping = self._child_chain[0]
    sortings = self._child_chain[1:]
    # Set of used index keys: prevents duplicates and fixes each key's
    # final position in the extracted key tuple.
    index_keys = set()
    if not isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        index_keys = index_keys.union(set(grouping.keys))
    index_sorts = []  # sorts addressed by tuple index
    ksl_sorts = []    # sorts addressed by key-selector function
    for s in sortings:
        if not isinstance(s.field, (TYPES.FunctionType, KeySelectorFunction)):
            index_keys.add(s.field)
            index_sorts.append(s)
        else:
            ksl_sorts.append(s)
    used_keys = sorted(index_keys)
    # Build the extractor lambdas; ``k=key`` binds the loop value as a
    # default argument to sidestep late-binding closures.
    lambdas = [lambda x, k=key: x[k] for key in used_keys]
    if isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        lambdas.append(grouping.keys[0])
    for ksl_op in ksl_sorts:
        lambdas.append(ksl_op.field)
    grouping.parent.operator.map = lambda x: (tuple([l(x) for l in lambdas]), x)
    grouping.parent.types = _createKeyValueTypeInfo(len(lambdas))
    # Rewrite keys: index keys map to their slot in ``used_keys``;
    # key-selector keys take the following slots in order.
    ksl_offset = len(used_keys)
    if not isinstance(grouping.keys[0], (TYPES.FunctionType, KeySelectorFunction)):
        grouping.keys = tuple([used_keys.index(key) for key in grouping.keys])
    else:
        grouping.keys = (ksl_offset,)
        ksl_offset += 1
    for iop in index_sorts:
        iop.field = used_keys.index(iop.field)
    for kop in ksl_sorts:
        kop.field = ksl_offset
        ksl_offset += 1
def _finalize(self):
    """
    Installs a key-extracting map on the parent operation and rewrites
    the grouping keys to positional indices into the extracted key tuple.
    """
    grouping = self._child_chain[0]
    keys = grouping.keys
    first = keys[0]
    # Determine how the key tuple is derived from a record.
    if isinstance(first, TYPES.FunctionType):
        extract = lambda rec: (first(rec),)
    elif isinstance(first, KeySelectorFunction):
        extract = lambda rec: (first.get_key(rec),)
    else:
        extract = lambda rec: tuple([rec[k] for k in keys])
    # Parent now emits (key-tuple, record) pairs.
    grouping.parent.operator.map = lambda rec: (extract(rec), rec)
    grouping.parent.types = _createKeyValueTypeInfo(len(keys))
    # Keys become positions within the prefixed key tuple.
    grouping.keys = tuple(range(len(keys)))
def _finalize(self):
    """
    Prefixes each record of the parent operation with its grouping key
    tuple and rewrites the grouping keys to indices into that tuple.
    """
    grouping = self._child_chain[0]
    keys = grouping.keys
    head = keys[0]
    # Build the key extractor for the detected key style.
    if isinstance(head, TYPES.FunctionType):
        def key_of(record):
            return (head(record),)
    elif isinstance(head, KeySelectorFunction):
        def key_of(record):
            return (head.get_key(record),)
    else:
        def key_of(record):
            return tuple([record[pos] for pos in keys])
    grouping.parent.operator.map = lambda record: (key_of(record), record)
    grouping.parent.types = _createKeyValueTypeInfo(len(keys))
    # After the re-map, keys are simply 0..len(keys)-1.
    grouping.keys = tuple(i for i in range(len(keys)))
def _partition_by_hash(self, fields):
    """
    Hash-partitions a DataSet on the specified key fields.

    Important: This operation shuffles the whole DataSet over the network
    and can take significant amount of time.

    :param fields: The field indexes on which the DataSet is hash-partitioned.
    :return: The partitioned DataSet.
    """
    self._info.types = _createKeyValueTypeInfo(len(fields))
    info = OperationInfo()
    partitioned_set = DataSet(self._env, info)
    info.identifier = _Identifier.PARTITION_HASH
    info.parent = self._info
    info.keys = fields
    # Register the new operation with this set and the environment.
    self._info.children.append(info)
    self._env._sets.append(info)
    return partitioned_set
def equal_to(self, *fields):
    """
    Continues a CoGroup transformation.

    Defines the Tuple fields of the second co-grouped DataSet that should
    be used as grouping keys.
    Note: Fields can only be selected as grouping keys on Tuple DataSets.

    :param fields: The indexes of the Tuple fields of the second
                   co-grouped DataSet that should be used as keys.
    :return: An incomplete CoGroup transformation.
    """
    head = fields[0]
    # Pick the key-extraction strategy for the second input.
    if isinstance(head, TYPES.FunctionType):
        extractor = lambda rec: (head(rec),)
    elif isinstance(head, KeySelectorFunction):
        extractor = lambda rec: (head.get_key(rec),)
    else:
        extractor = lambda rec: tuple([rec[pos] for pos in fields])
    # Prefix every record of the second input with its extracted key tuple.
    keyed_other = self._info.other_set.map(lambda rec: (extractor(rec), rec))
    keyed_other._info.types = _createKeyValueTypeInfo(len(fields))
    self._info.other = keyed_other._info
    self._info.other.children.append(self._info)
    self._info.key2 = fields
    return CoGroupOperatorUsing(self._env, self._info)
def equal_to(self, *fields):
    """
    Continues a CoGroup transformation.

    Defines the Tuple fields of the second co-grouped DataSet that should
    be used as grouping keys.
    Note: Fields can only be selected as grouping keys on Tuple DataSets.

    :param fields: The indexes of the Tuple fields of the second
                   co-grouped DataSet that should be used as keys.
    :return: An incomplete CoGroup transformation.
    """
    first = fields[0]
    # Choose how to derive the key from a record of the second input.
    if isinstance(first, TYPES.FunctionType):
        def key_of(record):
            return (first(record),)
    elif isinstance(first, KeySelectorFunction):
        def key_of(record):
            return (first.get_key(record),)
    else:
        def key_of(record):
            return tuple([record[pos] for pos in fields])
    # Re-map the second input to (key-tuple, record) pairs.
    mapped = self._info.other_set.map(lambda record: (key_of(record), record))
    mapped._info.types = _createKeyValueTypeInfo(len(fields))
    self._info.other = mapped._info
    self._info.other.children.append(self._info)
    self._info.key2 = fields
    return CoGroupOperatorUsing(self._env, self._info)