def _map_fields(self, field_map, field_sources, field_sinks):
    """
    Apply 'field_map' to every entry of 'field_sources'.

    Three sink configurations are handled:
      * field_sinks is None: the mapped arrays are collected and returned
        as a tuple
      * field_sinks holds groups/fields: each mapped array is written to
        the corresponding sink field and None is returned
      * field_sinks holds raw arrays: the mapping is written directly into
        each sink array and None is returned
    """
    if field_sinks is None:
        # No destinations supplied; gather the mapped results to return.
        mapped = []
        for source in field_sources:
            source_arr = val.array_from_parameter(
                self, 'left_field_sources', source)
            mapped.append(ops.map_valid(source_arr, field_map))
        return tuple(mapped)

    if val.is_field_parameter(field_sinks[0]):
        # Sinks are groups or fields: write each mapping via the field API.
        for source, sink in zip(field_sources, field_sinks):
            source_arr = val.array_from_parameter(
                self, 'left_field_sources', source)
            mapped_arr = ops.map_valid(source_arr, field_map)
            sink_field = val.field_from_parameter(
                self, 'left_field_sinks', sink)
            sink_field.data.write(mapped_arr)
        return None

    # Sinks are raw arrays: map straight into the destination buffers.
    for source, sink in zip(field_sources, field_sinks):
        source_arr = val.array_from_parameter(
            self, 'left_field_sources', source)
        sink_arr = val.array_from_parameter(
            self, 'left_field_sinks', sink)
        ops.map_valid(source_arr, field_map, sink_arr)
    return None
def get_spans(self, field=None, fields=None):
    """
    Calculate a set of spans that indicate contiguous equal values.

    The entries in the result array correspond to the inclusive start and
    exclusive end of the span (the ith span is represented by element i and
    element i+1 of the result array). The last entry of the result array is
    the length of the source field.

    Only one of 'field' or 'fields' may be set. If 'fields' is used and
    more than one field specified, the fields are effectively zipped and
    the check for spans is carried out on each corresponding tuple in the
    zipped field.

    Example:
        field: [1, 2, 2, 1, 1, 1, 3, 4, 4, 4, 2, 2, 2, 2, 2]
        result: [0, 1, 3, 6, 7, 10, 15]
    """
    # Exactly one of the two parameters must be provided.
    if field is None and fields is None:
        raise ValueError("One of 'field' and 'fields' must be set")
    if field is not None and fields is not None:
        raise ValueError("Only one of 'field' and 'fields' may be set")

    raw_field = None
    raw_fields = []
    if field is not None:
        raw_field = val.array_from_parameter(self, 'field', field)
    if fields is not None:
        raw_fields = [
            val.array_from_parameter(self, "'fields[{}]'".format(i_f), f)
            for i_f, f in enumerate(fields)
        ]
    return per._get_spans(raw_field, raw_fields)
def apply_index(self, index_to_apply, src, dest=None):
    """
    Apply an index to a src field. The indexed field is written to dest if
    it is set, and returned from the function call. If the field is an
    IndexedStringField, the indices and values are returned separately.

    :param index_to_apply: the index to be applied to the source field
    :param src: the field to be indexed
    :param dest: optional - a field to write the indexed data to
    :return: the indexed values; a (indices, values) pair for indexed
        string fields
    """
    index_to_apply_ = val.array_from_parameter(self, 'index_to_apply',
                                               index_to_apply)
    writer_ = None
    if dest is not None:
        writer_ = val.field_from_parameter(self, 'writer', dest)
    if isinstance(src, fld.IndexedStringField):
        src_ = val.field_from_parameter(self, 'reader', src)
        dest_indices, dest_values = \
            ops.apply_indices_to_index_values(index_to_apply_,
                                              src_.indices[:],
                                              src_.values[:])
        if writer_ is not None:
            writer_.indices.write(dest_indices)
            writer_.values.write(dest_values)
        return dest_indices, dest_values
    else:
        reader_ = val.array_from_parameter(self, 'reader', src)
        # Bug fix: index with the validated array ('index_to_apply_')
        # rather than the raw parameter, which may be a field/group that
        # does not support ndarray indexing.
        result = reader_[index_to_apply_]
        # Consistency fix: test against None (as in the branch above)
        # rather than truthiness, which for a field may depend on length.
        if writer_ is not None:
            writer_.data.write(result)
        return result
def _apply_spans_src(self, predicate, spans, src, dest=None):
    """
    Apply 'predicate' over each span of 'src', producing one value per span.

    :param predicate: a callable of (spans, src, results) that fills
        'results' with one entry per span
    :param spans: span boundary array; spans[-1] must equal len(src)
    :param src: the source data to aggregate over
    :param dest: optional - a field to write the per-span results to
    :return: the per-span results array
    :raises ValueError: if the source length does not match spans[-1]
    """
    assert (dest is None or isinstance(dest, fld.Field))
    src_ = val.array_from_parameter(self, 'src', src)
    # Bug fix: validate the converted array ('src_') rather than the raw
    # parameter, and report the actual requirement: the source length must
    # equal the exclusive end of the final span (spans[-1]), not be "one
    # element shorter than 'spans'".
    if len(src_) != spans[-1]:
        error_msg = (
            "'src' (length {}) must have the same length as the final "
            "element of 'spans' ({})")
        raise ValueError(error_msg.format(len(src_), spans[-1]))
    if dest is not None:
        dest_f = val.field_from_parameter(self, 'dest', dest)
        # Result dtype follows the destination field when one is given.
        results = np.zeros(len(spans) - 1, dtype=dest_f.data.dtype)
        predicate(spans, src_, results)
        dest_f.data.write(results)
        return results
    results = np.zeros(len(spans) - 1, dtype=src_.dtype)
    predicate(spans, src_, results)
    return results
def ordered_merge_left(self, left_on, right_on, right_field_sources=tuple(),
                       left_field_sinks=None, left_to_right_map=None,
                       left_unique=False, right_unique=False):
    """
    Generate the results of a left join and apply it to the fields described
    in the tuple 'right_field_sources'. If 'left_field_sinks' is set, the
    mapped values are written to the fields / arrays set there.

    Note: in order to achieve best scalability, you should use groups /
    fields rather than numpy arrays and provide a tuple of groups/fields to
    left_field_sinks, so that the session can compute the merge and apply
    the mapping in a streaming fashion.

    :param left_on: the group/field/numba array that contains the left key
        values
    :param right_on: the group/field/numba array that contains the right
        key values
    :param right_field_sources: a tuple of group/fields/numba arrays that
        contain the fields to be joined
    :param left_field_sinks: optional - a tuple of group/fields/numba
        arrays that the mapped fields should be written to
    :param left_to_right_map: a group/field/numba array that the map is
        written to. If it is a numba array, it must be the size of the
        resulting merge
    :param left_unique: a hint to indicate whether the 'left_on' field
        contains unique values
    :param right_unique: a hint to indicate whether the 'right_on' field
        contains unique values
    :return: If left_field_sinks is not set, a tuple of the output fields
        is returned; otherwise None
    :raises ValueError: if sources/sinks lengths differ, or the right key
        is not flagged unique
    """
    if left_field_sinks is not None:
        if len(right_field_sources) != len(left_field_sinks):
            msg = ("{} and {} should be of the same length but are length "
                   "{} and {} respectively")
            # Bug fix: the message has four placeholders but only two
            # arguments were supplied, so the raise itself crashed with
            # IndexError; pass the parameter names as well as the lengths.
            raise ValueError(msg.format('right_field_sources',
                                        'left_field_sinks',
                                        len(right_field_sources),
                                        len(left_field_sinks)))
    val.all_same_basic_type('left_field_sources', right_field_sources)
    if left_field_sinks and len(left_field_sinks) > 0:
        val.all_same_basic_type('left_field_sinks', left_field_sinks)

    # Streaming requires keys, sources and sinks to all be groups/fields
    # and a destination for the map. Robustness fix: guard the tuple
    # lengths before indexing element 0.
    streamable = (val.is_field_parameter(left_on) and
                  val.is_field_parameter(right_on) and
                  len(right_field_sources) > 0 and
                  val.is_field_parameter(right_field_sources[0]) and
                  left_field_sinks is not None and
                  len(left_field_sinks) > 0 and
                  val.is_field_parameter(left_field_sinks[0]) and
                  left_to_right_map is not None)

    # Both original branches raised for a non-unique right key, so hoist
    # the single check here rather than duplicating it.
    if not right_unique:
        raise ValueError("Right key must not have duplicates")

    has_unmapped = None
    if not left_unique:
        if streamable:
            has_unmapped = ops.ordered_map_to_right_right_unique_streamed(
                left_on, right_on, left_to_right_map)
            result = left_to_right_map
        else:
            result = np.zeros(len(left_on), dtype=np.int64)
            left_data = val.array_from_parameter(self, "left_on", left_on)
            right_data = val.array_from_parameter(self, "right_on", right_on)
            has_unmapped = ops.ordered_map_to_right_right_unique(
                left_data, right_data, result)
    else:
        result = np.zeros(len(left_on), dtype=np.int64)
        left_data = val.array_from_parameter(self, "left_on", left_on)
        right_data = val.array_from_parameter(self, "right_on", right_on)
        has_unmapped = ops.ordered_map_to_right_both_unique(
            left_data, right_data, result)

    if streamable:
        self._streaming_map_fields(result, right_field_sources,
                                   left_field_sinks)
        return None
    return self._map_fields(result, right_field_sources, left_field_sinks)
def ordered_merge_inner(self, left_on, right_on,
                        left_field_sources=tuple(), left_field_sinks=None,
                        right_field_sources=tuple(), right_field_sinks=None,
                        left_unique=False, right_unique=False):
    """
    Generate the mappings for an inner join between ordered left and right
    keys and apply them to the given left and right source fields.

    :param left_on: the group/field/numba array that contains the left key
        values
    :param right_on: the group/field/numba array that contains the right
        key values
    :param left_field_sources: a tuple of group/fields/numba arrays on the
        left side to be joined
    :param left_field_sinks: optional - destinations for the mapped left
        fields
    :param right_field_sources: a tuple of group/fields/numba arrays on the
        right side to be joined
    :param right_field_sinks: optional - destinations for the mapped right
        fields
    :param left_unique: a hint to indicate whether the 'left_on' field
        contains unique values
    :param right_unique: a hint to indicate whether the 'right_on' field
        contains unique values
    :return: the mapped left sinks, the mapped right sinks, or a
        (left, right) pair when both are produced
    :raises ValueError: if a sources tuple and its sinks tuple differ in
        length
    """
    def _validate(sources_name, sources, sinks_name, sinks):
        # Shared validation for the left and right source/sink pairs.
        if sinks is not None:
            if len(sources) != len(sinks):
                msg = ("{} and {} should be of the same length but are "
                       "length {} and {} respectively")
                # Bug fix: the message has four placeholders but only two
                # arguments were supplied, which made the raise itself
                # crash with IndexError.
                raise ValueError(msg.format(sources_name, sinks_name,
                                            len(sources), len(sinks)))
        val.all_same_basic_type(sources_name, sources)
        if sinks and len(sinks) > 0:
            val.all_same_basic_type(sinks_name, sinks)

    _validate('left_field_sources', left_field_sources,
              'left_field_sinks', left_field_sinks)
    _validate('right_field_sources', right_field_sources,
              'right_field_sinks', right_field_sinks)

    left_data = val.array_from_parameter(self, 'left_on', left_on)
    right_data = val.array_from_parameter(self, 'right_on', right_on)

    inner_length = ops.ordered_inner_map_result_size(left_data, right_data)
    left_to_inner = np.zeros(inner_length, dtype=np.int64)
    right_to_inner = np.zeros(inner_length, dtype=np.int64)

    if left_unique:
        if right_unique:
            ops.ordered_inner_map_both_unique(
                left_data, right_data, left_to_inner, right_to_inner)
        else:
            ops.ordered_inner_map_left_unique(
                left_data, right_data, left_to_inner, right_to_inner)
    else:
        if right_unique:
            # Only the right key is unique: reuse the left-unique kernel
            # with the operands (and output maps) swapped.
            ops.ordered_inner_map_left_unique(
                right_data, left_data, right_to_inner, left_to_inner)
        else:
            ops.ordered_inner_map(
                left_data, right_data, left_to_inner, right_to_inner)

    rtn_left_sinks = self._map_fields(left_to_inner, left_field_sources,
                                      left_field_sinks)
    rtn_right_sinks = self._map_fields(right_to_inner, right_field_sources,
                                       right_field_sinks)

    if rtn_left_sinks:
        if rtn_right_sinks:
            return rtn_left_sinks, rtn_right_sinks
        return rtn_left_sinks
    return rtn_right_sinks