def _recursion_helper(
    query_path: types.FeaturePath,
    array: pa.Array,
    example_indices: Optional[np.ndarray],
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves `query_path` step by step through nested struct arrays.

    For every step the current array is flattened with `flatten_nested`, the
    struct field named by the step is selected, and (when example indices are
    being tracked) the indices are re-indexed with the parent indices that the
    flattening returned.

    Args:
      query_path: The steps still to be resolved, relative to `array`.
      array: The array in which the first step of `query_path` is looked up.
      example_indices: Example indices aligned with `array`, or None when
        example indices are not tracked.

    Returns:
      A tuple of the array reached once all steps are consumed and the
      matching example indices (None if they were not tracked). If the final
      array is a struct array and `wrap_flat_struct_in_list` is truthy, it is
      passed through `array_util.ToSingletonListArray` before being returned.

    Raises:
      KeyError: If a step has to be resolved inside an array whose innermost
        nested type is not a struct, or if a step is not a field of the
        struct.
    """
    # NOTE: `wrap_flat_struct_in_list` is not a parameter or local of this
    # function; it is read from the surrounding scope.
    remaining = query_path
    current = array
    indices = example_indices

    while remaining:
        current_type = current.type
        if not pa.types.is_struct(get_innermost_nested_type(current_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a struct<...> or '
                '(large_)list...<struct<...>>.'.format(
                    remaining, current_type))

        tracking = indices is not None
        flattened, parent_indices = flatten_nested(current, tracking)
        if tracking:
            # Carry each example index over to the flattened elements.
            indices = indices[parent_indices]

        steps = remaining.steps()
        step = steps[0]
        try:
            current = flattened.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(steps[1:])

    if pa.types.is_struct(current.type) and wrap_flat_struct_in_list:
        current = array_util.ToSingletonListArray(current)
    return current, indices
def _recursion_helper(
    query_path: types.FeaturePath,
    array: pa.Array,
    example_indices: Optional[np.ndarray],
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves `query_path` step by step through list<struct> arrays.

    For every step the current array must be list-like with a struct value
    type. It is flattened, the struct field named by the step is selected,
    and (when example indices are being tracked) the indices are re-indexed
    with the parent indices of the flattened elements.

    Args:
      query_path: The steps still to be resolved, relative to `array`.
      array: The array in which the first step of `query_path` is looked up.
      example_indices: Example indices aligned with `array`, or None when
        example indices are not tracked.

    Returns:
      A tuple of the array reached once all steps are consumed and the
      matching example indices (None if they were not tracked).

    Raises:
      KeyError: If a step has to be resolved inside an array that is not a
        list-like array of structs, or if a step is not a field of the
        struct.
    """
    remaining = query_path
    current = array
    indices = example_indices

    while remaining:
        current_type = current.type
        is_list_of_struct = (
            is_list_like(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_struct:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    remaining, current_type))

        flattened = current.flatten()
        if indices is not None:
            # Carry each example index over to the flattened elements.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            indices = indices[parent_indices]

        steps = remaining.steps()
        step = steps[0]
        try:
            current = flattened.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(steps[1:])

    return current, indices
def _get_sparse_feature(
    schema: schema_pb2.Schema, feature_path: types.FeaturePath
) -> schema_pb2.SparseFeature:
    """Returns the sparse feature at `feature_path` from the schema.

    The parent steps of `feature_path` are resolved as STRUCT features,
    starting from the top-level features of the schema and descending through
    each `struct_domain`. The last step is then looked up among the sparse
    features of the innermost struct (or among the top-level sparse features
    when the path has no parent).

    Args:
      schema: The Schema proto to search.
      feature_path: Path of the sparse feature to look up.

    Returns:
      The sparse feature found at `feature_path`.

    Raises:
      TypeError: If `schema` is not a Schema proto.
      ValueError: If a parent step is missing from the schema or is not a
        STRUCT feature, or if the sparse feature itself is not found.
    """
    if not isinstance(schema, schema_pb2.Schema):
        raise TypeError('schema is of type %s, should be a Schema proto.' %
                        type(schema).__name__)

    # Default: a path without parent refers to a top-level sparse feature.
    sparse_feature_container = schema.sparse_feature
    parent = feature_path.parent()
    if parent:
        # Sparse features do not have a struct_domain and so can be only
        # leaves. Thus, we can assume that all parent steps are features, not
        # sparse features.
        feature_container = schema.feature
        for step in parent.steps():
            f = schema_util.look_up_feature(step, feature_container)
            if f is None:
                raise ValueError(
                    'Feature %s not found in the schema.' % feature_path)
            if f.type != schema_pb2.STRUCT:
                raise ValueError(
                    'Step %s in feature %s does not refer to a valid STRUCT '
                    'feature' % (step, feature_path))
            # Keep descending through *features*; the next parent step must be
            # looked up among the struct's features, not its sparse features.
            feature_container = f.struct_domain.feature
            # Remember where the leaf would live if this is the last parent.
            sparse_feature_container = f.struct_domain.sparse_feature

    feature = _look_up_sparse_feature(
        feature_path.steps()[-1], sparse_feature_container)
    if feature is None:
        raise ValueError(
            'Sparse Feature %s not found in the schema.' % feature_path)
    return feature
def _PartitionTransform(pcol, row_partitions: int, column_partitions: int,
                        label_feature: types.FeaturePath, seed: int):
    """Applies a `_PartitionFn` ParDo, labelled "PartitionRowsCols", to `pcol`.

    Args:
      pcol: The collection the ParDo is applied to.
      row_partitions: Forwarded to `_PartitionFn`.
      column_partitions: Forwarded to `_PartitionFn`.
      label_feature: Path of the label feature; only its first step is
        forwarded to `_PartitionFn`.
      seed: Forwarded to `_PartitionFn`.

    Returns:
      The result of applying the labelled ParDo to `pcol`.

    Raises:
      ValueError: If `label_feature` has no steps.
    """
    label_steps = label_feature.steps()
    if not label_steps:
        raise ValueError("Empty label feature")

    # The first step of the label path is used as the label column name.
    partition_fn = _PartitionFn(
        row_partitions, column_partitions, label_steps[0], seed)
    labelled_pardo = "PartitionRowsCols" >> beam.ParDo(partition_fn)
    return pcol | labelled_pardo
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    return_example_indices: bool,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally example indices) from a table.

    Only the first chunk of the looked-up column is read, so every column in
    `table` is assumed to consist of a single chunk.

    `table` is assumed to contain only arrays of the following supported
    types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types (recursion
        intended).

    If the path refers to a leaf in the table, a ListArray with a primitive
    element type is returned. Otherwise a ListArray with a StructArray
    element type is returned.

    Args:
      table: The Table whose arrays are to be visited. It is assumed to
        contain only one chunk.
      query_path: The FeaturePath to look up in the table.
      return_example_indices: Whether to also return an array holding the
        example indices of the elements of the array found at `query_path`.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      example indices array for the feature array (i.e. array[i] came from
      the example at row example_indices[i] in the table), or None when
      `return_example_indices` is false.

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    column_name = query_path.steps()[0]
    try:
        current = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))

    remaining = types.FeaturePath(query_path.steps()[1:])
    indices = None
    if return_example_indices:
        indices = np.arange(table.num_rows)

    # Descend one struct level per remaining step.
    while remaining:
        current_type = current.type
        is_list_of_struct = (
            is_list_like(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_struct:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    remaining, current_type))

        flattened = current.flatten()
        if indices is not None:
            # Carry each example index over to the flattened elements.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            indices = indices[parent_indices]

        steps = remaining.steps()
        step = steps[0]
        try:
            current = flattened.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(steps[1:])

    return current, indices
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally example indices).

    This function makes the same assumptions about `record_batch` as
    `enumerate_arrays()` does.

    If the path refers to a leaf in `record_batch`, a "nested_list" is
    returned. If it does not refer to a leaf, a "struct" will be returned.
    See `enumerate_arrays()` for the definition of "nested_list" and
    "struct".

    Args:
      record_batch: The RecordBatch whose arrays are to be visited.
      query_path: The FeaturePath to look up in the record batch.
      return_example_indices: Whether to also return an array holding the
        example indices of the elements of the array found at `query_path`.
      wrap_flat_struct_in_list: If True, and `query_path` leads to a
        struct<[Ts]> array, that array is wrapped in a list array in which
        each sub-list contains one element. Callers can use this option to
        assume this function always returns a list<inner_type>.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      example indices array for the feature array (i.e. array[i] came from
      the example at row example_indices[i] in the record batch), or None
      when `return_example_indices` is false.

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the record
        batch and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    column_name = query_path.steps()[0]
    field_index = record_batch.schema.get_field_index(column_name)
    if field_index < 0:
        raise KeyError('query_path step 0 "{}" not in record batch.'
                       .format(column_name))

    current = record_batch.column(field_index)
    remaining = types.FeaturePath(query_path.steps()[1:])
    indices = None
    if return_example_indices:
        indices = np.arange(record_batch.num_rows)

    # Descend one struct level per remaining step.
    while remaining:
        current_type = current.type
        if not pa.types.is_struct(get_innermost_nested_type(current_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a struct<...> or '
                '(large_)list...<struct<...>>.'.format(
                    remaining, current_type))

        tracking = indices is not None
        flattened, parent_indices = flatten_nested(current, tracking)
        if tracking:
            # Carry each example index over to the flattened elements.
            indices = indices[parent_indices]

        steps = remaining.steps()
        step = steps[0]
        try:
            current = flattened.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(steps[1:])

    if pa.types.is_struct(current.type) and wrap_flat_struct_in_list:
        current = array_util.ToSingletonListArray(current)
    return current, indices
def _prepend_slice_path(slice_name: str,
                        path: types.FeaturePath) -> types.FeaturePath:
    """Returns a copy of `path` whose first step is prefixed with the slice.

    The first step becomes 'slice(<slice_name>)::<first step>'; all remaining
    steps are kept as they are.

    Args:
      slice_name: Name of the slice to embed in the first step.
      path: The feature path to prefix. Its first step is read
        unconditionally.

    Returns:
      A new FeaturePath with the prefixed first step.
    """
    steps = path.steps()
    first_step = steps[0]
    other_steps = steps[1:]
    prefixed_first_step = 'slice(%s)::' % slice_name + first_step
    return types.FeaturePath((prefixed_first_step,) + other_steps)
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally broadcast values) from a table.

    Only the first chunk of the looked-up column is read, so every column in
    `table` is assumed to consist of a single chunk.

    `table` is assumed to contain only arrays of the following supported
    types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types (recursion
        intended).

    If the path refers to a leaf in the table, a ListArray with a primitive
    element type is returned. Otherwise a ListArray with a StructArray
    element type is returned.

    Args:
      table: The Table whose arrays are to be visited. It is assumed to
        contain only one chunk.
      query_path: The FeaturePath to look up in the table.
      broadcast_column_name: The name of a column to broadcast, or None. Each
        list in that column should contain exactly one value.

    Returns:
      A tuple. The first term is the feature array and the second term is the
      broadcast column array for the feature array (i.e. broadcast_column[i]
      is the corresponding value for array[i]), or None when
      `broadcast_column_name` is None.

    Raises:
      ValueError: When the broadcast column is not a list array or its
        elements are not 1-element arrays (presumably raised by
        `get_broadcastable_column`, which is not defined here).
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    column_name = query_path.steps()[0]
    try:
        current = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))

    remaining = types.FeaturePath(query_path.steps()[1:])
    weights = None
    if broadcast_column_name is not None:
        weights = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))

    # Descend one struct level per remaining step.
    while remaining:
        current_type = current.type
        is_list_of_struct = (
            pa.types.is_list(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_struct:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining, current_type))

        flattened = current.flatten()
        if weights is not None:
            # Repeat each broadcast value for every flattened element.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            weights = weights[parent_indices]

        steps = remaining.steps()
        step = steps[0]
        try:
            current = flattened.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(steps[1:])

    return current, weights