示例#1
0
  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves `query_path` against `array`, one struct field per call.

    Each call consumes the first step of `query_path`: `array` is flattened
    with `flatten_nested`, the field named by that step is read from the
    resulting struct array, and the function recurses on the rest of the path.

    `wrap_flat_struct_in_list` is not a parameter; it is read from the
    enclosing scope.

    Args:
      query_path: Steps still to be resolved. An empty path ends the recursion.
      array: The array the remaining steps are resolved against.
      example_indices: Optional index array that is re-indexed with the parent
        indices returned by `flatten_nested` at every level. When None, no
        index bookkeeping is done and None is returned.

    Returns:
      A tuple of the array found at the end of the path and the (possibly
      None) index array that went through the same flattening steps. If the
      final array is of struct type and `wrap_flat_struct_in_list` is truthy,
      the array is first passed through `array_util.ToSingletonListArray`.

    Raises:
      KeyError: If steps remain but the innermost nested type of `array` is
        not a struct, or if the next step is not a field of the struct.
    """
    current_type = array.type

    # Base case: nothing left to resolve.
    if not query_path:
      needs_wrapping = (
          pa.types.is_struct(current_type) and wrap_flat_struct_in_list)
      if needs_wrapping:
        return array_util.ToSingletonListArray(array), example_indices
      return array, example_indices

    innermost_type = get_innermost_nested_type(current_type)
    if not pa.types.is_struct(innermost_type):
      message_template = (
          'Cannot process query_path "{}" inside an array of type '
          '{}. Expecting a struct<...> or '
          '(large_)list...<struct<...>>.')
      raise KeyError(message_template.format(query_path, current_type))

    # Parent indices are only requested when there is something to re-index.
    track_indices = example_indices is not None
    struct_values, parent_positions = flatten_nested(array, track_indices)
    child_indices = example_indices[parent_positions] if track_indices else None

    head = query_path.steps()[0]
    try:
      field_values = struct_values.field(head)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(head))

    tail_path = types.FeaturePath(query_path.steps()[1:])
    return _recursion_helper(tail_path, field_values, child_indices)
示例#2
0
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Resolves `query_path` against `array`, one struct field per call.

        Each call flattens `array`, reads the field named by the first step
        of `query_path` from the flattened struct array, and recurses on the
        remaining steps.

        Args:
          query_path: Steps still to be resolved. An empty path ends the
            recursion and `array` is returned as is.
          array: The array the remaining steps are resolved against.
          example_indices: Optional index array that is re-indexed at every
            level with the result of
            `array_util.GetFlattenedArrayParentIndices(array)`. When None, no
            index bookkeeping is done.

        Returns:
          A tuple of the array found at the end of the path and the (possibly
          None) index array that went through the same flattening steps.

        Raises:
          KeyError: If steps remain but `array` does not pass `is_list_like`
            with a struct value type, or if the next step is not a field of
            the struct.
        """
        # Base case: nothing left to resolve.
        if not query_path:
            return array, example_indices

        current_type = array.type
        holds_structs = (is_list_like(current_type)
                         and pa.types.is_struct(current_type.value_type))
        if not holds_structs:
            message_template = (
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.')
            raise KeyError(message_template.format(query_path, current_type))

        struct_values = array.flatten()
        child_indices = None
        if example_indices is not None:
            parent_positions = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            child_indices = example_indices[parent_positions]

        head = query_path.steps()[0]
        try:
            field_values = struct_values.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))

        tail_path = types.FeaturePath(query_path.steps()[1:])
        return _recursion_helper(tail_path, field_values, child_indices)
def _get_sparse_feature(
        schema: schema_pb2.Schema,
        feature_path: types.FeaturePath) -> schema_pb2.SparseFeature:
    """Returns the sparse feature located at `feature_path` in `schema`.

    Every step of the path except the last must name a STRUCT feature; the
    last step is looked up among the sparse features of the innermost struct
    (or among the schema's top-level sparse features when the path has no
    parent).

    Args:
      schema: The Schema proto to search.
      feature_path: Path to the sparse feature. Its last step is the sparse
        feature's name.

    Returns:
      The matching SparseFeature.

    Raises:
      TypeError: If `schema` is not a Schema proto.
      ValueError: If a parent step is not found, a parent step is not a
        STRUCT feature, or the sparse feature itself is not found.
    """
    if not isinstance(schema, schema_pb2.Schema):
        raise TypeError('schema is of type %s, should be a Schema proto.' %
                        type(schema).__name__)

    # Two containers are tracked side by side while descending: the regular
    # features (used to resolve the next parent step) and the sparse features
    # (used only for the final lookup).
    feature_container = schema.feature
    sparse_feature_container = schema.sparse_feature
    parent = feature_path.parent()
    if parent:
        # Sparse features do not have a struct_domain and so can be only leaves.
        # Thus, we can assume that all parent steps are features, not sparse
        # features.
        for step in parent.steps():
            f = schema_util.look_up_feature(step, feature_container)
            if f is None:
                raise ValueError('Feature %s not found in the schema.' %
                                 feature_path)
            if f.type != schema_pb2.STRUCT:
                raise ValueError(
                    'Step %s in feature %s does not refer to a valid STRUCT feature'
                    % (step, feature_path))
            # Keep descending through regular features; previously the
            # container was switched to the sparse features here, which made
            # any further parent step be looked up in the wrong container.
            feature_container = f.struct_domain.feature
            sparse_feature_container = f.struct_domain.sparse_feature

    feature = _look_up_sparse_feature(feature_path.steps()[-1],
                                      sparse_feature_container)
    if feature is None:
        raise ValueError('Sparse Feature %s not found in the schema.' %
                         feature_path)
    return feature
示例#4
0
def _PartitionTransform(pcol, row_partitions: int, column_partitions: int,
                        label_feature: types.FeaturePath, seed: int):
    """Applies `_PartitionFn` to `pcol` through a Beam ParDo.

    Args:
      pcol: The input the ParDo is applied to with the `|` operator.
      row_partitions: Forwarded to `_PartitionFn` as its first argument.
      column_partitions: Forwarded to `_PartitionFn` as its second argument.
      label_feature: Path of the label feature. Only its first step is used,
        as the label's column name.
      seed: Forwarded to `_PartitionFn` as its last argument.

    Returns:
      The result of applying the "PartitionRowsCols" ParDo to `pcol`.

    Raises:
      ValueError: If `label_feature` has no steps.
    """
    label_steps = label_feature.steps()
    if not label_steps:
        raise ValueError("Empty label feature")

    # The column name associated with the label path is its first step.
    label_column = label_steps[0]
    partition_fn = _PartitionFn(row_partitions, column_partitions,
                                label_column, seed)
    return pcol | "PartitionRowsCols" >> beam.ParDo(partition_fn)
示例#5
0
def get_array(
        table: pa.Table, query_path: types.FeaturePath,
        return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Looks up the (possibly nested) array at `query_path` inside `table`.

    The first step of `query_path` selects a column of `table`; only chunk 0
    of that column is read, so every column is expected to consist of a single
    chunk. Each further step selects a field of the struct values obtained by
    flattening the current array, which therefore has to be a
    (large_)list<struct<...>> at that point.

    Args:
      table: The Table to search.
      query_path: The FeaturePath to look up. Must be non-empty.
      return_example_indices: Whether to also compute, for each element of the
        returned array, the row of `table` it came from.

    Returns:
      A tuple. The first term is the array found at `query_path`. The second
      term is None unless `return_example_indices` is set, in which case it is
      the example indices array (i.e. array[i] came from the example at row
      example_indices[i] in the table).

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_name = query_path.steps()[0]
    try:
        current = table.column(top_level_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(top_level_name))

    remaining = types.FeaturePath(query_path.steps()[1:])
    indices = None
    if return_example_indices:
        indices = np.arange(table.num_rows)

    # Walk down one struct level per remaining step.
    while remaining:
        current_type = current.type
        holds_structs = (is_list_like(current_type)
                         and pa.types.is_struct(current_type.value_type))
        if not holds_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    remaining, current_type))

        struct_values = current.flatten()
        if indices is not None:
            # Carry the originating row of every flattened element along.
            parent_positions = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            indices = indices[parent_positions]

        head = remaining.steps()[0]
        try:
            current = struct_values.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        remaining = types.FeaturePath(remaining.steps()[1:])

    return current, indices
示例#6
0
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Looks up the (possibly nested) array at `query_path` in a RecordBatch.

  The first step of `query_path` selects a column of `record_batch`. Each
  further step selects a field of the struct array produced by
  `flatten_nested` on the current array, whose innermost nested type therefore
  has to be a struct at that point.

  Per the original documentation, `record_batch` is expected to satisfy the
  same assumptions as `enumerate_arrays()` (not verifiable from this function
  alone).

  Args:
    record_batch: The RecordBatch to search.
    query_path: The FeaturePath to look up. Must be non-empty.
    return_example_indices: Whether to also compute, for each element of the
      returned array, the row of `record_batch` it came from.
    wrap_flat_struct_in_list: If True and the array found at `query_path` is
      of struct type, it is passed through `array_util.ToSingletonListArray`
      before being returned, so callers can rely on getting a list array.

  Returns:
    A tuple. The first term is the array found at `query_path`. The second
    term is None unless `return_example_indices` is set, in which case it is
    the example indices array (i.e. array[i] came from the example at row
    example_indices[i] in the record_batch).

  Raises:
    KeyError: When `query_path` is empty, or cannot be found in the
      record_batch and its nested struct arrays.
  """
  if not query_path:
    raise KeyError('query_path must be non-empty.')

  top_level_name = query_path.steps()[0]
  column_position = record_batch.schema.get_field_index(top_level_name)
  if column_position < 0:
    raise KeyError('query_path step 0 "{}" not in record batch.'
                   .format(top_level_name))
  current = record_batch.column(column_position)
  remaining = types.FeaturePath(query_path.steps()[1:])

  indices = None
  if return_example_indices:
    indices = np.arange(record_batch.num_rows)

  # Walk down one struct level per remaining step.
  while remaining:
    current_type = current.type
    if not pa.types.is_struct(get_innermost_nested_type(current_type)):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a struct<...> or '
                     '(large_)list...<struct<...>>.'.format(
                         remaining, current_type))

    # Parent indices are only requested when there is something to re-index.
    struct_values, parent_positions = flatten_nested(
        current, indices is not None)
    if indices is not None:
      indices = indices[parent_positions]

    head = remaining.steps()[0]
    try:
      current = struct_values.field(head)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(head))
    remaining = types.FeaturePath(remaining.steps()[1:])

  if pa.types.is_struct(current.type) and wrap_flat_struct_in_list:
    current = array_util.ToSingletonListArray(current)
  return current, indices
def _prepend_slice_path(slice_name: str,
                        path: types.FeaturePath) -> types.FeaturePath:
    """Returns a copy of `path` whose first step is prefixed with the slice.

    The first step becomes 'slice(<slice_name>)::<first step>'; all other
    steps are carried over unchanged.

    Args:
      slice_name: Name of the slice to embed in the first step.
      path: The path to prefix. Its first step is indexed directly, so it is
        expected to have at least one step.

    Returns:
      A new FeaturePath built from the prefixed steps.
    """
    original_steps = path.steps()
    prefix = 'slice(%s)::' % slice_name
    prefixed_head = prefix + original_steps[0]
    new_steps = (prefixed_head, ) + original_steps[1:]
    return types.FeaturePath(new_steps)
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Looks up the array at `query_path` in `table`, with optional broadcast.

    The first step of `query_path` selects a column of `table`; only chunk 0
    of that column is read, so every column is expected to consist of a single
    chunk. Each further step selects a field of the struct values obtained by
    flattening the current array, which therefore has to be a
    list<struct<...>> at that point.

    Args:
      table: The Table to search.
      query_path: The FeaturePath to look up. Must be non-empty.
      broadcast_column_name: The name of a column to broadcast, or None. It
        is loaded through `get_broadcastable_column`; per the original
        documentation each of its lists should contain exactly one value.

    Returns:
      A tuple. The first term is the array found at `query_path`. The second
      term is None when `broadcast_column_name` is None; otherwise it is the
      broadcast column array re-indexed alongside every flattening step (i.e.
      broadcast_column[i] is the corresponding value for array[i]).

    Raises:
      ValueError: Presumably propagated from `get_broadcastable_column` when
        the broadcast column does not have the expected shape; that helper is
        not part of this function, so confirm against its implementation.
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    top_level_name = query_path.steps()[0]
    try:
        current = table.column(top_level_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(top_level_name))

    remaining = types.FeaturePath(query_path.steps()[1:])
    weights = None
    if broadcast_column_name is not None:
        weights = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))

    # Walk down one struct level per remaining step.
    while remaining:
        current_type = current.type
        holds_structs = (pa.types.is_list(current_type)
                         and pa.types.is_struct(current_type.value_type))
        if not holds_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining, current_type))

        struct_values = current.flatten()
        if weights is not None:
            # Repeat each broadcast value once per flattened child element.
            parent_positions = array_util.GetFlattenedArrayParentIndices(
                current).to_numpy()
            weights = weights[parent_positions]

        head = remaining.steps()[0]
        try:
            current = struct_values.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        remaining = types.FeaturePath(remaining.steps()[1:])

    return current, weights