Пример #1
0
def generate_partial_statistics_in_memory(
    table: pa.Table, options: stats_options.StatsOptions,
    stats_generators: List[stats_generator.CombinerStatsGenerator]
) -> List[Any]:
    """Builds one partial-statistics accumulator per combiner generator.

    When `options.feature_whitelist` is non-empty, the table is first narrowed
    to the whitelisted columns (in whitelist order) before being handed to the
    generators.

    Args:
        table: Arrow table holding the examples.
        options: Options for generating data statistics.
        stats_generators: A list of combiner statistics generators.

    Returns:
        A list of accumulators containing partial statistics, one per
        generator and in the same order as `stats_generators`.
    """
    whitelist = options.feature_whitelist
    if whitelist:
        selected = [table.column(name) for name in whitelist]
        table = pa.Table.from_arrays(selected)

    # Each generator starts from a fresh accumulator and sees the whole table.
    return [
        gen.add_input(gen.create_accumulator(), table)
        for gen in stats_generators
    ]
Пример #2
0
def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
    """Fetches a one-value-per-example column and returns it flattened.

    Only the first chunk of the column is read.

    Args:
        input_table: Input table.
        column_name: Name of the column to be retrieved and validated. This
            column must refer to a ListArray in which each list has length 1.

    Returns:
        An arrow array containing a flattened view of the broadcast column.

    Raises:
        ValueError: If the column is not present in the input table, or if
            any example does not have exactly one value.
    """
    try:
        list_array = input_table.column(column_name).data.chunk(0)
    except KeyError:
        raise ValueError(
            'Column "{}" not present in the input table.'.format(column_name))

    # Validate on the un-flattened array: flattening discards the per-example
    # list boundaries this check relies on.
    lengths = array_util.ListLengthsFromListArray(list_array).to_numpy()
    if np.any(lengths != 1):
        raise ValueError(
            'Column "{}" must have exactly one value in each example.'.format(
                column_name))
    return list_array.flatten()
Пример #3
0
 def add_input(self, accumulator: List[float],
               examples_table: pa.Table) -> List[float]:
     """Folds a table of examples into the running counts.

     Args:
         accumulator: Two-slot list updated in place: slot 0 is incremented
             by the number of rows, slot 1 by the sum of the weight values.
         examples_table: Arrow table of examples to add.

     Returns:
         The same `accumulator` object, after the update.
     """
     accumulator[0] += examples_table.num_rows
     if not self._weight_feature:
         return accumulator

     weight_chunks = examples_table.column(
         self._weight_feature).data.iterchunks()
     for chunk in weight_chunks:
         # Added chunk by chunk so the floating-point summation order stays
         # the same regardless of how many chunks the column has.
         accumulator[1] += np.sum(np.asarray(chunk.flatten()))
     return accumulator
Пример #4
0
def _filter_features(table: pa.Table,
                     feature_whitelist: List[types.FeatureName]) -> pa.Table:
    """Keeps only the whitelisted columns of a table.

    Whitelisted names that are absent from the table are skipped. The
    selected columns follow the order of `feature_whitelist`.

    Args:
        table: Input Arrow table.
        feature_whitelist: Feature names to keep.

    Returns:
        An Arrow table containing only the whitelisted features of the input
        table.
    """
    # Built once so each membership test below is a constant-time lookup.
    present = set(table.schema.names)
    return pa.Table.from_arrays([
        table.column(name) for name in feature_whitelist if name in present
    ])
Пример #5
0
def _get_weight_feature(input_table: pa.Table,
                        weight_feature: Text) -> np.ndarray:
    """Gets the weight column from the input table.

  Args:
    input_table: Input table.
    weight_feature: Name of the weight feature.

  Returns:
    A numpy array containing the weights of the examples in the input table.

  Raises:
    ValueError: If the weight feature is not present in the input table or is
        not a valid weight feature (must be of numeric type and have a
        single value for each example).
  """
    try:
        weights = input_table.column(weight_feature).data.chunk(0)
    except KeyError:
        raise ValueError('Weight feature "{}" not present in the input '
                         'table.'.format(weight_feature))

    # Before flattening, check that there is a single value for each example.
    weight_lengths = ListLengthsFromListArray(weights).to_numpy()
    if not np.all(weight_lengths == 1):
        raise ValueError(
            'Weight feature "{}" must have exactly one value in each example.'.
            format(weight_feature))
    weights = weights.flatten()
    # Before converting to numpy view, check the type (cannot convert string and
    # binary arrays to numpy view).
    weights_type = weights.type
    if pa.types.is_string(weights_type) or pa.types.is_binary(weights_type):
        raise ValueError(
            'Weight feature "{}" must be of numeric type. Found {}.'.format(
                weight_feature, weights_type))
    return weights.to_numpy()
Пример #6
0
def enumerate_arrays(
    table: pa.Table, weight_column: Optional[Text], enumerate_leaves_only: bool
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Enumerates arrays in a Table.

    It assumes all the columns in `table` have only one chunk (only the first
    chunk of each column is read).
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    It enumerates each column (i.e. array, because there is only one chunk)
    in the table (also see `enumerate_leaves_only`). If an array is of type
    list<struct<[Ts]>>, then it flattens the outermost list, then enumerates
    the array of each field in the resulting struct<[Ts]> array, and continues
    recursively. The weights get "aligned" automatically in this process,
    therefore for weights, the third term in the yielded tuple, array[i]'s
    weight is always weights[i].

    Args:
        table: The Table whose arrays are to be visited. It is assumed that
            each column contains only one chunk.
        weight_column: The name of the weight column, or None. The elements
            of the weight column should be lists of numerics, and each list
            should contain exactly one value.
        enumerate_leaves_only: If True, only enumerate "leaf" arrays.
            Otherwise, also enumerate the struct arrays where the leaf arrays
            are contained.

    Yields:
        A tuple. The first term is the path of the feature; the second term
        is the feature array and the third term is the weight array for the
        feature array (i.e. weights[i] is the weight for array[i]), or None
        when `weight_column` is None.

    Raises:
        ValueError: When any example in the weight column does not hold
            exactly one value.
    """
    def _recursion_helper(
        feature_path: types.FeaturePath, array: pa.Array,
        weights: Optional[np.ndarray]
    ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
        """Yields `array` and/or, recursively, the arrays nested inside it."""
        array_type = array.type
        if (pa.types.is_list(array_type)
                and pa.types.is_struct(array_type.value_type)):
            if not enumerate_leaves_only:
                yield (feature_path, array, weights)
            flat_struct_array = array.flatten()
            flat_weights = None
            if weights is not None:
                # Each flattened struct inherits the weight of the list
                # (i.e. the parent row) it came from.
                flat_weights = weights[GetFlattenedArrayParentIndices(
                    array).to_numpy()]
            for field in flat_struct_array.type:
                field_name = field.name
                yield from _recursion_helper(
                    feature_path.child(field_name),
                    flat_struct_array.field(field_name), flat_weights)
        else:
            yield (feature_path, array, weights)

    weights = None
    if weight_column is not None:
        weight_lists = table.column(weight_column).data.chunk(0)
        weights = weight_lists.flatten().to_numpy()
        # Comparing only the flattened size with the row count would accept a
        # column such as [[1, 2], []], whose values would then be paired with
        # the wrong rows. Requiring the i-th flattened value to come from the
        # i-th list guarantees exactly one weight per example.
        weight_parents = GetFlattenedArrayParentIndices(
            weight_lists).to_numpy()
        if not np.array_equal(weight_parents, np.arange(table.num_rows)):
            raise ValueError(
                'The weight feature must have exactly one value in each example'
            )
    for column in table.columns:
        yield from _recursion_helper(types.FeaturePath([column.name]),
                                     column.data.chunk(0), weights)
Пример #7
0
def get_array(
        table: pa.Table, query_path: types.FeaturePath,
        return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally example indices) from a table.

    It assumes all the columns in `table` have only one chunk (only the first
    chunk of the queried column is read).
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    If the provided path refers to a leaf in the table, then a ListArray with
    a primitive element type will be returned. If the provided path does not
    refer to a leaf, a ListArray with a StructArray element type will be
    returned.

    Args:
        table: The Table whose arrays are to be visited. It is assumed that
            each column contains only one chunk.
        query_path: The FeaturePath to look up in the table.
        return_example_indices: Whether to return an additional array
            containing the example indices of the elements in the array
            corresponding to the query_path.

    Returns:
        A tuple. The first term is the feature array and the second term is
        the example_indices array for the feature array (i.e. array[i] came
        from the example at row example_indices[i] in the table), or None
        when `return_example_indices` is False.

    Raises:
        KeyError: When the query_path is empty, or cannot be found in the
            table and its nested struct arrays.
    """
    if not query_path:
        raise KeyError('query_path must be non-empty.')

    steps = query_path.steps()
    column_name = steps[0]
    try:
        current = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))
    remaining = types.FeaturePath(steps[1:])

    indices = None
    if return_example_indices:
        indices = np.arange(table.num_rows)

    # Walk down one path step per iteration; every step must descend through
    # a list<struct<...>> array.
    while remaining:
        current_type = current.type
        is_struct_list = (pa.types.is_list(current_type)
                          and pa.types.is_struct(current_type.value_type))
        if not is_struct_list:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining, current_type))
        structs = current.flatten()
        if indices is not None:
            # Each flattened struct keeps the example index of the list it
            # came from.
            indices = indices[
                array_util.GetFlattenedArrayParentIndices(current).to_numpy()]

        step = remaining.steps()[0]
        try:
            child = structs.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        remaining = types.FeaturePath(remaining.steps()[1:])
        current = child

    return current, indices
Пример #8
0
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally a broadcast column) from a table.

    It assumes all the columns in `table` have only one chunk (only the first
    chunk of the queried column is read).
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    If the provided path refers to a leaf in the table, then a ListArray with
    a primitive element type will be returned. If the provided path does not
    refer to a leaf, a ListArray with a StructArray element type will be
    returned.

    Args:
        table: The Table whose arrays are to be visited. It is assumed that
            each column contains only one chunk.
        query_path: The FeaturePath to look up in the table.
        broadcast_column_name: The name of a column to broadcast, or None.
            Each list in that column should contain exactly one value.

    Returns:
        A tuple. The first term is the feature array and the second term is
        the broadcast column array for the feature array (i.e.
        broadcast_column[i] is the corresponding value for array[i]), or None
        when `broadcast_column_name` is None.

    Raises:
        ValueError: Propagated from `get_broadcastable_column` when the
            broadcast column cannot be retrieved or validated.
        KeyError: When the query_path is empty, or cannot be found in the
            table and its nested struct arrays.
    """
    def _descend(
        remaining_path: types.FeaturePath, current: pa.Array,
        values: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Follows `remaining_path` through nested list<struct> arrays."""
        if not remaining_path:
            return current, values

        current_type = current.type
        if not (pa.types.is_list(current_type)
                and pa.types.is_struct(current_type.value_type)):
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    remaining_path, current_type))

        structs = current.flatten()
        # Each flattened struct inherits the broadcast value of the list
        # (i.e. the parent row) it came from.
        child_values = None if values is None else values[
            array_util.GetFlattenedArrayParentIndices(current).to_numpy()]

        step = remaining_path.steps()[0]
        try:
            child = structs.field(step)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(step))
        return _descend(types.FeaturePath(remaining_path.steps()[1:]), child,
                        child_values)

    if not query_path:
        raise KeyError('query_path must be non-empty.')

    steps = query_path.steps()
    column_name = steps[0]
    try:
        root_array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))
    sub_path = types.FeaturePath(steps[1:])

    if broadcast_column_name is None:
        broadcast_values = None
    else:
        broadcast_values = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))
    return _descend(sub_path, root_array, broadcast_values)