Пример #1
0
def _remove_unsupported_feature_columns(examples_table: pa.Table,
                                        schema: schema_pb2.Schema) -> pa.Table:
  """Removes feature columns that contain unsupported values.

  All feature columns that are multivalent are dropped since they are
  not supported by sk-learn.

  All columns of STRUCT type are also dropped.

  Multivalent features that the schema declares but that are not present in
  `examples_table` are ignored, so a batch lacking such a column does not make
  the drop fail.

  Args:
    examples_table: Arrow table containing a batch of examples.
    schema: The schema for the data.

  Returns:
    Arrow table without the unsupported columns.
  """
  column_names = examples_table.schema.names
  # The first step of each multivalent feature path is the column to drop.
  unsupported_columns = {
      feature.steps()[0]
      for feature in schema_util.get_multivalent_features(schema)
  }
  for column_name, column in zip(column_names, examples_table.itercolumns()):
    feature_type = stats_util.get_feature_type_from_arrow_type(
        types.FeaturePath([column_name]), column.type)
    if feature_type == statistics_pb2.FeatureNameStatistics.STRUCT:
      unsupported_columns.add(column_name)
  # Table.drop raises KeyError for names that are not in the table, so only
  # pass names that actually exist, as a list in table order.
  return examples_table.drop(
      [name for name in column_names if name in unsupported_columns])
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

    Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
    for categorical features and 10*max(feature_values) for numeric features.
    We impute missing values with an extreme value that is far from observed
    values so it does not incorrectly impact KNN results.
    10*max(feature_values) is used instead of sys.max_float because max_float
    is large enough to cause unexpected float arithmetic errors.

    For numeric features, NaN values are imputed the same way as missing
    values. If a numeric feature has no non-NaN value at all, sys.maxsize is
    used as the fill value.

    Args:
      examples_table: Arrow table containing a batch of examples where all
        features are univalent.
      categorical_features: Set of categorical feature names.

    Returns:
      A Dict[FeaturePath, np.ndarray] where the key is the feature path and
      the value is a 1D numpy array corresponding to the feature values.
    """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        is_categorical_feature = feature_path in categorical_features
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # Work on a copy of the flattened values: NaN entries are overwritten
        # in place below.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]

        result_dtype = non_missing_values.dtype
        if (is_categorical_feature
                and non_missing_parent_indices.size < num_rows):
            # Some rows will receive the categorical fill value, so switch to
            # an object array. The builtin `object` is used because the
            # `np.object` alias was removed in NumPy 1.24.
            result_dtype = object
        flattened_array = np.empty(num_rows, dtype=result_dtype)

        if is_categorical_feature:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            if np.all(nan_mask):
                # No observed value to scale from.
                imputation_fill_value = sys.maxsize
            else:
                imputation_fill_value = (
                    non_missing_values[~nan_mask].max() * 10)
            non_missing_values[nan_mask] = imputation_fill_value

        flattened_array[non_missing_parent_indices] = non_missing_values
        # Check how many rows are missing rather than the truthiness of the
        # indices: `.any()` is False when the only missing row is row 0, which
        # would leave that entry uninitialised.
        if missing_parent_indices.size > 0:
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
Пример #3
0
def enumerate_arrays(
    table: pa.Table, weight_column: Optional[Text], enumerate_leaves_only: bool
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Enumerates arrays in a Table.

    It assumes all the columns in `table` have only one chunk.
    It assumes `table` contains only arrays of the following supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, and they can only be one of the supported types
        (recursion intended).

    It enumerates each column (i.e. array, because there is only one chunk)
    in the table (also see `enumerate_leaves_only`). If an array is of type
    list<struct<[Ts]>>, then it flattens the outermost list, then enumerates
    the array of each field in the result struct<[Ts]> array, and continues
    recursively. The weights get "aligned" automatically in this process,
    therefore weights, the third term in the returned tuple, always has
    array[i]'s weight being weights[i].

    Args:
      table: The Table whose arrays to be visited. It is assumed that the
        table contains only one chunk.
      weight_column: The name of the weight column, or None. The elements of
        the weight column should be lists of numerics, and each list should
        contain only one value.
      enumerate_leaves_only: If True, only enumerate "leaf" arrays.
        Otherwise, also enumerate the struct arrays where the leaf arrays are
        contained.

    Yields:
      A tuple. The first term is the path of the feature; the second term is
      the feature array and the third term is the weight array for the
      feature array (i.e. weights[i] is the weight for array[i]). The weight
      array is None when `weight_column` is None.

    Raises:
      ValueError: When the weight column is of string or binary type.
        NOTE(review): the "1-element lists" requirement is presumably
        enforced by `get_broadcastable_column`, which is not visible here.
    """
    def _recursion_helper(
        feature_path: types.FeaturePath, array: pa.Array,
        weights: Optional[np.ndarray]
    ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
        """Yields `array` and, for list<struct> arrays, its nested fields."""
        array_type = array.type
        is_struct_list = (pa.types.is_list(array_type)
                          and pa.types.is_struct(array_type.value_type))
        if not is_struct_list:
            yield (feature_path, array, weights)
            return

        if not enumerate_leaves_only:
            yield (feature_path, array, weights)
        flat_struct_array = array.flatten()
        flat_weights = None
        if weights is not None:
            # Give every flattened struct the weight of the row it came from.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            flat_weights = weights[parent_indices]
        for field in flat_struct_array.type:
            field_name = field.name
            yield from _recursion_helper(
                feature_path.child(field_name),
                flat_struct_array.field(field_name),
                flat_weights)

    weights = None
    if weight_column is not None:
        weights = get_broadcastable_column(table, weight_column)
        weight_type = weights.type
        if pa.types.is_string(weight_type) or pa.types.is_binary(weight_type):
            raise ValueError(
                'Weight column "{}" must be of numeric type. Found {}.'.format(
                    weight_column, weight_type))
        weights = np.asarray(weights)
    for column_name, column in zip(table.schema.names, table.itercolumns()):
        yield from _recursion_helper(types.FeaturePath([column_name]),
                                     column.data.chunk(0), weights)