예제 #1
0
 def test_is_categorical_features(self):
     """Checks is_categorical_feature on each feature of a parsed schema.

     The schema text declares four features. The test expects the first two
     to be reported as categorical and the last two as not categorical.
     """
     schema_text = """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     feature {
       name: "fb"
       type: BYTES
     }
     feature {
       name: "fc"
       type: FLOAT
     }
     feature {
       name: "fa"
       type: INT
     }
     """
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     # Evaluate the helper once per feature, in schema order.
     actual = [
         schema_util.is_categorical_feature(feature_proto)
         for feature_proto in schema.feature
     ]
     self.assertEqual(actual, [True, True, False, False])
    def _convert_categorical_features_to_numeric(self, df):
        """Replaces categorical columns of `df` with integer codes, in place.

        A column counts as categorical when schema_util.is_categorical_feature
        returns a truthy value for the feature found in self._schema under the
        column's name. Each such column is overwritten with the inverse
        indices produced by np.unique, so every value is replaced by the index
        of that value within the column's unique values. Other columns are
        left untouched.

        Args:
          df: A pd.DataFrame of feature values where each column corresponds
            to a feature and each row corresponds to an example. It is
            modified in place.

        Returns:
          A list of booleans with one entry per column of df, in column
          order; an entry is True iff that column was encoded as categorical.
        """
        categorical_mask = []
        for column_name in df:
            feature_proto = schema_util.get_feature(self._schema, column_name)
            if schema_util.is_categorical_feature(feature_proto):
                # Only the inverse indices are needed; the unique values
                # themselves are discarded.
                _, codes = np.unique(df[column_name].values,
                                     return_inverse=True)
                df[column_name] = codes
                categorical_mask.append(True)
            else:
                categorical_mask.append(False)
        return categorical_mask
    def _impute(self, examples):
        """Imputes missing (None) feature values.

        Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        for categorical features and 10*max(feature_values) for numeric
        features. We impute missing values with an extreme value that is far
        from observed values so it does not incorrectly impact KNN results.
        10*max(feature_values) is used instead of sys.max_float because
        max_float is large enough to cause unexpected float arithmetic errors.

        The numeric fill value is only computed when a column actually has a
        missing value, so columns without gaps (including empty columns) are
        copied through unchanged.

        Args:
          examples: A dict where the key is the feature name and the values
            are the feature values. It is updated in place.

        Returns:
          The same dict, where each entry is a new list holding the feature
          values with missing values imputed.

        Raises:
          ValueError: If a non-categorical feature has missing values but no
            observed value from which a fill value could be derived.
        """
        for feature, feature_values in examples.items():
            values = list(feature_values)
            # The schema lookup is performed for every feature, even when
            # nothing needs imputing, so lookup failures still surface.
            feature_proto = schema_util.get_feature(self._schema, feature)
            if schema_util.is_categorical_feature(feature_proto):
                fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
            else:
                observed = [value for value in values if value is not None]
                if len(observed) == len(values):
                    # No gaps: skip the max() scan, which would also fail on
                    # an empty column.
                    examples[feature] = observed
                    continue
                if not observed:
                    raise ValueError(
                        'Cannot impute numeric feature {!r}: it has no '
                        'observed (non-missing) values.'.format(feature))
                # NOTE(review): when the observed maximum is <= 0 this fill
                # value is not larger than the observed values.
                fill_value = max(observed) * 10
            # Reassigning an existing key while iterating items() does not
            # change the dict's size, so this is safe.
            examples[feature] = [
                fill_value if value is None else value for value in values
            ]
        return examples
예제 #4
0
    def __init__(self, schema: schema_pb2.Schema, y_path: types.FeaturePath,
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Iterable[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 name: Text) -> None:
        """Initializes a lift statistics generator.

        Args:
          schema: A required schema for the dataset.
          y_path: The path to use as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
          x_paths: An optional list of paths to use as X in the lift
            expression: lift = P(Y=y|X=x) / P(Y=y). If None, all categorical
            features of the schema, excluding the feature passed as y_path,
            will be used.
          y_boundaries: An optional list of boundaries to be used for binning
            y_path. If provided with b boundaries, the binned values will be
            treated as a categorical feature with b+1 different values. For
            example, the y_boundaries value [0.1, 0.8] would lead to three
            buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are
            dropped and the boundaries are stored sorted.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If boundaries are given for a categorical y_path, or if
            no boundaries are given for a non-categorical y_path.
        """
        self._schema = schema
        self._name = name
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y

        y_is_categorical = schema_util.is_categorical_feature(
            schema_util.get_feature(schema, y_path))
        # None and empty boundaries both count as "not provided".
        boundaries_given = bool(y_boundaries)

        # Boundaries and a categorical y_path are mutually exclusive, and
        # exactly one of the two must hold.
        if boundaries_given and y_is_categorical:
            raise ValueError(
                'Boundaries cannot be applied to a categorical y_path')
        if not boundaries_given and not y_is_categorical:
            raise ValueError(
                'Boundaries must be provided with a non-categorical '
                'y_path.')

        if boundaries_given:
            self._y_boundaries = np.array(sorted(set(y_boundaries)))
        else:
            # Kept exactly as passed in (None or an empty collection).
            self._y_boundaries = y_boundaries

        if x_paths is not None:
            self._x_paths = x_paths
        else:
            categorical_paths = set(
                schema_util.get_categorical_features(schema))
            self._x_paths = categorical_paths - {y_path}
    def __init__(self, label_feature, schema, seed):
        """Initializes SkLearnMutualInformation.

        Stores the arguments, records whether the label feature is
        categorical according to the schema, and seeds NumPy's global random
        number generator.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: The schema of the dataset.
          seed: An int value to seed the RNG used in MI computation.

        Raises:
          ValueError: If label_feature does not exist in the schema
            (presumably raised by schema_util.get_feature -- confirm against
            that helper).
        """
        self._schema = schema
        self._label_feature = label_feature
        self._seed = seed

        label_proto = schema_util.get_feature(schema, label_feature)
        self._label_feature_is_categorical = (
            schema_util.is_categorical_feature(label_proto))

        # Seed the RNG used for shuffling and for MI computations. This is
        # NumPy's module-level generator, so the effect is process-wide.
        np.random.seed(seed)
예제 #6
0
    def __init__(self, y_path: types.FeaturePath,
                 schema: Optional[schema_pb2.Schema],
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Sequence[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 weight_column_name: Optional[Text], output_custom_stats: bool,
                 name: Text) -> None:
        """Initializes a lift statistics generator.

        Args:
          y_path: The path to use as Y in the lift expression:
            lift = P(Y=y|X=x) / P(Y=y).
          schema: An optional schema for the dataset. If not provided, x_paths
            must be specified. If x_paths are not specified, the schema is
            used to identify all categorical columns for which Lift should be
            computed.
          x_paths: An optional list of paths to use as X in the lift
            expression: lift = P(Y=y|X=x) / P(Y=y). If None, all categorical
            features of the schema, excluding the feature passed as y_path,
            will be used.
          y_boundaries: An optional list of boundaries to be used for binning
            y_path. If provided with b boundaries, the binned values will be
            treated as a categorical feature with b+1 different values. For
            example, the y_boundaries value [0.1, 0.8] would lead to three
            buckets: [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are
            dropped and the boundaries are stored sorted; None or an empty
            sequence is stored as None.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values will be
            output.
          weight_column_name: Optionally, a weight column to use for
            converting counts of x or y into weighted counts.
          output_custom_stats: Whether to output custom stats for use with
            Facets.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If a schema is given and boundaries are supplied for a
            categorical y_path, or no boundaries are supplied for a
            non-categorical y_path; or if neither schema nor x_paths is
            given.
        """
        self._name = name
        self._schema = schema
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y
        self._output_custom_stats = output_custom_stats
        self._weight_column_name = weight_column_name

        if y_boundaries:
            self._y_boundaries = np.array(sorted(set(y_boundaries)))
        else:
            self._y_boundaries = None

        # The y_path / boundaries combination can only be validated when a
        # schema is available to say whether y_path is categorical.
        if schema is not None:
            y_is_categorical = schema_util.is_categorical_feature(
                schema_util.get_feature(schema, y_path))
            boundaries_given = self._y_boundaries is not None
            if boundaries_given and y_is_categorical:
                raise ValueError(
                    'Boundaries cannot be applied to a categorical y_path')
            if not boundaries_given and not y_is_categorical:
                raise ValueError(
                    'Boundaries must be provided with a non-categorical '
                    'y_path.')

        # Explicit x_paths win; otherwise fall back to every categorical
        # feature in the schema except y_path itself.
        resolved_x_paths = x_paths
        if resolved_x_paths is None:
            if schema is None:
                raise ValueError(
                    'Either a schema or x_paths must be provided.')
            resolved_x_paths = (
                set(schema_util.get_categorical_features(schema)) - {y_path})
        self._x_paths = resolved_x_paths