Пример #1
0
def Load_TFDV(df):
    """Assign a TFDV-derived type code to every column of ``df``.

    Each column is profiled on its own: statistics are generated with
    semantic-domain stats enabled, a schema is inferred from them, and the
    schema is inspected to pick a code.

    Codes (later checks overwrite earlier ones):
      0: default, none of the checks below matched.
      1: ``get_categorical_features`` yielded at least one feature.
      3: the text form of a schema feature mentions
        'natural_language_domain'.
      2: the text form of a schema feature mentions 'time_domain'.

    Args:
      df: A DataFrame-like object exposing ``columns`` and ``df[[col]]``.

    Returns:
      A list of int codes, one per column, in column order. Each code is
      also printed as soon as it has been computed.
    """
    column_codes = [0] * len(df.columns)

    for position, column_name in enumerate(df.columns):
        single_column = df[[column_name]]
        options = tfdv.StatsOptions(enable_semantic_domain_stats=True)
        column_stats = tfdv.generate_statistics_from_dataframe(
            single_column, stats_options=options)
        inferred_schema = tfdv.infer_schema(statistics=column_stats)

        # Any categorical feature in the one-column schema marks the column.
        if any(True for _ in get_categorical_features(inferred_schema)):
            column_codes[position] = 1

        for feature in inferred_schema.feature:
            feature_text = str(feature)
            if 'natural_language_domain' in feature_text:
                column_codes[position] = 3
            # Checked last so a time domain wins over a natural-language one.
            if 'time_domain' in feature_text:
                column_codes[position] = 2

        print(column_codes[position])

    return column_codes
Пример #2
0
 def test_get_categorical_features(self):
     """Checks the categorical feature names reported for a flat schema.

     The schema declares an INT feature flagged categorical ('fa'), a BYTES
     feature ('fb'), a FLOAT feature ('fc') and a plain INT feature ('fd');
     only 'fa' and 'fb' are expected back.
     """
     schema_text = """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     feature {
       name: "fb"
       type: BYTES
     }
     feature {
       name: "fc"
       type: FLOAT
     }
     feature {
       name: "fd"
       type: INT
     }
     """
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     expected = {'fa', 'fb'}
     actual = schema_util.get_categorical_features(schema)
     self.assertEqual(actual, expected)
    def __init__(self, label_feature: types.FeaturePath,
                 schema: schema_pb2.Schema, seed: int):
        """Initializes SkLearnMutualInformation.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: The schema of the dataset.
          seed: An int value to seed the RNG used in MI computation.

        Raises:
          ValueError: If label_feature does not exist in the schema
            (presumably raised by schema_util.get_feature; the code here only
            asserts on its result -- confirm against schema_util).
        """
        self._label_feature = label_feature
        self._schema = schema

        categorical = schema_util.get_categorical_features(schema)
        self._categorical_features = categorical

        # The label must be resolvable in the schema before it is classified.
        assert schema_util.get_feature(self._schema, self._label_feature)
        self._label_feature_is_categorical = (
            self._label_feature in self._categorical_features)

        self._seed = seed
        self._schema_features = {
            leaf_path
            for leaf_path, _ in schema_util.get_all_leaf_features(schema)
        }

        # Seed the (global) NumPy RNG used for shuffling and MI computations.
        np.random.seed(seed)
Пример #4
0
    def __init__(self, schema: schema_pb2.Schema, y_path: types.FeaturePath,
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Iterable[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 name: Text) -> None:
        """Initializes a lift statistics generator.

        Lift is defined as lift = P(Y=y|X=x) / P(Y=y).

        Args:
          schema: A required schema for the dataset.
          y_path: The path to use as Y in the lift expression.
          x_paths: An optional list of paths to use as X in the lift
            expression. If None, all categorical features of the schema,
            excluding the feature passed as y_path, are used.
          y_boundaries: An optional list of boundaries used for binning
            y_path. With b boundaries, the binned values are treated as a
            categorical feature with b+1 different values. For example, the
            y_boundaries value [0.1, 0.8] leads to three buckets:
            [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are dropped
            and the boundaries are stored sorted.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values are output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values are
            output.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If boundaries are given for a categorical y_path, or
            are missing (or empty) for a non-categorical y_path.
        """
        self._name = name
        self._schema = schema
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y

        y_is_categorical = schema_util.is_categorical_feature(
            schema_util.get_feature(schema, y_path))
        # Evaluated once: an empty collection counts as "no boundaries".
        has_boundaries = bool(y_boundaries)

        if has_boundaries and y_is_categorical:
            raise ValueError(
                'Boundaries cannot be applied to a categorical y_path')
        if not has_boundaries and not y_is_categorical:
            raise ValueError(
                'Boundaries must be provided with a non-categorical '
                'y_path.')

        if has_boundaries:
            self._y_boundaries = np.array(sorted(set(y_boundaries)))
        else:
            # Keep the caller's falsy value (None or empty) untouched.
            self._y_boundaries = y_boundaries

        if x_paths is not None:
            self._x_paths = x_paths
        else:
            categorical_paths = set(
                schema_util.get_categorical_features(schema))
            self._x_paths = categorical_paths - {y_path}
 def test_get_categorical_features(self):
     """Checks the categorical feature paths reported for a nested schema.

     The schema mixes flat features with a STRUCT feature that contains a
     categorical INT child, plus a FLOAT feature flagged categorical. The
     expected result is the set of FeaturePaths for 'fa', 'fb',
     'fd'/'fd_fa' and 'fe'.
     """
     schema_text = """
     feature {
       name: "fa"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     feature {
       name: "fb"
       type: BYTES
     }
     feature {
       name: "fc"
       type: FLOAT
     }
     feature {
       name: "fd"
       type: INT
     }
     feature {
       name: "fd"
       type: STRUCT
       struct_domain {
         feature {
           name: "fd_fa"
           type: INT
           int_domain {
             is_categorical: true
           }
         }
         feature {
           name: "fd_fb"
         }
       }
     }
     feature {
       name: "fe"
       type: FLOAT
       float_domain {
         is_categorical: true
       }
     }
     """
     schema = text_format.Parse(schema_text, schema_pb2.Schema())
     expected = {
         types.FeaturePath(['fa']),
         types.FeaturePath(['fb']),
         types.FeaturePath(['fd', 'fd_fa']),
         types.FeaturePath(['fe']),
     }
     actual = schema_util.get_categorical_features(schema)
     self.assertEqual(actual, expected)
Пример #6
0
    def __init__(self, y_path: types.FeaturePath,
                 schema: Optional[schema_pb2.Schema],
                 x_paths: Optional[Iterable[types.FeaturePath]],
                 y_boundaries: Optional[Sequence[float]], min_x_count: int,
                 top_k_per_y: Optional[int], bottom_k_per_y: Optional[int],
                 weight_column_name: Optional[Text], output_custom_stats: bool,
                 name: Text) -> None:
        """Initializes a lift statistics generator.

        Lift is defined as lift = P(Y=y|X=x) / P(Y=y).

        Args:
          y_path: The path to use as Y in the lift expression.
          schema: An optional schema for the dataset. If not provided,
            x_paths must be specified. If x_paths are not specified, the
            schema is used to identify all categorical columns for which lift
            should be computed.
          x_paths: An optional list of paths to use as X in the lift
            expression. If None, all categorical features of the schema,
            excluding the feature passed as y_path, are used.
          y_boundaries: An optional list of boundaries used for binning
            y_path. With b boundaries, the binned values are treated as a
            categorical feature with b+1 different values. For example, the
            y_boundaries value [0.1, 0.8] leads to three buckets:
            [-inf, 0.1), [0.1, 0.8) and [0.8, inf]. Duplicates are dropped
            and the boundaries are stored sorted; a missing or empty value is
            stored as None.
          min_x_count: The minimum number of examples in which a specific x
            value must appear, in order for its lift to be output.
          top_k_per_y: Optionally, the number of top x values per y value,
            ordered by descending lift, for which to output lift. If both
            top_k_per_y and bottom_k_per_y are unset, all values are output.
          bottom_k_per_y: Optionally, the number of bottom x values per y
            value, ordered by descending lift, for which to output lift. If
            both top_k_per_y and bottom_k_per_y are unset, all values are
            output.
          weight_column_name: Optionally, a weight column to use for
            converting counts of x or y into weighted counts.
          output_custom_stats: Whether to output custom stats for use with
            Facets.
          name: A unique name associated with the statistics generator.

        Raises:
          ValueError: If a schema is given and the boundaries do not match
            the categorical-ness of y_path, or if neither a schema nor
            x_paths is provided.
        """
        self._name = name
        self._schema = schema
        self._y_path = y_path
        self._min_x_count = min_x_count
        self._top_k_per_y = top_k_per_y
        self._bottom_k_per_y = bottom_k_per_y
        self._output_custom_stats = output_custom_stats
        if y_boundaries:
            self._y_boundaries = np.array(sorted(set(y_boundaries)))
        else:
            self._y_boundaries = None
        self._weight_column_name = weight_column_name

        # With a schema available, cross-check y_path against the boundaries.
        if self._schema is not None:
            y_is_categorical = schema_util.is_categorical_feature(
                schema_util.get_feature(self._schema, y_path))
            boundaries_given = self._y_boundaries is not None
            if boundaries_given and y_is_categorical:
                raise ValueError(
                    'Boundaries cannot be applied to a categorical y_path')
            if not (boundaries_given or y_is_categorical):
                raise ValueError(
                    'Boundaries must be provided with a non-categorical '
                    'y_path.')

        # Explicit x_paths win; otherwise fall back to the schema.
        if x_paths is None and self._schema is None:
            raise ValueError('Either a schema or x_paths must be provided.')
        if x_paths is not None:
            self._x_paths = x_paths
        else:
            categorical_paths = set(
                schema_util.get_categorical_features(schema))
            self._x_paths = categorical_paths - {y_path}
Пример #7
0
    def __init__(self,
                 label_feature: types.FeaturePath,
                 schema: Optional[schema_pb2.Schema] = None,
                 max_encoding_length: int = 512,
                 seed: int = 12345,
                 multivalent_features: Optional[Set[types.FeaturePath]] = None,
                 categorical_features: Optional[Set[types.FeaturePath]] = None,
                 features_to_ignore: Optional[Set[types.FeaturePath]] = None,
                 normalize_by_max: bool = False,
                 allow_invalid_partitions: bool = False,
                 custom_stats_key: str = _ADJUSTED_MUTUAL_INFORMATION_KEY,
                 column_partitions: int = 1):
        """Initializes MutualInformation.

        Args:
          label_feature: The key used to identify labels in the ExampleBatch.
          schema: An optional schema describing the dataset. Either a schema
            or both the categorical and multivalent feature sets must be
            provided.
          max_encoding_length: An int value to specify the maximum length of
            encoding to represent a feature value.
          seed: An int value to seed the RNG used in MI computation.
          multivalent_features: An optional set of features that are
            multivalent. Derived from the schema when omitted.
          categorical_features: An optional set of the features that are
            categorical. Derived from the schema when omitted.
          features_to_ignore: An optional set of features that should be
            ignored by the mutual information calculation.
          normalize_by_max: If True, AMI values are normalized to a range 0
            to 1 by dividing by the maximum possible information AMI(Y, Y).
          allow_invalid_partitions: If True, generator tolerates input
            partitions that are invalid (e.g. size of partition is < the k
            for the KNN), where invalid partitions return no stats. The
            min_partitions_stat_presence arg to PartitionedStatisticsAnalyzer
            controls how many partitions may be invalid while still reporting
            the metric.
          custom_stats_key: A string that determines the key used in the
            custom statistic. This defaults to
            `_ADJUSTED_MUTUAL_INFORMATION_KEY`.
          column_partitions: If > 1, self.partitioner returns a PTransform
            that partitions input RecordBatches by column (feature), in
            addition to the normal row partitioning (by batch). The total
            number of effective partitions is
            column_partitions * row_partitions, where row_partitions is
            passed to self.partitioner.

        Raises:
          ValueError: If neither a schema nor the corresponding explicit
            feature set is provided. The original documentation also states
            it is raised when label_feature does not exist in the schema
            (presumably by schema_util.get_feature -- confirm).
        """
        self._label_feature = label_feature
        self._schema = schema
        self._normalize_by_max = normalize_by_max

        # An explicit set takes precedence; otherwise derive it from the
        # schema.
        if multivalent_features is None:
            if self._schema is None:
                raise ValueError(
                    "Either multivalent feature set or schema must be provided")
            multivalent_features = schema_util.get_multivalent_features(
                self._schema)
        self._multivalent_features = multivalent_features

        if categorical_features is None:
            if self._schema is None:
                raise ValueError(
                    "Either categorical feature set or schema must be provided")
            categorical_features = schema_util.get_categorical_features(
                self._schema)
        self._categorical_features = categorical_features

        # The label can only be checked against a schema when one was given.
        if schema:
            assert schema_util.get_feature(self._schema, self._label_feature)
        self._label_feature_is_categorical = (
            self._label_feature in self._categorical_features)

        self._max_encoding_length = max_encoding_length
        self._seed = seed
        self._features_to_ignore = features_to_ignore
        self._allow_invalid_partitions = allow_invalid_partitions
        self._custom_stats_key = custom_stats_key
        self._column_partitions = column_partitions