Exemplo n.º 1
0
  def extract_output(self,
                     accumulator: CrossFeatureStatsGeneratorAccumulator
                    ) -> statistics_pb2.DatasetFeatureStatistics:
    """Converts accumulated partial cross stats into a statistics proto.

    Args:
      accumulator: Mapping from a feature cross to its partial cross feature
        stats. Elements 0 and 1 of each key become the single step of
        `path_x` and `path_y` respectively.

    Returns:
      A DatasetFeatureStatistics proto holding one `cross_features` entry per
      item in the accumulator. Numeric cross stats are attached only to
      entries whose count is positive, and the correlation is filled in only
      when both standard deviations are non-zero.
    """
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()

    for cross_key, partial in accumulator.items():
      entry = dataset_stats.cross_features.add()
      entry.path_x.CopyFrom(path_pb2.Path(step=[cross_key[0]]))
      entry.path_y.CopyFrom(path_pb2.Path(step=[cross_key[1]]))

      count = partial.count
      entry.count = count
      if count > 0:
        mean_x = partial.sum_x / count
        mean_y = partial.sum_y / count

        # Covariance as E[xy] - E[x] * E[y].
        covariance = partial.sum_xy / count - mean_x * mean_y
        numeric_stats = statistics_pb2.NumericCrossStatistics(
            covariance=covariance)

        # Variance as E[v^2] - E[v]^2, floored at zero so that rounding error
        # can never hand a negative value to sqrt.
        std_dev_x = math.sqrt(
            max(0, partial.sum_square_x / count - math.pow(mean_x, 2)))
        std_dev_y = math.sqrt(
            max(0, partial.sum_square_y / count - math.pow(mean_y, 2)))

        # Correlation is undefined when either feature is constant.
        if std_dev_x != 0 and std_dev_y != 0:
          numeric_stats.correlation = covariance / (std_dev_x * std_dev_y)

        entry.num_cross_stats.CopyFrom(numeric_stats)

    return dataset_stats
Exemplo n.º 2
0
def _ragged_tensor_representation_from_feature_spec(
    spec: common_types.RaggedFeature, name: str,
    domains: Dict[str, common_types.DomainType]
) -> Tuple[schema_pb2.Feature, List[schema_pb2.Feature],
           schema_pb2.TensorRepresentation]:
    """Builds the schema pieces describing a RaggedTensor feature spec.

    Args:
      spec: A tf.io.RaggedFeature feature spec.
      name: Feature name. Also used as the values feature name when
        `spec.value_key` is empty.
      domains: A dict whose keys are feature names and values are one of
        schema_pb2.IntDomain, schema_pb2.StringDomain or
        schema_pb2.FloatDomain.

    Returns:
      A tuple (value_feature, partitions_features, ragged_tensor_rep):
      value_feature represents the RaggedTensor values, partitions_features
      holds one INT feature per RowLengths partition, and ragged_tensor_rep
      is the ragged TensorRepresentation.

    Raises:
      ValueError: If the feature spec contains partition types different from
        UniformRowLength and RowLengths.
    """
    ragged_cls = schema_pb2.TensorRepresentation.RaggedTensor
    values_name = spec.value_key or name

    value_feature = schema_pb2.Feature(name=values_name)
    _set_type(name, value_feature, spec.dtype)
    _set_domain(name, value_feature, domains.get(name))

    ragged_tensor = ragged_cls(feature_path=path_pb2.Path(step=[values_name]))

    row_length_features = []
    for partition in spec.partitions:
        if isinstance(partition, tf.io.RaggedFeature.UniformRowLength):  # pytype: disable=attribute-error
            partition_proto = ragged_cls.Partition(
                uniform_row_length=partition.length)
        elif isinstance(partition, tf.io.RaggedFeature.RowLengths):  # pytype: disable=attribute-error
            partition_proto = ragged_cls.Partition(row_length=partition.key)
            # Row lengths live in their own feature, which is always INT.
            row_length_features.append(
                schema_pb2.Feature(name=partition.key, type=schema_pb2.INT))
        else:
            raise ValueError(
                'RaggedFeature can only be created with UniformRowLength and '
                'RowLengths partitions.')
        ragged_tensor.partition.append(partition_proto)

    representation = schema_pb2.TensorRepresentation(
        ragged_tensor=ragged_tensor)
    return value_feature, row_length_features, representation
Exemplo n.º 3
0
  def as_proto(self):
    """Builds a Path proto from this path's fields.

    Every element of `self.field_list` must be a string; each one becomes a
    step of the resulting proto, in order.

    Returns:
      a Path proto.

    Raises:
      ValueError: if the path contains an anonymous field, or an element of
        any other non-string type.
    """
    proto = tf_metadata_path_pb2.Path()
    for element in self.field_list:
      if not isinstance(element, str):
        # Anonymous fields get their own message; anything else is unexpected.
        if isinstance(element, AnonymousId):
          raise ValueError("Cannot serialize a path with anonymous fields")
        raise ValueError("Unexpected path element type: %s" % type(element))
      proto.step.append(element)
    return proto
Exemplo n.º 4
0
 def to_proto(self) -> path_pb2.Path:
     """Returns a path proto whose steps are this object's `_steps`."""
     steps = self._steps
     return path_pb2.Path(step=steps)
Exemplo n.º 5
0
 def to_proto(self) -> path_pb2.Path:
     """Builds a tensorflow_metadata path proto from this ColumnPath.

     Returns:
       A path_pb2.Path whose `step` field is populated from `self._steps`.
     """
     proto_cls = path_pb2.Path
     return proto_cls(step=self._steps)