Example #1
    def testProjection(self):
        """Test projecting of a TFXIO."""
        schema = schema_pb2.Schema()
        schema.CopyFrom(_UNORDERED_SCHEMA)
        tensor_representations = {
            "string_tensor":
            schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name="string_feature")),
            "float_tensor":
            schema_pb2.TensorRepresentation(
                sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                    dense_shape=schema_pb2.FixedShape(
                        dim=[schema_pb2.FixedShape.Dim(size=10)]),
                    index_column_names=["int_feature"],
                    value_column_name="float_feature")),
        }
        tensor_representation_util.SetTensorRepresentationsInSchema(
            schema, tensor_representations)

        tfxio = ParquetTFXIO(file_pattern=self._example_file,
                             column_names=_COLUMN_NAMES,
                             schema=schema,
                             telemetry_descriptors=_TELEMETRY_DESCRIPTORS)

        projected_tfxio = tfxio.Project(["float_tensor"])

        # The projected TFXIO should carry the projected Arrow schema.
        self.assertTrue(projected_tfxio.ArrowSchema().equals(
            _EXPECTED_PROJECTED_ARROW_SCHEMA))

        def _AssertFn(record_batch_list):
            self.assertLen(record_batch_list, 1)
            record_batch = record_batch_list[0]
            self._ValidateRecordBatch(record_batch,
                                      _EXPECTED_PROJECTED_ARROW_SCHEMA)
            expected_schema = projected_tfxio.ArrowSchema()
            self.assertListEqual(
                record_batch.schema.names, expected_schema.names,
                "actual: {}; expected: {}".format(record_batch.schema.names,
                                                  expected_schema.names))
            self.assertListEqual(
                record_batch.schema.types, expected_schema.types,
                "actual: {}; expected: {}".format(record_batch.schema.types,
                                                  expected_schema.types))
            tensor_adapter = projected_tfxio.TensorAdapter()
            dict_of_tensors = tensor_adapter.ToBatchTensors(record_batch)
            self.assertLen(dict_of_tensors, 1)
            self.assertIn("float_tensor", dict_of_tensors)

        pipeline = beam.Pipeline()
        record_batch_pcoll = (pipeline |
                              projected_tfxio.BeamSource(batch_size=_NUM_ROWS))
        beam_testing_util.assert_that(record_batch_pcoll, _AssertFn)
        pipeline_result = pipeline.run()
        pipeline_result.wait_until_finish()
        telemetry_test_util.ValidateMetrics(self, pipeline_result,
                                            _TELEMETRY_DESCRIPTORS, "parquet",
                                            "parquet")
Example #2
def TensorRepresentations(self) -> tensor_adapter.TensorRepresentations:
    return {
        self.raw_record_column_name:
            schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name=self.raw_record_column_name,
                    shape=schema_pb2.FixedShape(),  # scalar
                ))
    }
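
For context on the shape=schema_pb2.FixedShape() above: an empty FixedShape denotes a rank-0 (scalar) tensor, one value per example, while adding dims yields fixed-size vectors. A minimal illustration:

scalar_shape = schema_pb2.FixedShape()  # rank 0: one scalar per example
vector_shape = schema_pb2.FixedShape(
    dim=[schema_pb2.FixedShape.Dim(size=3)])  # rank 1: a length-3 vector
assert len(scalar_shape.dim) == 0 and vector_shape.dim[0].size == 3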
Example #3
def register_list(column_schema, feature):
    # A truthy _is_list flag marks the column as a list-valued feature.
    if column_schema._is_list:
        min_length, max_length = None, None
        if "value_count" in column_schema.properties:
            min_length = column_schema.properties["value_count"]["min"]
            max_length = column_schema.properties["value_count"]["max"]
        if min_length and max_length and min_length == max_length:
            shape = schema_pb2.FixedShape()
            dim = shape.dim.add()
            dim.size = min_length
            feature.shape.CopyFrom(shape)
        elif min_length and max_length and min_length < max_length:
            feature.value_count.CopyFrom(
                schema_pb2.ValueCount(min=min_length, max=max_length))
        else:
            # If no min/max is available, set a dummy ValueCount to signal
            # that this is a list.
            feature.value_count.CopyFrom(schema_pb2.ValueCount(min=0, max=0))
    return feature
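
A hedged usage sketch for register_list: real callers pass a library column schema, but any object exposing the two attributes the function reads is enough to exercise the FixedShape branch (the names below are invented):

from types import SimpleNamespace

col = SimpleNamespace(
    _is_list=True,
    properties={"value_count": {"min": 4, "max": 4}})
feature = register_list(col, schema_pb2.Feature(name="tags"))
# Equal, non-zero min/max -> a FixedShape of that length.
assert feature.shape.dim[0].size == 4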
Example #4
def _feature_from_feature_spec(spec, name, domains):
    """Returns a representation of a Feature from a feature spec."""
    if isinstance(spec, tf.io.FixedLenFeature):
        if spec.default_value is not None:
            raise ValueError(
                'feature "{}" had default_value {}, but FixedLenFeature must have '
                'default_value=None'.format(name, spec.default_value))
        dims = [schema_pb2.FixedShape.Dim(size=size) for size in spec.shape]
        feature = schema_pb2.Feature(
            name=name,
            presence=schema_pb2.FeaturePresence(min_fraction=1.0),
            shape=schema_pb2.FixedShape(dim=dims))
    elif isinstance(spec, tf.io.VarLenFeature):
        feature = schema_pb2.Feature(name=name)
    else:
        raise TypeError(
            'Spec for feature "{}" was {} of type {}, expected a '
            'FixedLenFeature, VarLenFeature or SparseFeature'.format(
                name, spec, type(spec)))

    _set_type(name, feature, spec.dtype)
    _set_domain(name, feature, domains.get(name))
    return feature
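
An illustrative call, assuming the module-level helpers _set_type and _set_domain used above are in scope (the feature name is invented):

import tensorflow as tf

feature = _feature_from_feature_spec(
    tf.io.FixedLenFeature(shape=[2], dtype=tf.int64), "scores", domains={})
# A FixedLenFeature becomes an always-present feature with a fixed shape.
assert feature.presence.min_fraction == 1.0
assert [d.size for d in feature.shape.dim] == [2]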
def _LegacyInferTensorRepresentationFromSchema(
        schema: schema_pb2.Schema
) -> Dict[Text, schema_pb2.TensorRepresentation]:
    """Translate a Feature proto into a TensorRepresentation proto.

  This function applies heuristics to deduce the shape and other information
  from a FeatureProto.  The FeatureProto contains information about the feature
  in an ExampleProto, but the feature spec proto also requires enough
  information to parse the feature into a tensor.  We apply the following rules:

    1. The shape and representation of the column are determined by the
       following rules:
         * if the value_count.min and value_count.max are both 1 then the shape
           is scalar and the representation is fixed length.
         * If value_count.min and value_count.max are equal but greater than 1,
           then the shape is a vector whose length is value_count.max and the
           representation is fixed length.
         * If value_count.min and value_count.max are equal and are less than 1,
           then the shape is a vector of unknown length and the representation
           is variable length.
         * If value_count.min and value_count.max are not equal then
           the shape is a vector of unknown length and the representation is
           variable length.

    2. If the feature is always present or is variable length (based on the
        above rule), no default value is set but if the feature is not always
        present and is fixed length, then a canonical default value is chosen
        based on _LEGACY_DEFAULT_VALUE_FOR_FEATURE_TYPE.

    3. Features that are deprecated are completely ignored and removed.

  Args:
    schema: A Schema proto.

  Returns:
    A Dict mapping tensor names to their TensorRepresentations.

  Raises:
    ValueError: If the feature's type is not supported or the schema is invalid.
  """
    result = {}
    for feature in schema.feature:
        if not _ShouldIncludeFeature(feature):
            continue
        # Validate value_count before applying the shape heuristics below.
        if feature.value_count.min < 0:
            raise ValueError(
                "Feature {} has value_count.min < 0 (value was {}).".format(
                    feature.name, feature.value_count.min))

        if feature.value_count.max < 0:
            raise ValueError(
                "Feature {} has value_count.max < 0 (value was {}).".format(
                    feature.name, feature.value_count.max))

        # Use heuristics to infer the shape and representation.
        if (feature.value_count.min == feature.value_count.max
                and feature.value_count.min == 1):
            # Case 1: value_count.min == value_count.max == 1.  Infer a DenseTensor
            # with rank 0 and a default value.
            logging.info(
                "Feature %s has value_count.min == value_count.max == 1. Setting to "
                "DenseTensor.", feature.name)
            result[feature.name] = schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name=feature.name,
                    shape=schema_pb2.FixedShape(),
                    default_value=_LegacyInferDefaultValue(feature)))

        elif (feature.value_count.min == feature.value_count.max
              and feature.value_count.min > 1):
            # Case 2: value_count.min == value_count.max > 1.  Infer a DenseTensor
            # with rank 1 and a default value.
            shape = schema_pb2.FixedShape(
                dim=[schema_pb2.FixedShape.Dim(size=feature.value_count.min)])
            logging.info(
                "Feature %s has value_count.min == value_count.max > 1. Setting to "
                "DenseTensor.", feature.name)
            result[feature.name] = schema_pb2.TensorRepresentation(
                dense_tensor=schema_pb2.TensorRepresentation.DenseTensor(
                    column_name=feature.name,
                    shape=shape,
                    default_value=_LegacyInferDefaultValue(feature)))

        else:
            # Case 3: Either value_count.min != value_count.max or
            # value_count.min == value_count.max == 0.  Infer a VarLenSparseTensor.
            logging.info(
                "Feature %s has value_count.min != value_count.max or "
                "value_count.min == value_count.max == 0. "
                "Setting to VarLenSparseTensor.", feature.name)
            result[feature.name] = schema_pb2.TensorRepresentation(
                varlen_sparse_tensor=(
                    schema_pb2.TensorRepresentation.VarLenSparseTensor(
                        column_name=feature.name)))

    return result
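
To make the three heuristic cases concrete, here is a small sketch (assuming the helpers referenced above, such as _ShouldIncludeFeature and _LegacyInferDefaultValue, are in scope; the feature names are invented):

schema = schema_pb2.Schema()
for name, (mn, mx) in [("scalar_f", (1, 1)),   # case 1 -> scalar DenseTensor
                       ("vector_f", (3, 3)),   # case 2 -> DenseTensor [3]
                       ("ragged_f", (0, 5))]:  # case 3 -> VarLenSparseTensor
    f = schema.feature.add(name=name, type=schema_pb2.INT)
    f.value_count.min = mn
    f.value_count.max = mx
reps = _LegacyInferTensorRepresentationFromSchema(schema)
assert reps["scalar_f"].HasField("dense_tensor")
assert reps["vector_f"].dense_tensor.shape.dim[0].size == 3
assert reps["ragged_f"].HasField("varlen_sparse_tensor")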
def _InferSparseTensorRepresentationsFromSchema(
    schema: schema_pb2.Schema
) -> Tuple[Dict[Text, schema_pb2.TensorRepresentation],
           List[schema_pb2.Feature]]:
    """Infers SparseTensor TensorRepresentation from the given schema."""
    columns_remaining = {f.name: f for f in schema.feature}
    sparse_tensor_representations = {}
    for sparse_feature in schema.sparse_feature:
        if not _ShouldIncludeFeature(sparse_feature):
            continue
        index_keys = [
            index_feature.name
            for index_feature in sparse_feature.index_feature
        ]
        index_features = []
        for index_key in index_keys:
            try:
                index_features.append(columns_remaining.pop(index_key))
            except KeyError:
                raise ValueError(
                    "sparse_feature {} referred to index feature {} which did not "
                    "exist in the schema".format(sparse_feature.name,
                                                 index_key))

        if len(index_features) != 1:
            raise ValueError(
                "sparse_feature {} had rank {} but currently only rank 1"
                " sparse features are supported".format(
                    sparse_feature.name, len(index_features)))

        value_key = sparse_feature.value_feature.name
        try:
            columns_remaining.pop(value_key)
        except KeyError:
            raise ValueError(
                "sparse_feature {} referred to value feature {} which did not "
                "exist in the schema or was referred to as an index or value multiple "
                "times.".format(sparse_feature.name, value_key))

        if index_features[0].HasField("int_domain"):
            # Currently we only handle 0-based INT index features, whose
            # minimum domain value must be zero.
            if not index_features[0].int_domain.HasField("min"):
                raise ValueError(
                    "Cannot determine dense shape of sparse feature "
                    "{}. The minimum domain value of index feature {}"
                    " is not set.".format(sparse_feature.name, index_keys[0]))
            if index_features[0].int_domain.min != 0:
                raise ValueError(
                    "Only 0-based index features are supported. Sparse "
                    "feature {} has index feature {} whose minimum "
                    "domain value is {}.".format(
                        sparse_feature.name, index_keys[0],
                        index_features[0].int_domain.min))

            if not index_features[0].int_domain.HasField("max"):
                raise ValueError(
                    "Cannot determine dense shape of sparse feature "
                    "{}. The maximum domain value of index feature {}"
                    " is not set.".format(sparse_feature.name, index_keys[0]))
            shape = schema_pb2.FixedShape(dim=[
                schema_pb2.FixedShape.Dim(
                    size=index_features[0].int_domain.max + 1)
            ])
        else:
            raise ValueError(
                "Cannot determine dense shape of sparse feature {}."
                " The index feature {} had no int_domain set.".format(
                    sparse_feature.name, index_keys[0]))

        sparse_tensor_representations[sparse_feature.name] = (
            schema_pb2.TensorRepresentation(
                sparse_tensor=schema_pb2.TensorRepresentation.SparseTensor(
                    dense_shape=shape,
                    index_column_names=index_keys,
                    value_column_name=value_key)))

    return sparse_tensor_representations, list(columns_remaining.values())
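
Finally, a compact sketch of the input this function expects (a hedged example; assumes the helpers above are in scope and the feature names are invented): a rank-1 sparse feature whose index column carries a 0-based int_domain, which fixes the dense shape.

schema = schema_pb2.Schema()
idx = schema.feature.add(name="idx", type=schema_pb2.INT)
idx.int_domain.min = 0
idx.int_domain.max = 9  # dense shape becomes [10]
schema.feature.add(name="val", type=schema_pb2.FLOAT)
sp = schema.sparse_feature.add(name="sp")
sp.index_feature.add(name="idx")
sp.value_feature.name = "val"

reps, remaining = _InferSparseTensorRepresentationsFromSchema(schema)
assert reps["sp"].sparse_tensor.dense_shape.dim[0].size == 10
assert remaining == []  # both columns were consumed by the sparse feature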