예제 #1
0
    def get_features_spec(self):
        """
        Define the features spec from the feature_config.
        This will be used to parse the serialized TFRecord

        Returns
        -------
        dict
            Feature specification dictionary that can be used to parse
            Context features from the serialized SequenceExample
        dict
            Feature specification dictionary that can be used to parse
            Sequence features (or feature lists) from the serialized SequenceExample
        """

        context_features_spec = dict()
        sequence_features_spec = dict()

        for feature_info in self.feature_config.get_all_features():
            if feature_info.get("name") == self.feature_config.get_mask(
                    "name"):
                continue
            serving_info = feature_info["serving_info"]
            if not self.required_fields_only or serving_info.get(
                    "required", feature_info["trainable"]):
                feature_name = feature_info["name"]
                dtype = feature_info["dtype"]
                default_value = self.feature_config.get_default_value(
                    feature_info)
                if feature_info[
                        "tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                    context_features_spec[feature_name] = io.FixedLenFeature(
                        [], dtype, default_value=default_value)
                elif feature_info[
                        "tfrecord_type"] == SequenceExampleTypeKey.SEQUENCE:
                    sequence_features_spec[feature_name] = io.VarLenFeature(
                        dtype=dtype)
                else:
                    raise KeyError("Invalid SequenceExample type: {}".format(
                        feature_info["tfrecord_type"]))

        return context_features_spec, sequence_features_spec
예제 #2
0
    def get_features_spec(self):
        """
        Define the features spec from the feature_config.
        This will be used to parse the serialized TFRecord

        Returns
        -------
        dict
            feature specification dictionary that can be used to parse TFRecords
        """
        features_spec = dict()

        for feature_info in self.feature_config.get_all_features():
            serving_info = feature_info["serving_info"]
            if not self.required_fields_only or serving_info.get(
                    "required", feature_info["trainable"]) or feature_info["trainable"]:
                feature_name = feature_info["name"]
                dtype = feature_info["dtype"]
                default_value = self.feature_config.get_default_value(feature_info)
                features_spec[feature_name] = io.FixedLenFeature(
                    [], dtype, default_value=default_value)

        return features_spec
예제 #3
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : `PreprocessingMap` object
        map of preprocessing feature functions
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized Example message and extracts a feature dictionary
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value
            )
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Parameters
        ----------
        example_proto : string
            serialized tfrecord Example protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto, features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
예제 #4
0
def make_sequence_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    max_sequence_size: int = 25,
    required_fields_only: bool = False,
    pad_sequence: bool = True,
) -> tf.function:
    """
    Create a parse function using the SequenceExample features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : int
        map of preprocessing feature functions
    max_sequence_size : int
        Maximum number of sequence per query. Used for padding
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config
    pad_sequence : bool
        Whether to pad sequence

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized SequenceExample message
        and extracts a feature dictionary for context and sequence features
    """

    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            if feature_info["tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                context_features_spec[feature_name] = io.FixedLenFeature(
                    [], dtype, default_value=default_value
                )
            elif feature_info["tfrecord_type"] == SequenceExampleTypeKey.SEQUENCE:
                sequence_features_spec[feature_name] = io.VarLenFeature(dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Parameters
        ----------
        sequence_example_proto : string
            serialized tfrecord SequenceExample protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Handle context features
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = context_features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        # Define mask to identify padded sequence
        if required_fields_only and not feature_config.get_rank("serving_info")["required"]:
            """
            Define dummy mask if the rank field is not a required field for serving

            NOTE:
            This masks all max_sequence_size as 1 as there is no real way to know
            the number of sequence in the query. There is no predefined required field,
            and hence we would need to do a full pass of all features to find the record shape.
            This approach might be unstable if different features have different shapes.

            Hence we just mask all sequence
            """
            features_dict["mask"] = tf.constant(
                value=1, shape=[max_sequence_size], dtype=feature_config.get_rank("dtype")
            )
            sequence_size = tf.constant(max_sequence_size, dtype=tf.int64)
        else:
            # Typically used at training time, to pad/clip to a fixed number of sequence per query

            # Use rank as a reference tensor to infer shape/sequence_size in query
            reference_tensor = sequence_features.get(feature_config.get_rank(key="node_name"))

            # Add mask for identifying padded sequence
            mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(reference_tensor)))
            sequence_size = tf.cast(tf.reduce_sum(mask), tf.int64)

            if pad_sequence:
                mask = tf.expand_dims(mask, axis=-1)

                def crop_fn():
                    tf.print("\n[WARN] Bad query found. Number of sequence : ", tf.shape(mask)[1])
                    return image.crop_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    )

                mask = tf.cond(
                    tf.shape(mask)[1] <= max_sequence_size,
                    # Pad if there are missing sequence
                    lambda: image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    ),
                    # Crop if there are extra sequence
                    crop_fn,
                )
                mask = tf.squeeze(mask)
            else:
                mask = tf.squeeze(mask, axis=0)

            # Check validity of mask
            tf.debugging.assert_greater(sequence_size, tf.constant(0, dtype=tf.int64))

            features_dict["mask"] = mask
            sequence_size = max_sequence_size if pad_sequence else sequence_size

        # Pad sequence features to max_sequence_size
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.fill(
                value=tf.constant(
                    value=feature_config.get_default_value(feature_info),
                    dtype=feature_info["dtype"],
                ),
                dims=[max_sequence_size if pad_sequence else sequence_size],
            )
            feature_tensor = sequence_features.get(feature_info["name"], default_tensor)

            if isinstance(feature_tensor, sparse.SparseTensor):
                feature_tensor = sparse.reset_shape(
                    feature_tensor,
                    new_shape=[1, max_sequence_size if pad_sequence else sequence_size],
                )
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_sequence_example_fn
예제 #5
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Args:
        feature_config: FeatureConfig object defining context and sequence features
        max_sequence_size: Maximum number of sequence per query. Used for padding.
        required_fields_only: Whether to only use required fields from the feature_config
        pad_sequence: Whether to pad sequence
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get(
                "required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value)
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            example_proto: tfrecord Example protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto,
                                           features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info,
                                                preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
예제 #6
0
 def get_template():
     return dict(image=io.FixedLenFeature([], 'string'),
                 label=io.FixedLenFeature([], 'int64'))
예제 #7
0
def make_parse_fn(feature_config: FeatureConfig,
                  max_num_records: int = 25) -> tf.function:
    """Create a parse function using the context and sequence features spec"""

    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature_info in feature_config.get_all_features():
        feature_name = feature_info["name"]
        feature_node_name = feature_info.get("node_name", feature_name)
        dtype = tf.float32
        default_value: Optional[Union[float, str]] = None
        if feature_info["dtype"] == "float":
            dtype = tf.float32
            default_value = 0.0
        elif feature_info["dtype"] == "int":
            dtype = tf.int64
            default_value = 0
        elif feature_info["dtype"] == "bytes":
            dtype = tf.string
            default_value = ""
        else:
            raise Exception("Unknown dtype {} for {}".format(
                feature_info["dtype"], feature_name))
        if feature_info["tfrecord_type"] == TFRecordTypeKey.CONTEXT:
            context_features_spec[feature_node_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value)
        elif feature_info["tfrecord_type"] == TFRecordTypeKey.SEQUENCE:
            sequence_features_spec[feature_node_name] = io.VarLenFeature(
                dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            sequence_example_proto: tfrecord SequenceExample protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Explode context features into all records
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])
            feature_layer_info = feature_info.get("feature_layer_info")

            feature_tensor = context_features.get(feature_node_name)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)
            feature_tensor = tf.tile(feature_tensor,
                                     multiples=[max_num_records])

            # If feature is a string, then decode into numbers
            if feature_layer_info["type"] == FeatureTypeKey.STRING:
                feature_tensor = io.decode_raw(
                    feature_tensor,
                    out_type=tf.uint8,
                    fixed_length=feature_layer_info["max_length"],
                )
                feature_tensor = tf.cast(feature_tensor, tf.float32)

            features_dict[feature_node_name] = feature_tensor

        # Pad sequence features to max_num_records
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])
            feature_layer_info = feature_info["feature_layer_info"]

            feature_tensor = sequence_features.get(feature_node_name)

            if isinstance(feature_tensor, sparse.SparseTensor):
                if feature_node_name == feature_config.get_rank(
                        key="node_name"):
                    # Add mask for identifying padded records
                    mask = tf.ones_like(
                        sparse.to_dense(sparse.reset_shape(feature_tensor)))
                    mask = tf.expand_dims(mask, axis=2)

                    def crop_fn():
                        tf.print(
                            "\n[WARN] Bad query found. Number of records : ",
                            tf.shape(mask)[1])
                        return image.crop_to_bounding_box(
                            mask,
                            offset_height=0,
                            offset_width=0,
                            target_height=1,
                            target_width=max_num_records,
                        )

                    mask = tf.cond(
                        tf.shape(mask)[1] < max_num_records,
                        # Pad if there are missing records
                        lambda: image.pad_to_bounding_box(
                            mask,
                            offset_height=0,
                            offset_width=0,
                            target_height=1,
                            target_width=max_num_records,
                        ),
                        # Crop if there are extra records
                        crop_fn,
                    )
                    mask = tf.squeeze(mask)

                    # Check validity of mask
                    tf.debugging.assert_greater(
                        tf.cast(tf.reduce_sum(mask), tf.float32),
                        tf.constant(0.0))

                    features_dict["mask"] = mask

                feature_tensor = sparse.reset_shape(
                    feature_tensor, new_shape=[1, max_num_records])
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor)

                # If feature is a string, then decode into numbers
                if feature_layer_info["type"] == FeatureTypeKey.STRING:
                    feature_tensor = io.decode_raw(
                        feature_tensor,
                        out_type=tf.uint8,
                        fixed_length=feature_layer_info["max_length"],
                    )
                    feature_tensor = tf.cast(feature_tensor, tf.float32)
            else:
                raise ValueError("Invalid input : {}".format(feature_name))

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        # Check if label is one-hot and correctly masked
        tf.debugging.assert_equal(tf.cast(tf.reduce_sum(labels), tf.float32),
                                  tf.constant(1.0))

        return features_dict, labels

    return _parse_sequence_example_fn
예제 #8
0
def make_parse_fn(feature_config: Features,
                  max_num_records: int = 25) -> tf.function:
    """Create a parse function using the context and sequence features spec"""

    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature, feature_info in feature_config.get_dict().items():
        # FIXME(ashish) - without this next guard we break if there are masks.
        if "node_name" in feature_info and feature_info["node_name"] == "mask":
            continue
        tfrecord_info = feature_info["tfrecord_info"]
        dtype = tf.float32
        default_value: Optional[Union[float, str]] = None
        if tfrecord_info["dtype"] == "float":
            dtype = tf.float32
            default_value = 0.0
        elif tfrecord_info["dtype"] == "int":
            dtype = tf.int64
            default_value = 0
        elif tfrecord_info["dtype"] == "bytes":
            dtype = tf.string
            default_value = ""
        else:
            raise Exception("Unknown dtype {} for {}".format(
                tfrecord_info["dtype"], feature))
        if tfrecord_info["type"] == TFRecordTypeKey.CONTEXT:
            context_features_spec[feature] = io.FixedLenFeature(
                [], dtype, default_value=default_value)
        elif tfrecord_info["type"] == TFRecordTypeKey.SEQUENCE:
            sequence_features_spec[feature] = io.VarLenFeature(dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            sequence_example_proto: tfrecord SequenceExample protobuf data

        Returns:
            TODO(ashish): note - "features" is not a Features object.  It's a {feat_name: tf.Tensor} mapping
            (so perhaps a bad name?)
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        context, examples = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features = dict()

        # Explode context features into all records
        for feat, t in context.items():
            t = tf.expand_dims(t, axis=0)
            t = tf.tile(t, multiples=[max_num_records])

            # If feature is a string, then decode into numbers
            if feature_config.get_dict(
            )[feat]["type"] == FeatureTypeKey.STRING:
                t = io.decode_raw(
                    t,
                    out_type=tf.uint8,
                    fixed_length=feature_config.get_dict()[feat]["max_length"],
                )
                t = tf.cast(t, tf.float32)

            features[feat] = t

        # Pad sequence features to max_num_records
        for feat, t in examples.items():
            if isinstance(t, sparse.SparseTensor):
                if feat == "pos":
                    # Add mask for identifying padded records
                    mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(t)))
                    mask = tf.expand_dims(mask, axis=2)
                    mask = image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_num_records,
                    )
                    features["mask"] = tf.squeeze(mask)

                t = sparse.reset_shape(t, new_shape=[1, max_num_records])
                t = sparse.to_dense(t)
                t = tf.squeeze(t)

                # If feature is a string, then decode into numbers
                if feature_config.get_dict(
                )[feat]["type"] == FeatureTypeKey.STRING:
                    t = io.decode_raw(
                        t,
                        out_type=tf.uint8,
                        fixed_length=feature_config.get_dict()[feat]
                        ["max_length"],
                    )
                    t = tf.cast(t, tf.float32)
            else:
                #
                # Handle dense tensors
                #
                # if len(t.shape) == 1:
                #     t = tf.expand_dims(t, axis=0)
                # if len(t.shape) == 2:
                #     t = tf.pad(t, paddings=[[0, 0], [0, max_num_records]])
                #     t = tf.squeeze(t)
                # else:
                #     raise Exception('Invalid input : {}'.format(feat))
                raise ValueError("Invalid input : {}".format(feat))

            features[feat] = t

        labels = features.pop(feature_config.label)
        return features, labels

    return _parse_sequence_example_fn