예제 #1
0
def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord files

    Parameters
    df : `pd.DataFrame`
        pandas DataFrame to be converted to TFRecordDataset
    tfrecord_file : str
        tfrecord file path to write the output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    logger : `Logger`, optional
        logging handler for status messages
    """

    if logger:
        logger.info(
            "Writing SequenceExample protobufs to : {}".format(tfrecord_file))
    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(
                key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as tfrecords write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
예제 #2
0
def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord data.
    Output data protobuf format -> train.SequenceExample

    Args:
        df: pandas DataFrame
        tfrecord_file: tfrecord file path to write the output
        feature_config: str path to YAML feature config or str YAML feature config
        tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE
        logger: logging object

    NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline
    """

    if logger:
        logger.info(
            "Writing SequenceExample protobufs to : {}".format(tfrecord_file))
    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(
                key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as tfrecords write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
예제 #3
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : `PreprocessingMap` object
        map of preprocessing feature functions
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized Example message and extracts a feature dictionary
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value
            )
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Parameters
        ----------
        example_proto : string
            serialized tfrecord Example protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto, features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
예제 #4
0
def make_sequence_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    max_sequence_size: int = 25,
    required_fields_only: bool = False,
    pad_sequence: bool = True,
) -> tf.function:
    """
    Create a parse function using the SequenceExample features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : int
        map of preprocessing feature functions
    max_sequence_size : int
        Maximum number of sequence per query. Used for padding
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config
    pad_sequence : bool
        Whether to pad sequence

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized SequenceExample message
        and extracts a feature dictionary for context and sequence features
    """

    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            if feature_info["tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                context_features_spec[feature_name] = io.FixedLenFeature(
                    [], dtype, default_value=default_value
                )
            elif feature_info["tfrecord_type"] == SequenceExampleTypeKey.SEQUENCE:
                sequence_features_spec[feature_name] = io.VarLenFeature(dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Parameters
        ----------
        sequence_example_proto : string
            serialized tfrecord SequenceExample protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Handle context features
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = context_features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        # Define mask to identify padded sequence
        if required_fields_only and not feature_config.get_rank("serving_info")["required"]:
            """
            Define dummy mask if the rank field is not a required field for serving

            NOTE:
            This masks all max_sequence_size as 1 as there is no real way to know
            the number of sequence in the query. There is no predefined required field,
            and hence we would need to do a full pass of all features to find the record shape.
            This approach might be unstable if different features have different shapes.

            Hence we just mask all sequence
            """
            features_dict["mask"] = tf.constant(
                value=1, shape=[max_sequence_size], dtype=feature_config.get_rank("dtype")
            )
            sequence_size = tf.constant(max_sequence_size, dtype=tf.int64)
        else:
            # Typically used at training time, to pad/clip to a fixed number of sequence per query

            # Use rank as a reference tensor to infer shape/sequence_size in query
            reference_tensor = sequence_features.get(feature_config.get_rank(key="node_name"))

            # Add mask for identifying padded sequence
            mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(reference_tensor)))
            sequence_size = tf.cast(tf.reduce_sum(mask), tf.int64)

            if pad_sequence:
                mask = tf.expand_dims(mask, axis=-1)

                def crop_fn():
                    tf.print("\n[WARN] Bad query found. Number of sequence : ", tf.shape(mask)[1])
                    return image.crop_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    )

                mask = tf.cond(
                    tf.shape(mask)[1] <= max_sequence_size,
                    # Pad if there are missing sequence
                    lambda: image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    ),
                    # Crop if there are extra sequence
                    crop_fn,
                )
                mask = tf.squeeze(mask)
            else:
                mask = tf.squeeze(mask, axis=0)

            # Check validity of mask
            tf.debugging.assert_greater(sequence_size, tf.constant(0, dtype=tf.int64))

            features_dict["mask"] = mask
            sequence_size = max_sequence_size if pad_sequence else sequence_size

        # Pad sequence features to max_sequence_size
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.fill(
                value=tf.constant(
                    value=feature_config.get_default_value(feature_info),
                    dtype=feature_info["dtype"],
                ),
                dims=[max_sequence_size if pad_sequence else sequence_size],
            )
            feature_tensor = sequence_features.get(feature_info["name"], default_tensor)

            if isinstance(feature_tensor, sparse.SparseTensor):
                feature_tensor = sparse.reset_shape(
                    feature_tensor,
                    new_shape=[1, max_sequence_size if pad_sequence else sequence_size],
                )
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_sequence_example_fn
예제 #5
0
파일: serving.py 프로젝트: darshshah/ml4ir
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Add signatures to the tf keras savedmodel

    Returns:
        Serving signature function that accepts a TFRecord string tensor and returns predictions
    """

    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name", include_label=False)

    """
    NOTE:
    Setting pad_sequence=False for tfrecord signature as it is used at inference time
    and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single query
    at a time given the current implementation. This is a tricky issue to fix because
    there is no real way to generate a dense tensor of ranking scores from different queries,
    as they might have varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size) for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(i, feature_val)

            i += 1

            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition, body=loop_body, loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord
예제 #6
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Args:
        feature_config: FeatureConfig object defining context and sequence features
        max_sequence_size: Maximum number of sequence per query. Used for padding.
        required_fields_only: Whether to only use required fields from the feature_config
        pad_sequence: Whether to pad sequence
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get(
                "required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value)
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            example_proto: tfrecord Example protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto,
                                           features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info,
                                                preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
예제 #7
0
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Serving signature that wraps around the keras model trained as a RelevanceModel
    with a pre-step to parse TFRecords and apply additional feature preprocessing

    Parameters
    ----------
    model : keras Model
        Keras model object to be saved
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf that the saved model will be used on at serving time
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the input features into the model
        and the corresponding feature preprocesing functions to be used
        in the serving signature
    preprocessing_keys_to_fns : dict
        Dictionary mapping function names to tf.functions that should be saved in the preprocessing step of the tfrecord serving signature
    postprocessing_fn: function
        custom tensorflow compatible postprocessing function to be used at serving time.
        Saved as part of the postprocessing layer of the tfrecord serving signature
    required_fields_only: bool
        boolean value defining if only required fields
        need to be added to the tfrecord parsing function at serving time
    pad_sequence: bool, optional
        Value defining if sequences should be padded for SequenceExample proto inputs at serving time.
        Set this to False if you want to not handle padded scores.
    max_sequence_size : int, optional
        Maximum sequence size for SequenceExample protobuf
        The protobuf object will be padded or clipped to this value

    Returns
    -------
    `tf.function`
        Serving signature function that accepts a TFRecord string tensor and returns predictions
    """

    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name",
                                             include_label=False)
    """
    NOTE:
    Setting pad_sequence=False for tfrecord signature as it is used at inference time
    and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single query
    at a time given the current implementation. This is a tricky issue to fix because
    there is no real way to generate a dense tensor of ranking scores from different queries,
    as they might have varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size)
            for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(
                    i, feature_val)

            i += 1

            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition,
            body=loop_body,
            loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord