Пример #1
0
def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord files

    Parameters
    df : `pd.DataFrame`
        pandas DataFrame to be converted to TFRecordDataset
    tfrecord_file : str
        tfrecord file path to write the output
    feature_config : `FeatureConfig`
        FeatureConfig object that defines the features to be loaded in the dataset
        and the preprocessing functions to be applied to each of them
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf message to be used for TFRecordDataset
    logger : `Logger`, optional
        logging handler for status messages
    """

    if logger:
        logger.info(
            "Writing SequenceExample protobufs to : {}".format(tfrecord_file))
    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(
                key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as tfrecords write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
Пример #2
0
def write_from_df(
    df: DataFrame,
    tfrecord_file: str,
    feature_config: FeatureConfig,
    tfrecord_type: str,
    logger: Logger = None,
):
    """
    Converts data from CSV files into tfrecord data.
    Output data protobuf format -> train.SequenceExample

    Args:
        df: pandas DataFrame
        tfrecord_file: tfrecord file path to write the output
        feature_config: str path to YAML feature config or str YAML feature config
        tfrecord_type: TFRecordTypeKey.EXAMPLE or TFRecordTypeKey.SEQUENCE_EXAMPLE
        logger: logging object

    NOTE: This method should be moved out of ml4ir and into the preprocessing pipeline
    """

    if logger:
        logger.info(
            "Writing SequenceExample protobufs to : {}".format(tfrecord_file))
    with io.TFRecordWriter(tfrecord_file) as tf_writer:
        if tfrecord_type == TFRecordTypeKey.EXAMPLE:
            protos = df.apply(
                lambda row: get_example_proto(
                    row=row, features=feature_config.get_all_features()),
                axis=1,
            )
        elif tfrecord_type == TFRecordTypeKey.SEQUENCE_EXAMPLE:
            # Group pandas dataframe on query_id/query key and
            # convert each group to a single sequence example proto
            context_feature_names = feature_config.get_context_features(
                key="name")
            protos = df.groupby(context_feature_names).apply(
                lambda g: get_sequence_example_proto(
                    group=g,
                    context_features=feature_config.get_context_features(),
                    sequence_features=feature_config.get_sequence_features(),
                ))
        else:
            raise Exception("You have entered {} as tfrecords write mode. "
                            "We only support {} and {}.".format(
                                tfrecord_type, TFRecordTypeKey.EXAMPLE,
                                TFRecordTypeKey.SEQUENCE_EXAMPLE))
        # Write to disk
        for proto in protos:
            tf_writer.write(proto.SerializeToString())
Пример #3
0
    def test_linear_ranking_model_save(self):
        """
        Test the save functionality of LinearRankingModel.
        Specifically, we test to see if the features and coefficients have been saved as CSV file.
        """
        feature_config_path = os.path.join(self.root_data_dir, "configs/linear_model", self.feature_config_fname)
        self.load_model_config(os.path.join(self.root_data_dir, "configs/linear_model", "model_config.yaml"))
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=["MRR"]
        )

        # Save the model and check if coefficients file was saved
        ranking_model.save(models_dir=self.args.models_dir)
        assert os.path.exists(os.path.join(self.args.models_dir, "coefficients.csv"))

        # Check coefficients for all features were saved
        coefficients_df = pd.read_csv(
            os.path.join(self.args.models_dir, "coefficients.csv"))
        train_features = set(feature_config.get_train_features("node_name"))

        assert len(train_features) == coefficients_df.shape[0]
        for train_feature in train_features:
            assert train_feature in coefficients_df.feature.values
Пример #4
0
    def get_ranking_dataset(self, data_dir: str, data_format: str,
                            feature_config_path: str):

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        return relevance_dataset
Пример #5
0
    def __init__(self,
                 feature_config: FeatureConfig,
                 metadata_features: Dict,
                 state: str = MetricState.NEW,
                 name="MeanRankMetric",
                 dtype: Optional[dtypes.DType] = None,
                 **kwargs):
        """
        Creates a `MeanRankMetric` instance to compute mean of rank

        Parameters
        ----------
        name : str
            string name of the metric instance.
        dtype : str, optional
            data type of the metric result.
        rank : Tensor object
            2D tensor representing ranks/rankitions of records in a query
        mask : Tensor object
            2D tensor representing 0/1 mask for padded records

        Notes
        -----
        rank and mask should be same shape as y_pred and y_true

        This metric creates two local variables, `total` and `count` that are used to
        compute the frequency with which `y_pred` matches `y_true`. This frequency is
        ultimately returned as `categorical accuracy`: an idempotent operation that
        simply divides `total` by `count`.
        `y_pred` and `y_true` should be passed in as vectors of probabilities, rather
        than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector.
        If `sample_weight` is `None`, weights default to 1.
        Use `sample_weight` of 0 to mask values.
        """
        name = "{}_{}".format(state, name)
        # TODO: Handle Example dataset without mask and rank fields
        rank = tf.squeeze(
            metadata_features[feature_config.get_rank("node_name")], axis=-1)
        mask = tf.squeeze(
            metadata_features[feature_config.get_mask("node_name")], axis=-1)

        super(MeanRankMetric, self).__init__(self._compute,
                                             name,
                                             dtype=dtype,
                                             rank=rank,
                                             mask=mask)
        self.state = state
Пример #6
0
    def get_feature_config(self):
        feature_config_path = os.path.join(self.root_data_dir, "config", self.feature_config_fname)

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        return feature_config
Пример #7
0
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Train a model with the default set of args"""
        metrics_keys = ["MRR"]

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(123)
        tf.random.set_seed(123)
        random.seed(123)

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys)

        ranking_model.fit(dataset=relevance_dataset,
                          num_epochs=1,
                          models_dir=self.output_dir)

        loss = dict(
            zip(
                ranking_model.model.metrics_names,
                ranking_model.model.evaluate(relevance_dataset.test),
            ))["loss"]
        new_MRR = ranking_model.evaluate(
            test_dataset=relevance_dataset.test,
            logs_dir=self.args.logs_dir,
        )[0]["new_MRR"]

        return loss, new_MRR
Пример #8
0
    def test_drop_out_layers(self):
        feature_config = FeatureConfig(
            yaml.safe_load('''
            query_key:
              name: query_key
              node_name: query_key
              trainable: false
              dtype: string
            label:
              name: entity_id
              feature_layer_info:
                type: numeric
                fn: categorical_indicator_with_vocabulary_file
                args:
                  vocabulary_file: ml4ir/applications/classification/tests/data/configs/vocabulary/entity_id.csv
            features:
              - name: query_text
                trainable: false
                dtype: string
        '''))

        model_info = yaml.safe_load('''
            architecture_key: dnn
            layers:
              - type: dense
                name: first_dense
                units: 256
                activation: relu
              - type: dropout
                name: first_dropout
                rate: 0.3
              - type: dense
                name: second_dense
                units: 64
                activation: relu
              - type: dropout
                name: second_dropout
                rate: 0.0
              - type: dense
                name: final_dense
                activation: null
        ''')

        dnn = DNN(model_info, feature_config, self.file_io)
        assert (len(dnn.layer_ops) == 5)
        assert (dnn.layer_ops[0].get_config()['units'] == 256)
        assert (dnn.layer_ops[1].get_config()['rate'] == 0.3)
        assert (dnn.layer_ops[2].get_config()['units'] == 64)
        assert (dnn.layer_ops[3].get_config()['rate'] == 0.0)
        assert (dnn.layer_ops[4].get_config()['units'] == 9)
Пример #9
0
    def get_ranking_dataset_and_model(self,
                                      seed=123,
                                      initialize_layers_dict={},
                                      freeze_layers_list=[]):
        """Helper method to get a RankingModel and Dataset with some default args"""
        data_dir = os.path.join(self.root_data_dir, DataFormatKey.TFRECORD)
        feature_config_path = os.path.join(self.root_data_dir, "configs",
                                           self.feature_config_fname)
        data_format = DataFormatKey.TFRECORD
        metrics_keys = [MetricKey.MRR]

        # Fix random seed values for repeatability
        tf.keras.backend.clear_session()
        np.random.seed(seed)
        tf.random.set_seed(seed)
        random.seed(seed)

        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys,
            initialize_layers_dict=initialize_layers_dict,
            freeze_layers_list=freeze_layers_list,
        )

        return ranking_model, relevance_dataset
Пример #10
0
    def setUp(self):
        file_io = LocalIO()
        logger = logging.getLogger()

        self.dataset = tf.data.TFRecordDataset(DATASET_PATH)
        self.proto = next(iter(self.dataset))
        self.feature_config = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.EXAMPLE,
            feature_config_dict=file_io.read_yaml(FEATURE_CONFIG_PATH),
            logger=logger,
        )
        self.parser = TFRecordExampleParser(
            feature_config=self.feature_config,
            preprocessing_map=PreprocessingMap(),
            required_fields_only=False,
        )
Пример #11
0
def main(args):
    """Convert CSV files into tfrecord Example/SequenceExample files"""
    # Setup logging
    logger: Logger = setup_logging()
    file_io = LocalIO(logger)

    # Get all CSV files to be converted, depending on user's arguments
    if args.csv_dir:
        csv_files: List[str] = file_io.get_files_in_directory(
            indir=args.csv_dir, extension="*.csv")
    else:
        csv_files: List[str] = args.csv_files

    # Load feat config

    feature_config: FeatureConfig = FeatureConfig.get_instance(
        tfrecord_type=MODES[args.tfmode],
        feature_config_dict=file_io.read_yaml(args.feature_config),
        logger=logger,
    )

    # Convert to TFRecord SequenceExample protobufs and save
    if args.keep_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            tfrecord_file: str = os.path.basename(csv_file).replace(".csv", "")
            tfrecord_file: str = os.path.join(
                args.out_dir, "{}.tfrecord".format(tfrecord_file))
            write_from_files(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
                tfrecord_type=MODES[args.tfmode],
            )

    else:
        # Convert all CSV files at once - expensive groupby operation
        tfrecord_file: str = os.path.join(args.out_dir, "combined.tfrecord")
        write_from_files(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
            tfrecord_type=MODES[args.tfmode],
            file_io=file_io,
        )
Пример #12
0
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Train a model with the default set of args"""
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )
        data_dir = os.path.join(self.root_data_dir, "tfrecord")
        data_format = "tfrecord"

        metrics_keys = ["categorical_accuracy", "MRR", "ACR"]

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=self.args.loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys)

        overall_metrics, _ = ranking_model.evaluate(
            test_dataset=relevance_dataset.test,
            logs_dir=self.args.logs_dir,
        )

        return overall_metrics.to_dict()
Пример #13
0
    def run_default_pipeline(self, loss_key: str):
        """Train a model with the default set of args"""
        feature_config_path = os.path.join(self.root_data_dir, "configs",
                                           self.feature_config_fname)
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=self.args.tfrecord_type,
            feature_config_dict=self.file_io.read_yaml(feature_config_path),
            logger=self.logger,
        )
        data_dir = os.path.join(self.root_data_dir, "tfrecord")
        data_format = "tfrecord"

        metrics_keys = ["MRR"]

        relevance_dataset = RelevanceDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            tfrecord_type=self.args.tfrecord_type,
            max_sequence_size=self.args.max_sequence_size,
            batch_size=self.args.batch_size,
            preprocessing_keys_to_fns={},
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            use_part_files=self.args.use_part_files,
            parse_tfrecord=True,
            file_io=self.file_io,
            logger=self.logger,
        )

        ranking_model: RankingModel = self.get_ranking_model(
            loss_key=loss_key,
            feature_config=feature_config,
            metrics_keys=metrics_keys)

        metrics = ranking_model.model.evaluate(relevance_dataset.test)
        return dict(zip(ranking_model.model.metrics_names, metrics))["loss"]
Пример #14
0
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        self.optimizer_key: str = self.args.optimizer_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Пример #15
0
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Add signatures to the tf keras savedmodel

    Returns:
        Serving signature function that accepts a TFRecord string tensor and returns predictions
    """

    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name", include_label=False)

    """
    NOTE:
    Setting pad_sequence=False for tfrecord signature as it is used at inference time
    and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single query
    at a time given the current implementation. This is a tricky issue to fix because
    there is no real way to generate a dense tensor of ranking scores from different queries,
    as they might have varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size) for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(i, feature_val)

            i += 1

            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition, body=loop_body, loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord
Пример #16
0
    def __init__(self, args: Namespace):
        """
        Constructor to create a RelevancePipeline object to train, evaluate
        and save a model on ml4ir.
        This method sets up data, logs, models directories, file handlers used.
        The method also loads and sets up the FeatureConfig for the model training
        pipeline

        Parameters
        ----------
        args: argparse Namespace
            arguments to be used with the pipeline.
            Typically, passed from command line arguments
        """
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        # Setup directories
        self.local_io = LocalIO()
        self.models_dir_hdfs = None
        self.logs_dir_hdfs = None
        self.data_dir_hdfs = None
        if self.args.file_handler == FileHandlerKey.SPARK:
            self.models_dir = os.path.join(self.args.models_dir, self.run_id)
            self.logs_dir = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir = self.args.data_dir

            self.models_dir_local = os.path.join(DefaultDirectoryKey.MODELS,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(DefaultDirectoryKey.LOGS,
                                               self.run_id)
            self.data_dir_local = os.path.join(DefaultDirectoryKey.TEMP_DATA,
                                               os.path.basename(self.data_dir))
        else:
            self.models_dir_local = os.path.join(self.args.models_dir,
                                                 self.run_id)
            self.logs_dir_local = os.path.join(self.args.logs_dir, self.run_id)
            self.data_dir_local = self.args.data_dir

        # Setup logging
        self.local_io.make_directory(self.logs_dir_local, clear_dir=True)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir_local))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.debug("CLI args: \n{}".format(
            json.dumps(vars(self.args), indent=4)))
        self.local_io.set_logger(self.logger)
        self.local_io.make_directory(self.models_dir_local, clear_dir=False)
        self.model_file = self.args.model_file

        # Set the file handlers and respective setup
        if self.args.file_handler == FileHandlerKey.LOCAL:
            self.file_io = self.local_io
        elif self.args.file_handler == FileHandlerKey.SPARK:
            self.file_io = SparkIO(self.logger)

            # Copy data dir from HDFS to local file system
            self.local_io.make_directory(
                dir_path=DefaultDirectoryKey.TEMP_DATA, clear_dir=True)
            self.file_io.copy_from_hdfs(self.data_dir,
                                        DefaultDirectoryKey.TEMP_DATA)

            # Copy model_file if present from HDFS to local file system
            if self.model_file:
                self.local_io.make_directory(
                    dir_path=DefaultDirectoryKey.TEMP_MODELS, clear_dir=True)
                self.file_io.copy_from_hdfs(self.model_file,
                                            DefaultDirectoryKey.TEMP_MODELS)
                self.model_file = os.path.join(
                    DefaultDirectoryKey.TEMP_MODELS,
                    os.path.basename(self.model_file))

        # Read/Parse model config YAML
        self.model_config_file = self.args.model_config

        # Setup other arguments
        self.loss_key: str = self.args.loss_key
        if self.args.metrics_keys[0] == "[":
            self.metrics_keys: List[str] = ast.literal_eval(
                self.args.metrics_keys)
        else:
            self.metrics_keys = [self.args.metrics_keys]
        self.data_format: str = self.args.data_format
        self.tfrecord_type: str = self.args.tfrecord_type

        if args.data_format == DataFormatKey.RANKLIB:
            try:
                self.non_zero_features_only = self.args.non_zero_features_only
                self.keep_additional_info = self.args.keep_additional_info
            except KeyError:
                self.non_zero_features_only = 0
                self.keep_additional_info = 0
        else:
            self.non_zero_features_only = 0
            self.keep_additional_info = 0

        if args.model_file:
            self.model_file = args.model_file
        else:
            self.model_file = None

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = FeatureConfig.get_instance(
            feature_config_dict=self.file_io.read_yaml(
                self.args.feature_config),
            tfrecord_type=self.tfrecord_type,
            logger=self.logger,
        )

        # Finished initialization
        self.logger.info("Relevance Pipeline successfully initialized!")
Пример #17
0
    def __init__(
        self,
        feature_config: FeatureConfig,
        tfrecord_type: str,
        file_io: FileIO,
        scorer: Optional[ScorerBase] = None,
        metrics: List[Union[Type[kmetrics.Metric], str]] = [],
        optimizer: Optional[Optimizer] = None,
        model_file: Optional[str] = None,
        compile_keras_model: bool = False,
        output_name: str = "score",
        logger=None,
    ):
        """Use this constructor to define a custom scorer"""
        self.feature_config: FeatureConfig = feature_config
        self.logger: Logger = logger
        self.output_name = output_name
        self.scorer = scorer
        self.tfrecord_type = tfrecord_type
        self.file_io = file_io

        if scorer:
            self.max_sequence_size = scorer.interaction_model.max_sequence_size
        else:
            self.max_sequence_size = 0

        # Load/Build Model
        if model_file and not compile_keras_model:
            """
            If a model file is specified, load it without compiling into a keras model

            NOTE:
            This will allow the model to be only used for inference and
            cannot be used for retraining.
            """
            self.model: Model = self.load(model_file)
            self.is_compiled = False
        else:

            """
            Specify inputs to the model

            Individual input nodes are defined for each feature
            Each data point represents features for all records in a single query
            """
            inputs: Dict[str, Input] = feature_config.define_inputs()
            scores, train_features, metadata_features = scorer(inputs)

            # Create model with functional Keras API
            self.model = Model(inputs=inputs, outputs={self.output_name: scores})

            # Get loss fn
            loss_fn = scorer.loss.get_loss_fn(**metadata_features)

            # Get metric objects
            metrics_impl: List[Union[str, kmetrics.Metric]] = get_metrics_impl(
                metrics=metrics, feature_config=feature_config, metadata_features=metadata_features
            )

            # Compile model
            """
            NOTE:
            Related Github issue: https://github.com/tensorflow/probability/issues/519
            """
            self.model.compile(
                optimizer=optimizer,
                loss=loss_fn,
                metrics=metrics_impl,
                experimental_run_tf_function=False,
            )

            # Write model summary to logs
            model_summary = list()
            self.model.summary(print_fn=lambda x: model_summary.append(x))
            self.logger.info("\n".join(model_summary))

            if model_file:
                """
                If model file is specified, load the weights from the SavedModel

                NOTE:
                The architecture, loss and metrics of self.model need to
                be the same as the loaded SavedModel
                """
                self.load_weights(model_file)

            self.is_compiled = True
Пример #18
0
def make_sequence_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    max_sequence_size: int = 25,
    required_fields_only: bool = False,
    pad_sequence: bool = True,
) -> tf.function:
    """
    Create a parse function using the SequenceExample features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : int
        map of preprocessing feature functions
    max_sequence_size : int
        Maximum number of sequence per query. Used for padding
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config
    pad_sequence : bool
        Whether to pad sequence

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized SequenceExample message
        and extracts a feature dictionary for context and sequence features
    """

    context_features_spec = dict()
    sequence_features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            if feature_info["tfrecord_type"] == SequenceExampleTypeKey.CONTEXT:
                context_features_spec[feature_name] = io.FixedLenFeature(
                    [], dtype, default_value=default_value
                )
            elif feature_info["tfrecord_type"] == SequenceExampleTypeKey.SEQUENCE:
                sequence_features_spec[feature_name] = io.VarLenFeature(dtype=dtype)

    @tf.function
    def _parse_sequence_example_fn(sequence_example_proto):
        """
        Parse the input `tf.SequenceExample` proto using the features_spec

        Parameters
        ----------
        sequence_example_proto : string
            serialized tfrecord SequenceExample protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        context_features, sequence_features = io.parse_single_sequence_example(
            serialized=sequence_example_proto,
            context_features=context_features_spec,
            sequence_features=sequence_features_spec,
        )

        features_dict = dict()

        # Handle context features
        for feature_info in feature_config.get_context_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = context_features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        # Define mask to identify padded sequence
        if required_fields_only and not feature_config.get_rank("serving_info")["required"]:
            """
            Define dummy mask if the rank field is not a required field for serving

            NOTE:
            This masks all max_sequence_size as 1 as there is no real way to know
            the number of sequence in the query. There is no predefined required field,
            and hence we would need to do a full pass of all features to find the record shape.
            This approach might be unstable if different features have different shapes.

            Hence we just mask all sequence
            """
            features_dict["mask"] = tf.constant(
                value=1, shape=[max_sequence_size], dtype=feature_config.get_rank("dtype")
            )
            sequence_size = tf.constant(max_sequence_size, dtype=tf.int64)
        else:
            # Typically used at training time, to pad/clip to a fixed number of sequence per query

            # Use rank as a reference tensor to infer shape/sequence_size in query
            reference_tensor = sequence_features.get(feature_config.get_rank(key="node_name"))

            # Add mask for identifying padded sequence
            mask = tf.ones_like(sparse.to_dense(sparse.reset_shape(reference_tensor)))
            sequence_size = tf.cast(tf.reduce_sum(mask), tf.int64)

            if pad_sequence:
                mask = tf.expand_dims(mask, axis=-1)

                def crop_fn():
                    tf.print("\n[WARN] Bad query found. Number of sequence : ", tf.shape(mask)[1])
                    return image.crop_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    )

                mask = tf.cond(
                    tf.shape(mask)[1] <= max_sequence_size,
                    # Pad if there are missing sequence
                    lambda: image.pad_to_bounding_box(
                        mask,
                        offset_height=0,
                        offset_width=0,
                        target_height=1,
                        target_width=max_sequence_size,
                    ),
                    # Crop if there are extra sequence
                    crop_fn,
                )
                mask = tf.squeeze(mask)
            else:
                mask = tf.squeeze(mask, axis=0)

            # Check validity of mask
            tf.debugging.assert_greater(sequence_size, tf.constant(0, dtype=tf.int64))

            features_dict["mask"] = mask
            sequence_size = max_sequence_size if pad_sequence else sequence_size

        # Pad sequence features to max_sequence_size
        for feature_info in feature_config.get_sequence_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.fill(
                value=tf.constant(
                    value=feature_config.get_default_value(feature_info),
                    dtype=feature_info["dtype"],
                ),
                dims=[max_sequence_size if pad_sequence else sequence_size],
            )
            feature_tensor = sequence_features.get(feature_info["name"], default_tensor)

            if isinstance(feature_tensor, sparse.SparseTensor):
                feature_tensor = sparse.reset_shape(
                    feature_tensor,
                    new_shape=[1, max_sequence_size if pad_sequence else sequence_size],
                )
                feature_tensor = sparse.to_dense(feature_tensor)
                feature_tensor = tf.squeeze(feature_tensor, axis=0)

            # Preprocess features
            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_sequence_example_fn
Пример #19
0
def read(data_dir: str,
         feature_config: FeatureConfig,
         tfrecord_type: str,
         tfrecord_dir: str,
         file_io: FileIO,
         batch_size: int = 128,
         preprocessing_keys_to_fns: dict = {},
         use_part_files: bool = False,
         max_sequence_size: int = 25,
         parse_tfrecord: bool = True,
         logger=None,
         keep_additional_info=0,
         non_zero_features_only=1,
         **kwargs) -> tf.data.TFRecordDataset:
    """
    - reads ranklib-formatted data from an input directory
    - selects relevant features
    - creates Dataset X and y

    Current execution plan:
        1. Convert ranklib to a dataframe
        2. Convert each query into tf.train.SequenceExample protobufs
        3. Write the protobufs into a .tfrecord file
        4. Load .tfrecord file into a TFRecordDataset and parse the protobufs

        Parameters
        ----------
        data_dir: str
            Path to directory containing csv files to read
        feature_config: ml4ir.config.features.FeatureConfig object
            FeatureConfig object extracted from the feature config
        tfrecord_dir: str
            Path to directory where the serialized .tfrecord files will be stored
        batch_size: int
            Value specifying the size of the batch
        use_part_files: bool
            Value specifying whether to look for part files
        max_sequence_size: int
            Value specifying max number of records per query
        logger: logging object
            logging object
        keep_additional_info: int
            Option to keep additional info (All info after the "#") 1 to keep, 0 to ignore
        non_zero_features_only: int
            Only non zero features are stored. 1 for yes, 0 otherwise

        Returns
        -------
        tensorflow TFRecordDataset
            Processed dataset
    """
    ranklib_files: List[str] = file_io.get_files_in_directory(
        data_dir,
        extension="" if use_part_files else ".txt",
        prefix="part-" if use_part_files else "",
    )

    gl_2_clicks = False

    # Create a directory for storing tfrecord files
    file_io.make_directory(tfrecord_dir, clear_dir=True)

    #Convert input ranklib file to dataframe
    df = pd.concat([
        ranklib_helper.convert(f, keep_additional_info, gl_2_clicks,
                               non_zero_features_only,
                               feature_config.get_query_key()['name'],
                               feature_config.get_label()['name'])
        for f in ranklib_files
    ])

    #Write tfrecord files
    tfrecord_writer.write_from_df(df=df,
                                  tfrecord_file=os.path.join(
                                      tfrecord_dir, TFRECORD_FILE),
                                  feature_config=feature_config,
                                  tfrecord_type=tfrecord_type,
                                  logger=logger)

    dataset = tfrecord_reader.read(
        data_dir=tfrecord_dir,
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        max_sequence_size=max_sequence_size,
        batch_size=batch_size,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        parse_tfrecord=parse_tfrecord,
        file_io=file_io,
        logger=logger,
    )

    return dataset
Пример #20
0
def run_dataset_creation(
    data_dir: str = DATA_DIR,
    out_dir: str = OUT_DIR,
    feature_config_path: str = FEATURE_CONFIG,
    feature_highval: dict = FEATURE_HIGHVAL,
    feature_num_results: str = FEATURE_NUM_RESULTS,
    max_num_records: int = MAX_NUM_RECORDS,
    num_samples: int = NUM_SAMPLES,
    random_state: int = RANDOM_STATE,
):
    """
    1. Loads example data
    2. Builds specified synthetic data size by sampling from example data
    3. Adds catastrophic failures specifically
    4. For now, write out to CSV. In future could return df directly
    """
    # Setup logging
    file_io = LocalIO()
    logger: Logger = setup_logging(file_io)
    file_io.set_logger(logger)

    try:
        # Set seeds
        set_seeds(random_state)
        logger.info(
            "Set seeds with initial random state {}".format(random_state))

        # Load and parse feature config
        feature_config: FeatureConfig = FeatureConfig.get_instance(
            tfrecord_type=TFRecordTypeKey.SEQUENCE_EXAMPLE,
            feature_config_dict=file_io.read_yaml(feature_config_path),
            logger=logger,
        )
        logger.info("Feature config parsed and loaded")

        # Create output location
        file_io.make_directory(out_dir)
        out_file = os.path.join(
            out_dir, "synthetic_data_{}.csv".format(
                dt.datetime.now().strftime("%Y%m%d-%H%M%S")))

        # Build data
        seed_data = load_seed_data(data_dir, logger, file_io)

        df_synthetic = fill_data(
            seed_data,
            max_num_records,
            feature_config,
            feature_highval,
            feature_num_results,
            num_samples,
            logger,
        )
        file_io.write_df(df_synthetic, outfile=out_file, index=False)
        logger.info("Synthetic data created! Location: {}".format(out_file))
        return df_synthetic

    except Exception as e:
        logger.error("!!! Error creating synthetic data: !!!\n{}".format(
            str(e)))
        traceback.print_exc()
        return
Пример #21
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Args:
        feature_config: FeatureConfig object defining context and sequence features
        max_sequence_size: Maximum number of sequence per query. Used for padding.
        required_fields_only: Whether to only use required fields from the feature_config
        pad_sequence: Whether to pad sequence
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get(
                "required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value)
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Args:
            example_proto: tfrecord Example protobuf data

        Returns:
            features: parsed features extracted from the protobuf
            labels: parsed label extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto,
                                           features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name",
                                                 feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info),
                dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info,
                                                preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
Пример #22
0
    def __init__(
        self,
        feature_config: FeatureConfig,
        tfrecord_type: str,
        file_io: FileIO,
        scorer: Optional[ScorerBase] = None,
        metrics: List[Union[Type[kmetrics.Metric], str]] = [],
        optimizer: Optional[Optimizer] = None,
        model_file: Optional[str] = None,
        initialize_layers_dict: dict = {},
        freeze_layers_list: list = [],
        compile_keras_model: bool = False,
        output_name: str = "score",
        logger=None,
    ):
        """
        Constructor to instantiate a RelevanceModel that can be used for
        training and evaluating the search ML task

        Parameters
        ----------
        feature_config : `FeatureConfig` object
            FeatureConfig object that defines the features to be loaded in the dataset
            and the preprocessing functions to be applied to each of them
        tfrecord_type : {"example", "sequence_example"}
            Type of the TFRecord protobuf message used for TFRecordDataset
        file_io : `FileIO` object
            file I/O handler objects for reading and writing data
        scorer : `ScorerBase` object
            Scorer object that wraps an InteractionModel and converts
            input features into scores
        metrics : list
            List of keras Metric classes that will be used for evaluating the trained model
        optimizer : `Optimizer`
            Tensorflow keras optimizer to be used for training the model
        model_file : str, optional
            Path to pretrained model file to be loaded for evaluation or retraining
        initialize_layers_dict : dict, optional
            Dictionary of tensorflow layer names mapped to the path of pretrained weights
            Use this for transfer learning with pretrained weights
        freeze_layers_list : list, optional
            List of model layer names to be frozen
            Use this for freezing pretrained weights from other ml4ir models
        compile_keras_model : bool, optional
            Whether the keras model loaded from disk should be compiled
            with loss, metrics and an optimizer
        output_name : str, optional
            Name of the output tensorflow node that captures the score
        logger : `Logger`, optional
            logging handler for status messages
        """
        self.feature_config: FeatureConfig = feature_config
        self.logger: Logger = logger
        self.output_name = output_name
        self.scorer = scorer
        self.tfrecord_type = tfrecord_type
        self.file_io = file_io

        if scorer:
            self.max_sequence_size = scorer.interaction_model.max_sequence_size
        else:
            self.max_sequence_size = 0

        # Load/Build Model
        if model_file and not compile_keras_model:
            """
            If a model file is specified, load it without compiling into a keras model

            NOTE:
            This will allow the model to be only used for inference and
            cannot be used for retraining.
            """
            self.model: Model = self.load(model_file)
            self.is_compiled = False
        else:
            """
            Specify inputs to the model

            Individual input nodes are defined for each feature
            Each data point represents features for all records in a single query
            """
            inputs: Dict[str, Input] = feature_config.define_inputs()
            scores, train_features, metadata_features = scorer(inputs)

            # Create model with functional Keras API
            self.model = Model(inputs=inputs,
                               outputs={self.output_name: scores})
            self.model.output_names = [self.output_name]

            # Get loss fn
            loss_fn = scorer.loss.get_loss_fn(**metadata_features)

            # Get metric objects
            metrics_impl: List[Union[str, kmetrics.Metric]] = get_metrics_impl(
                metrics=metrics,
                feature_config=feature_config,
                metadata_features=metadata_features)

            # Compile model
            """
            NOTE:
            Related Github issue: https://github.com/tensorflow/probability/issues/519
            """
            self.model.compile(
                optimizer=optimizer,
                loss=loss_fn,
                metrics=metrics_impl,
                experimental_run_tf_function=False,
            )

            # Write model summary to logs
            model_summary = list()
            self.model.summary(print_fn=lambda x: model_summary.append(x))
            if self.logger:
                self.logger.info("\n".join(model_summary))

            if model_file:
                """
                If model file is specified, load the weights from the SavedModel

                NOTE:
                The architecture, loss and metrics of self.model need to
                be the same as the loaded SavedModel
                """
                self.load_weights(model_file)

            # Initialize layer weights
            for layer_name, layer_file in initialize_layers_dict.items():
                layer = self.model.get_layer(layer_name)
                layer.set_weights(
                    self.file_io.load_numpy_array(layer_file, unzip=True))
                self.logger.info("Setting {} weights from {}".format(
                    layer_name, layer_file))

            # Freeze layer weights
            for layer_name in freeze_layers_list:
                layer = self.model.get_layer(layer_name)
                layer.trainable = False
                self.logger.info("Freezing {} layer".format(layer_name))

            self.is_compiled = True
Пример #23
0
def make_example_parse_fn(
    feature_config: FeatureConfig,
    preprocessing_map: PreprocessingMap,
    required_fields_only: bool = False,
) -> tf.function:
    """
    Create a parse function using the Example features spec

    Parameters
    ----------
    feature_config : `FeatureConfig`
        FeatureConfig object defining context and sequence feature information
    preprocessing_map : `PreprocessingMap` object
        map of preprocessing feature functions
    required_fields_only : bool, optional
        Whether to only use required fields from the feature_config

    Returns
    -------
    `tf.function`
        Parsing function that takes in a serialized Example message and extracts a feature dictionary
    """

    features_spec = dict()

    for feature_info in feature_config.get_all_features():
        serving_info = feature_info["serving_info"]
        if not required_fields_only or serving_info.get("required", feature_info["trainable"]):
            feature_name = feature_info["name"]
            dtype = feature_info["dtype"]
            default_value = feature_config.get_default_value(feature_info)
            features_spec[feature_name] = io.FixedLenFeature(
                [], dtype, default_value=default_value
            )
    print(features_spec)

    @tf.function
    def _parse_example_fn(example_proto):
        """
        Parse the input `tf.Example` proto using the features_spec

        Parameters
        ----------
        example_proto : string
            serialized tfrecord Example protobuf message

        Returns
        -------
        features : dict
            parsed features as `tf.Tensor` objects extracted from the protobuf
        labels : `tf.Tensor`
            parsed label as a `tf.Tensor` object extracted from the protobuf
        """
        features = io.parse_single_example(serialized=example_proto, features=features_spec)

        features_dict = dict()

        # Process all features, including label.
        for feature_info in feature_config.get_all_features():
            feature_node_name = feature_info.get("node_name", feature_info["name"])

            default_tensor = tf.constant(
                value=feature_config.get_default_value(feature_info), dtype=feature_info["dtype"],
            )
            feature_tensor = features.get(feature_info["name"], default_tensor)

            feature_tensor = tf.expand_dims(feature_tensor, axis=0)

            feature_tensor = preprocess_feature(feature_tensor, feature_info, preprocessing_map)

            features_dict[feature_node_name] = feature_tensor

        labels = features_dict.pop(feature_config.get_label(key="name"))

        return features_dict, labels

    return _parse_example_fn
Пример #24
0
def define_tfrecord_signature(
    model,
    tfrecord_type: str,
    feature_config: FeatureConfig,
    preprocessing_keys_to_fns: dict,
    postprocessing_fn=None,
    required_fields_only: bool = True,
    pad_sequence: bool = False,
    max_sequence_size: int = 0,
):
    """
    Serving signature that wraps around the keras model trained as a RelevanceModel
    with a pre-step to parse TFRecords and apply additional feature preprocessing

    Parameters
    ----------
    model : keras Model
        Keras model object to be saved
    tfrecord_type : {"example", "sequence_example"}
        Type of the TFRecord protobuf that the saved model will be used on at serving time
    feature_config : `FeatureConfig` object
        FeatureConfig object that defines the input features into the model
        and the corresponding feature preprocesing functions to be used
        in the serving signature
    preprocessing_keys_to_fns : dict
        Dictionary mapping function names to tf.functions that should be saved in the preprocessing step of the tfrecord serving signature
    postprocessing_fn: function
        custom tensorflow compatible postprocessing function to be used at serving time.
        Saved as part of the postprocessing layer of the tfrecord serving signature
    required_fields_only: bool
        boolean value defining if only required fields
        need to be added to the tfrecord parsing function at serving time
    pad_sequence: bool, optional
        Value defining if sequences should be padded for SequenceExample proto inputs at serving time.
        Set this to False if you want to not handle padded scores.
    max_sequence_size : int, optional
        Maximum sequence size for SequenceExample protobuf
        The protobuf object will be padded or clipped to this value

    Returns
    -------
    `tf.function`
        Serving signature function that accepts a TFRecord string tensor and returns predictions
    """

    # TFRecord Signature
    # Define a parsing function for tfrecord protos
    inputs = feature_config.get_all_features(key="node_name",
                                             include_label=False)
    """
    NOTE:
    Setting pad_sequence=False for tfrecord signature as it is used at inference time
    and we do NOT want to score on padded records for performance reasons

    Limitation: This limits the serving signature to only run inference on a single query
    at a time given the current implementation. This is a tricky issue to fix because
    there is no real way to generate a dense tensor of ranking scores from different queries,
    as they might have varying number of records in each of them.

    Workaround: To infer on multiple queries, run predict() on each of the queries separately.
    """

    tfrecord_parse_fn = get_parse_fn(
        feature_config=feature_config,
        tfrecord_type=tfrecord_type,
        preprocessing_keys_to_fns=preprocessing_keys_to_fns,
        max_sequence_size=max_sequence_size,
        required_fields_only=required_fields_only,
        pad_sequence=pad_sequence,
    )

    dtype_map = dict()
    for feature_info in feature_config.get_all_features(include_label=False):
        feature_node_name = feature_info.get("node_name", feature_info["name"])
        dtype_map[feature_node_name] = feature_config.get_dtype(feature_info)

    # Define a serving signature for tfrecord
    @tf.function(input_signature=[TensorSpec(shape=[None], dtype=tf.string)])
    def _serve_tfrecord(protos):
        input_size = tf.shape(protos)[0]
        features_dict = {
            feature: TensorArray(dtype=dtype_map[feature], size=input_size)
            for feature in inputs
        }

        # Define loop index
        i = tf.constant(0)

        # Define loop condition
        def loop_condition(i, protos, features_dict):
            return tf.less(i, input_size)

        # Define loop body
        def loop_body(i, protos, features_dict):
            features, labels = tfrecord_parse_fn(protos[i])
            for feature, feature_val in features.items():
                features_dict[feature] = features_dict[feature].write(
                    i, feature_val)

            i += 1

            return i, protos, features_dict

        # Parse all SequenceExample protos to get features
        _, _, features_dict = tf.while_loop(
            cond=loop_condition,
            body=loop_body,
            loop_vars=[i, protos, features_dict],
        )

        # Convert TensorArray to tensor
        features_dict = {k: v.stack() for k, v in features_dict.items()}

        # Run the model to get predictions
        predictions = model(inputs=features_dict)

        # Define a post hook
        if postprocessing_fn:
            predictions = postprocessing_fn(predictions, features_dict)

        return predictions

    return _serve_tfrecord