Example #1
File: pipeline.py Project: Ullimague/ml4ir
    def __init__(self, args: Namespace):
        self.args = args

        # Generate Run ID
        if len(self.args.run_id) > 0:
            self.run_id: str = self.args.run_id
        else:
            self.run_id = "-".join(
                [socket.gethostname(),
                 time.strftime("%Y%m%d-%H%M%S")])
        self.start_time = time.time()

        self.logs_dir: str = os.path.join(self.args.logs_dir, self.run_id)

        # Setup logging
        file_io.make_directory(self.logs_dir, clear_dir=True, log=None)
        self.logger: Logger = self.setup_logging()
        self.logger.info("Logging initialized. Saving logs to : {}".format(
            self.logs_dir))
        self.logger.info("Run ID: {}".format(self.run_id))
        self.logger.info("CLI args: \n{}".format(
            json.dumps(vars(self.args)).replace(",", "\n")))

        # Setup directories
        self.models_dir: str = os.path.join(self.args.models_dir, self.run_id)
        self.data_dir: str = self.args.data_dir
        file_io.make_directory(self.models_dir,
                               clear_dir=False,
                               log=self.logger)

        # Read/Parse model config YAML
        self.model_config = self._read_model_config(self.args.model_config)

        # Setup other arguments
        self.loss: str = self.args.loss
        self.scoring: str = self.args.scoring
        self.optimizer: str = self.args.optimizer
        if self.args.metrics[0] == "[":
            self.metrics: List[str] = ast.literal_eval(self.args.metrics)
        else:
            self.metrics = [self.args.metrics]
        self.data_format: str = self.args.data_format

        # Validate args
        self.validate_args()

        # Set random seeds
        self.set_seeds()

        # Load and parse feature config
        self.feature_config: FeatureConfig = parse_config(
            self.args.feature_config, logger=self.logger)
        self.logger.info("Feature config parsed and loaded")

        # Finished initialization
        self.logger.info("Ranking Pipeline successfully initialized!")
Example #2
    def run_default_pipeline(self, data_dir: str, data_format: str, feature_config_path: str):
        """Train a model with the default set of args"""
        feature_config: FeatureConfig = parse_config(feature_config_path)

        self.args.metrics = ["MRR"]

        ranking_dataset = RankingDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            batch_size=self.args.batch_size,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            logger=self.logger,
        )
        ranking_model = RankingModel(
            model_config=copy.deepcopy(self.model_config),
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            metrics_keys=self.args.metrics,
            optimizer_key=self.args.optimizer,
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            model_file=self.args.model_file,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            compute_intermediate_stats=self.args.compute_intermediate_stats,
            logger=self.logger,
        )

        ranking_model.fit(dataset=ranking_dataset, num_epochs=1, models_dir=self.output_dir)

        loss = dict(
            zip(
                ranking_model.model.metrics_names,
                ranking_model.model.evaluate(ranking_dataset.test),
            )
        )["loss"]
        new_MRR = ranking_model.evaluate(
            test_dataset=ranking_dataset.test, logs_dir=self.args.logs_dir,
        )[0]["new_MRR"]

        return loss, new_MRR
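The loss lookup above relies on a common Keras idiom: model.evaluate() returns a flat list of values in the same order as model.metrics_names, so zipping the two produces a name-to-value mapping. A self-contained illustration of the pattern, using made-up names and numbers rather than output from a real model:

# model.metrics_names and model.evaluate(...) would come from a compiled
# Keras model; placeholder values are used here.
metrics_names = ["loss", "categorical_accuracy"]
evaluation_results = [0.42, 0.87]

results = dict(zip(metrics_names, evaluation_results))
assert results["loss"] == 0.42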
Example #3
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Train a model with the default set of args"""
        feature_config: FeatureConfig = parse_config(feature_config_path)

        self.args.metrics = ["categorical_accuracy", "MRR", "ACR"]

        ranking_dataset = RankingDataset(
            data_dir=data_dir,
            data_format=data_format,
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            batch_size=self.args.batch_size,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            logger=self.logger,
        )
        ranking_model = RankingModel(
            model_config=self.model_config,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            metrics_keys=self.args.metrics,
            optimizer_key=self.args.optimizer,
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            model_file=self.args.model_file,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            compute_intermediate_stats=self.args.compute_intermediate_stats,
            logger=self.logger,
        )

        overall_metrics, _ = ranking_model.evaluate(
            test_dataset=ranking_dataset.test,
            logs_dir=self.args.logs_dir,
        )

        return overall_metrics.to_dict()
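The returned overall_metrics appears to be a pandas object (it exposes to_dict()), so the caller receives a plain metric-name-to-value mapping. A sketch of what that conversion looks like, assuming a pandas Series and using made-up values:

import pandas as pd

# Placeholder metric values; the real ones come from RankingModel.evaluate()
overall_metrics = pd.Series({"categorical_accuracy": 0.83, "MRR": 0.61, "ACR": 1.9})
print(overall_metrics.to_dict())
# {'categorical_accuracy': 0.83, 'MRR': 0.61, 'ACR': 1.9}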
Example #4
    def run_default_pipeline(self, data_dir: str, data_format: str,
                             feature_config_path: str):
        """Train a model with the default set of args"""
        feature_config = features.parse_config(feature_config_path)

        self.args.metrics = ["categorical_accuracy"]

        ranking_dataset = RankingDataset(
            data_dir=data_dir,
            data_format=data_format,
            features=feature_config,
            max_num_records=self.args.max_num_records,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            batch_size=self.args.batch_size,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            logger=self.logger,
        )
        ranking_model = RankingModel(
            architecture_key=self.args.architecture,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            metrics_keys=self.args.metrics,
            optimizer_key=self.args.optimizer,
            features=feature_config,
            max_num_records=self.args.max_num_records,
            model_file=self.args.model_file,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            compute_intermediate_stats=self.args.compute_intermediate_stats,
            logger=self.logger,
        )

        ranking_model.fit(dataset=ranking_dataset,
                          num_epochs=1,
                          models_dir=self.output_dir)

        loss, accuracy = ranking_model.evaluate(ranking_dataset.test)

        return loss, accuracy
Example #5
    def get_ranking_dataset(self, data_dir: str, data_format: str,
                            feature_config_path: str):
        """Create a RankingDataset with the default set of args"""

        feature_config = features.parse_config(feature_config_path)

        ranking_dataset = RankingDataset(
            data_dir=data_dir,
            data_format=data_format,
            features=feature_config,
            max_num_records=self.args.max_num_records,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            batch_size=self.args.batch_size,
            train_pcent_split=self.args.train_pcent_split,
            val_pcent_split=self.args.val_pcent_split,
            test_pcent_split=self.args.test_pcent_split,
            logger=self.logger,
        )

        return ranking_dataset
Example #6
def main(argv):
    """Convert CSV files into tfrecord SequenceExample files"""

    # Define script arguments
    parser = ArgumentParser(
        description="Process arguments for ml4ir ranking pipeline.")

    parser.add_argument("--csv_dir",
                        type=str,
                        default=None,
                        help="Path to the data directory containing CSV files")
    parser.add_argument("--csv_file",
                        type=str,
                        default=None,
                        help="Path to the CSV file to convert")
    parser.add_argument(
        "--tfrecord_dir",
        type=str,
        default=None,
        help="Path to the output directory to write TFRecord files",
    )
    parser.add_argument(
        "--tfrecord_file",
        type=str,
        default=None,
        help="Path to the output file to write TFRecord data",
    )
    parser.add_argument(
        "--feature_config",
        type=str,
        default=None,
        help="Path to feature config JSON file or feature config JSON string",
    )
    parser.add_argument(
        "--convert_single_files",
        type=bool,
        default=False,
        help="Whether to convert each CSV file individually"
        "All occurences of a query key should be within a single file",
    )
    args = parser.parse_args(argv)

    # Get all CSV files to be converted
    if args.csv_dir:
        csv_files: List[str] = glob.glob(os.path.join(args.csv_dir, "*.csv"))
    else:
        csv_files: List[str] = [args.csv_file]

    feature_config: FeatureConfig = parse_config(args.feature_config)

    # Setup logging
    logger: Logger = setup_logging()

    # Convert to TFRecord SequenceExample protobufs and save
    file_count = 0
    if args.convert_single_files:
        # Convert each CSV file individually - better performance
        for csv_file in csv_files:
            if args.tfrecord_dir:
                tfrecord_file: str = os.path.join(
                    args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
            else:
                tfrecord_file: str = args.tfrecord_file

            write(
                csv_files=[csv_file],
                tfrecord_file=tfrecord_file,
                feature_config=feature_config,
                logger=logger,
            )

            file_count += 1
    else:
        # Convert all CSV files at once - expensive groupby operation
        if args.tfrecord_dir:
            tfrecord_file: str = os.path.join(
                args.tfrecord_dir, "file_{}.tfrecord".format(file_count))
        else:
            tfrecord_file: str = args.tfrecord_file

        write(
            csv_files=csv_files,
            tfrecord_file=tfrecord_file,
            feature_config=feature_config,
            logger=logger,
        )
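Because main() takes argv as a parameter, the converter can be invoked programmatically as well as from the shell. A hedged usage sketch; the paths below are placeholders, not paths from the project:

# Hypothetical invocation with placeholder paths
main([
    "--csv_dir", "data/csv",
    "--tfrecord_dir", "data/tfrecord",
    "--feature_config", "data/feature_config.json",
    "--convert_single_files", "True",
])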
Example #7
    def test_model_serving(self):
        """
        Train a simple model and test serving flow by loading the SavedModel
        """

        # Test model training on TFRecord SequenceExample data
        data_dir = os.path.join(self.root_data_dir, "tfrecord")
        feature_config_path = os.path.join(self.root_data_dir, "tfrecord",
                                           self.feature_config_fname)

        feature_config: FeatureConfig = parse_config(feature_config_path)

        self.args.metrics = ["categorical_accuracy"]

        def get_dataset(parse_tfrecord):
            return RankingDataset(
                data_dir=data_dir,
                data_format="tfrecord",
                feature_config=feature_config,
                max_num_records=self.args.max_num_records,
                loss_key=self.args.loss,
                scoring_key=self.args.scoring,
                batch_size=self.args.batch_size,
                train_pcent_split=self.args.train_pcent_split,
                val_pcent_split=self.args.val_pcent_split,
                test_pcent_split=self.args.test_pcent_split,
                parse_tfrecord=parse_tfrecord,
                logger=self.logger,
            )

        # Get raw TFRecord dataset
        raw_dataset = get_dataset(parse_tfrecord=False)

        # Parse the raw TFRecord dataset
        parsed_dataset = get_dataset(parse_tfrecord=True)

        model = RankingModel(
            model_config=self.model_config,
            loss_key=self.args.loss,
            scoring_key=self.args.scoring,
            metrics_keys=self.args.metrics,
            optimizer_key=self.args.optimizer,
            feature_config=feature_config,
            max_num_records=self.args.max_num_records,
            model_file=self.args.model_file,
            learning_rate=self.args.learning_rate,
            learning_rate_decay=self.args.learning_rate_decay,
            learning_rate_decay_steps=self.args.learning_rate_decay_steps,
            compute_intermediate_stats=self.args.compute_intermediate_stats,
            logger=self.logger,
        )

        model.fit(dataset=parsed_dataset,
                  num_epochs=1,
                  models_dir=self.output_dir)

        model.save(models_dir=self.args.models_dir)

        # Load SavedModel and get the right serving signature
        default_model = kmodels.load_model(os.path.join(
            self.output_dir, "final", "default"),
                                           compile=False)
        assert ServingSignatureKey.DEFAULT in default_model.signatures
        default_signature = default_model.signatures[
            ServingSignatureKey.DEFAULT]

        tfrecord_model = kmodels.load_model(os.path.join(
            self.output_dir, "final", "tfrecord"),
                                            compile=False)
        assert ServingSignatureKey.TFRECORD in tfrecord_model.signatures
        tfrecord_signature = tfrecord_model.signatures[
            ServingSignatureKey.TFRECORD]

        # Fetch a single batch for testing
        sequence_example_protos = next(iter(raw_dataset.test))
        parsed_sequence_examples = {
            k: tf.cast(v, tf.float32)
            for k, v in next(iter(parsed_dataset.test))[0].items()
        }
        parsed_dataset_batch = parsed_dataset.test.take(1)

        # Use the loaded serving signatures for inference
        model_predictions = model.predict(
            parsed_dataset_batch)["new_score"].values
        default_signature_predictions = default_signature(
            **parsed_sequence_examples)["ranking_scores"]
        tfrecord_signature_predictions = tfrecord_signature(
            sequence_example_protos=sequence_example_protos)["ranking_scores"]

        def _flatten_records(x):
            """Collapse first two dimensions of a tensor -> [batch_size, max_num_records]"""
            return tf.reshape(x, tf.concat([[-1], tf.shape(x)[2:]], axis=0))

        def _filter_records(x, mask):
            """
            Filter out records that were padded in each query

            Input shape: [batch_size * max_num_records, ...]

            Output shape: [num_unpadded_records, ...]
            """
            return tf.squeeze(tf.gather_nd(x, tf.where(tf.not_equal(mask, 0))))

        # Get mask for padded values
        mask = _flatten_records(parsed_sequence_examples["mask"])

        # Flatten scores to each record and filter out scores from padded records
        default_signature_predictions = _filter_records(
            _flatten_records(default_signature_predictions), mask)
        tfrecord_signature_predictions = _filter_records(
            _flatten_records(tfrecord_signature_predictions), mask)

        # Compare the scores from the different versions of the model
        assert np.isclose(
            model_predictions,
            default_signature_predictions,
            rtol=0.01,
        ).all()

        assert np.isclose(
            model_predictions,
            tfrecord_signature_predictions,
            rtol=0.01,
        ).all()

        assert np.isclose(
            default_signature_predictions,
            tfrecord_signature_predictions,
            rtol=0.01,
        ).all()
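The flatten-and-filter helpers are what make the scores from the in-memory model and the two serving signatures comparable record by record: scores are first collapsed across the query dimension, then records that exist only as padding are dropped using the mask. A small self-contained illustration with dummy tensors; shapes and values are chosen only for this example:

import tensorflow as tf

# Two queries with up to three records each; the last record of the second
# query is padding (mask == 0).
scores = tf.constant([[0.9, 0.5, 0.1], [0.8, 0.3, 0.0]])
mask = tf.constant([[1, 1, 1], [1, 1, 0]])

# Collapse [batch_size, max_num_records] -> [batch_size * max_num_records]
flat_scores = tf.reshape(scores, [-1])
flat_mask = tf.reshape(mask, [-1])

# Keep only the scores belonging to real (unpadded) records
kept = tf.squeeze(tf.gather_nd(flat_scores, tf.where(tf.not_equal(flat_mask, 0))))
print(kept.numpy())  # [0.9 0.5 0.1 0.8 0.3]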