def test_training_with_warm_start(self):
        """
        A warm-started fit should stay at the supplied coefficients, while a
        cold-started fit with the same one-iteration budget should not reach them.
        """
        # The trainer is capped at a single L-BFGS iteration and uses no L2 penalty.
        one_step_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0,
                                                           max_iter=1)
        # Both fits see exactly the same data; only theta_initial differs.
        fit_inputs = dict(X=self.x_train,
                          y=self.y_train,
                          weights=None,
                          offsets=None)

        # Warm start: training begins at self.custom_weights, which is expected
        # to be an already converged solution, so one more step should barely move it.
        warm_start_coefficients = one_step_trainer.fit(
            theta_initial=self.custom_weights, **fit_inputs)[0]
        self.assertAllClose(warm_start_coefficients,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        # Cold start: no initial value is supplied, so a single step should
        # leave the model far from the reference weights
        # (presumably produced by a much longer training run; confirm against setUp).
        cold_start_coefficients = one_step_trainer.fit(theta_initial=None,
                                                       **fit_inputs)[0]
        self.assertNotAllClose(cold_start_coefficients,
                               self.custom_weights,
                               msg='models are too close')
 def test_scoring_should_fail_if_not_trained(self):
     """
     Scoring with a freshly constructed trainer and no custom weights must raise.
     """
     # Replace the fixture trainer with a brand-new one that was never fitted
     untrained_trainer = BinaryLogisticRegressionTrainer()
     self.binary_lr_trainer = untrained_trainer
     # No custom_theta is supplied either, so the call is expected to raise
     with self.assertRaises(Exception):
         untrained_trainer.predict_proba(X=self.x_test, offsets=None)
예제 #3
0
 def __init__(self, consumer_id, regularize_bias=False, lambda_l2=1.0, tolerance=1e-8, num_of_curvature_pairs=10,
              num_iterations=100):
     """
     Store the consumer id, build the logistic regression trainer and reset the processed counter.

     :param consumer_id: stored as-is on ``self.consumer_id``
     :param regularize_bias: forwarded to the trainer as ``regularize_bias``
     :param lambda_l2: forwarded to the trainer as ``lambda_l2``
     :param tolerance: divided by machine epsilon and forwarded as ``precision``
     :param num_of_curvature_pairs: forwarded as ``num_lbfgs_corrections``
     :param num_iterations: forwarded as ``max_iter``
     """
     self.consumer_id = consumer_id
     # The trainer receives the tolerance rescaled by the float machine epsilon
     machine_epsilon = np.finfo(float).eps
     trainer_options = dict(
         regularize_bias=regularize_bias,
         lambda_l2=lambda_l2,
         precision=tolerance / machine_epsilon,
         num_lbfgs_corrections=num_of_curvature_pairs,
         max_iter=num_iterations,
     )
     self.lr_trainer = BinaryLogisticRegressionTrainer(**trainer_options)
     self.processed_counter = 0
 def test_scoring_should_succeed_if_custom_weights_provided(self):
     """
     Inference should succeed on untrained model if custom weights provided
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     validation_pred = self.binary_lr_trainer.predict_proba(
         X=self.x_test, offsets=None, custom_theta=self.custom_weights)
     # Expect one prediction per test row. A TestCase assertion (rather than a
     # bare assert) still runs under `python -O` and reports both values on failure.
     self.assertEqual(validation_pred.shape[0], self.x_test.shape[0])
    def setUp(self):
        """
        Load the pickled sample dataset, split it into train/test sets, create the
        trainers used by the tests and fit reference weights on the training split.
        """
        # Since grid machines may or may not have access to internet,
        # using a pickled instance of popular open-source breast cancer dataset for testing
        # NOTE(review): pickle.load can execute arbitrary code; this assumes the
        # fixture file is a trusted test resource.
        # The context manager guarantees the file handle is closed after loading.
        with open(sample_dataset_path + "/sklearn_data.p", "rb") as dataset_file:
            sample_dataset = pickle.load(dataset_file)
        # Fixed random_state keeps the split identical across runs
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            sample_dataset.data,
            sample_dataset.target,
            test_size=0.25,
            random_state=0)

        self.binary_lr_trainer = BinaryLogisticRegressionTrainer(max_iter=1000)
        self.binary_lr_trainer_without_bias = BinaryLogisticRegressionTrainer(
            max_iter=1000, has_intercept=False)
        # Reference weights: element [0][0] of what fit() returns on the training split
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0][0]
 def test_metrics_computation_should_fail_if_model_not_trained(self):
     """
     Computing metrics with a never-fitted trainer and no custom weights must raise.
     """
     # Replace the fixture trainer with a brand-new, unfitted one
     untrained_trainer = BinaryLogisticRegressionTrainer()
     self.binary_lr_trainer = untrained_trainer
     # No custom_theta is supplied, so the call is expected to raise
     with self.assertRaises(Exception):
         untrained_trainer.compute_metrics(
             X=self.x_test, y=self.y_test, offsets=None)
 def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
     """
     Metrics computation should succeed on untrained model if custom weights provided
     """
     # Reset trainer object
     self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
     validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                 y=self.y_test,
                                                                 offsets=None,
                                                                 custom_theta=self.custom_weights)
     # The reported AUC must lie in [0, 1]. TestCase assertions (rather than a
     # bare assert) still run under `python -O` and show the offending value.
     auc = validation_metrics['auc']
     self.assertGreaterEqual(auc, 0.0)
     self.assertLessEqual(auc, 1.0)
 def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
     """
     Inference should fail if custom weights are neither a Numpy ndarray nor a Scipy sparse matrix
     """
     # Start from a brand-new, unfitted trainer
     fresh_trainer = BinaryLogisticRegressionTrainer()
     self.binary_lr_trainer = fresh_trainer
     # The weights are converted to a plain Python list inside the call, which is
     # neither a numpy ndarray nor a scipy matrix, so scoring is expected to raise
     with self.assertRaises(Exception):
         fresh_trainer.predict_proba(
             X=self.x_test,
             offsets=None,
             custom_theta=self.custom_weights.tolist())
    def _predict(self, pool, input_path, metadata, tensor_metadata, output_file, schema_params, num_features,
                 metadata_file, model_weights, use_local_index):
        """
        Run inference for ``input_path`` through the worker pool and write the results as avro.

        The consumer built here is handed to ``self._pooled_action``; the iterables it
        returns are chained together and written to ``output_file``. Nothing is returned.
        """
        logger.info(f"Start inference for {input_path}.")
        # Trainer object used only for scoring
        scorer = BinaryLogisticRegressionTrainer(
            regularize_bias=True,
            lambda_l2=self.model_params.l2_reg_weight)
        consumer = InferenceJobConsumer(
            scorer, num_features, schema_params, use_local_index, name=input_path)

        results = self._pooled_action(
            pool, consumer, input_path, schema_params, model_weights,
            num_features, metadata_file, gen_index_map=False)

        # The output carries a weight field only when some feature is named
        # after schema_params.sample_weight
        has_weight = any(schema_params.sample_weight == feature.name
                         for feature in tensor_metadata.get_features())
        avro_schema = get_inference_output_avro_schema(
            metadata,
            has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
            schema_params=schema_params,
            has_weight=has_weight)
        output_schema = fastavro.parse_schema(avro_schema)

        # Flatten the per-job results into a single record stream before writing
        all_records = itertools.chain.from_iterable(results)
        batched_write_avro(all_records, output_file, output_schema)
        logger.info(f"Inference complete: {input_path}.")
예제 #10
0
 def _train(self, pool, input_path, metadata_file, model_weights: dict, num_features, schema_params, output_model_file):
     """
     Train models for ``input_path`` through the worker pool and merge them into ``model_weights``.

     The merged models are exported with ``self._save_model`` only when
     ``self.model_params.feature_file`` is set.

     :return: ``model_weights``, updated in place with the training results
     """
     logger.info(f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value.")
     params = self.model_params
     # The L-BFGS tolerance is rescaled by machine epsilon before being passed as `precision`
     trainer = BinaryLogisticRegressionTrainer(
         regularize_bias=params.regularize_bias,
         lambda_l2=params.l2_reg_weight,
         precision=params.lbfgs_tolerance / np.finfo(float).eps,
         num_lbfgs_corrections=params.num_of_lbfgs_curvature_pairs,
         max_iter=params.num_of_lbfgs_iterations)
     consumer = TrainingJobConsumer(trainer, name=input_path)
     trained = self._pooled_action(
         pool, consumer, input_path, schema_params, model_weights,
         num_features, metadata_file, params.enable_local_indexing)
     # Freshly trained entries overwrite prior ones with the same key
     model_weights.update(trained)
     logger.info(f"{len(model_weights)} models in total after training/refreshing.")

     # Dump results to model output directory, unless no feature file is configured.
     feature_file = params.feature_file
     if not feature_file:
         logger.info("Both feature file and avro model output directory required to export model. Skipping export")
         return model_weights
     self._save_model(
         output_model_file,
         model_coefficients=model_weights,
         num_features=num_features,
         feature_file=feature_file)
     return model_weights
예제 #11
0
 def _train(self, pool, input_path, metadata_file, model_weights: dict,
            num_features, schema_params, output_model_file):
     """
     Train models for ``input_path`` through the worker pool, merge them into
     ``model_weights`` and save the merged result with ``self._save_model``.

     :return: ``model_weights``, updated in place with the training results
     """
     logger.info(
         f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value."
     )
     params = self.model_params
     # The L-BFGS tolerance is rescaled by machine epsilon before being passed as `precision`
     trainer = BinaryLogisticRegressionTrainer(
         regularize_bias=params.regularize_bias,
         lambda_l2=params.l2_reg_weight,
         precision=params.lbfgs_tolerance / np.finfo(float).eps,
         num_lbfgs_corrections=params.num_of_lbfgs_curvature_pairs,
         max_iter=params.num_of_lbfgs_iterations,
         has_intercept=self.has_intercept)
     consumer = TrainingJobConsumer(
         trainer,
         name=input_path,
         job_queue=self.job_queue,
         enable_local_indexing=params.enable_local_indexing,
         sparsity_threshold=params.sparsity_threshold,
         variance_mode=params.random_effect_variance_mode)
     # The job queue must be empty before the pooled action starts
     assert self.job_queue.empty()
     trained = self._pooled_action(
         pool, consumer, input_path, schema_params, model_weights,
         num_features, metadata_file, params.enable_local_indexing)
     # The prior model is updated with the trained results (rather than replaced) so that
     # anything only the prior model knows about is carried over:
     # (1) extra features that are not present in the current datasets;
     # (2) extra model_ids that are not present in the current datasets.
     # This matters especially once incremental learning is implemented. It makes no
     # difference when the prior and current models share the same features / model_ids.
     # Revisit this when we start working on more advanced warm start.
     model_weights.update(trained)
     logger.info(
         f"{len(model_weights)} models in total after training/refreshing.")
     # Dump results to model output directory.
     self._save_model(
         output_model_file,
         model_coefficients=model_weights,
         num_features=num_features,
         feature_file=self.feature_file)
     return model_weights
    def _predict(self, inference_dataset, model_coefficients, metadata,
                 tensor_metadata, output_file, prediction_params):
        """
        Run inference by delegating to ``PhotonMLWriter.run_custom_scipy_re_inference``.

        Note that ``prediction_params`` is updated in place with ``self.model_params``
        before it is handed to the writer. Nothing is returned.
        """
        # Trainer object used only for scoring
        l2_weight = self.model_params[constants.L2_REG_WEIGHT]
        scorer = BinaryLogisticRegressionTrainer(regularize_bias=True,
                                                 lambda_l2=l2_weight)

        # Merge the model parameters into the prediction parameters, which then
        # serve as the writer's schema parameters
        prediction_params.update(self.model_params)
        writer = PhotonMLWriter(schema_params=prediction_params)

        # The writer performs the actual inference and output
        writer.run_custom_scipy_re_inference(
            inference_dataset=inference_dataset,
            model_coefficients=model_coefficients,
            lr_model=scorer,
            metadata=metadata,
            tensor_metadata=tensor_metadata,
            output_file=output_file)
예제 #13
0
    def _predict(self, pool, input_path, metadata, tensor_metadata,
                 output_file, schema_params, num_features, metadata_file,
                 model_weights):
        """
        Run inference for ``input_path`` through the worker pool and write the results as avro.

        The consumer built here is handed to ``self._pooled_action``; the iterables it
        returns are chained together and written to ``output_file``. Nothing is returned.
        """
        logger.info(f"Start inference for {input_path}.")
        # Trainer object used only for scoring
        scorer = BinaryLogisticRegressionTrainer(
            regularize_bias=True,
            lambda_l2=self.model_params.l2_reg_weight,
            has_intercept=self.has_intercept)
        consumer = InferenceJobConsumer(
            scorer,
            num_features,
            schema_params,
            name=input_path,
            job_queue=self.job_queue)
        # The job queue must be empty before the pooled action starts
        assert self.job_queue.empty()
        # Prediction does not use local indexing since it can work on sparse coefficients directly.
        results = self._pooled_action(
            pool, consumer, input_path, schema_params, model_weights,
            num_features, metadata_file, enable_local_indexing=False)

        # The output carries a weight field only when some feature is named
        # after schema_params.weight_column_name
        has_weight = any(schema_params.weight_column_name == feature.name
                         for feature in tensor_metadata.get_features())
        avro_schema = get_inference_output_avro_schema(
            metadata,
            has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
            schema_params=schema_params,
            has_weight=has_weight)
        output_schema = fastavro.parse_schema(avro_schema)

        # Flatten the per-job results into a single record stream before writing
        all_records = itertools.chain.from_iterable(results)
        batched_write_avro(all_records, output_file, output_schema)
        logger.info(f"Inference complete: {input_path}.")
    def test_fit_with_variance_computation_without_intercept(self):
        """
        Test fit when the variance computation is required but no intercept is used.

        The same synthetic dataset is fitted with the SIMPLE and FULL variance modes,
        and the resulting coefficients and variances are compared against the values
        returned by compute_coefficients_and_variance.
        """
        # Generate the dataset from a locally seeded generator so every run uses the
        # same data (reproducible failures) without touching numpy's global RNG state.
        rng = np.random.RandomState(0)
        num_features = 10
        num_samples = 100
        X = rng.randn(num_samples, num_features)
        y = rng.randint(2, size=num_samples)
        weights = rng.rand(num_samples)
        offsets = rng.randn(num_samples)
        lambda_l2 = 0.0
        binary_lr_trainer = BinaryLogisticRegressionTrainer(
            lambda_l2=lambda_l2,
            max_iter=1000,
            regularize_bias=True,
            has_intercept=False)
        # Reference values computed on the dense matrix
        expected_simple = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.SIMPLE,
            lambda_l2=lambda_l2,
            has_intercept=False)

        expected_full = compute_coefficients_and_variance(
            X=X,
            y=y,
            weights=weights,
            offsets=offsets,
            variance_mode=constants.FULL,
            lambda_l2=lambda_l2,
            has_intercept=False)

        # The trainer is exercised on the sparse (CSR) form of the same data
        actual_simple = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                              y=y,
                                              weights=weights,
                                              offsets=offsets,
                                              variance_mode=constants.SIMPLE)

        actual_full = binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                            y=y,
                                            weights=weights,
                                            offsets=offsets,
                                            variance_mode=constants.FULL)
        # fit() output is compared as: [0][0] -> coefficients, [1] -> variances
        self.assertAllClose(expected_simple[0],
                            actual_simple[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple mean mismatch')
        self.assertAllClose(expected_simple[1],
                            actual_simple[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='simple variance mismatch')
        self.assertAllClose(expected_full[0],
                            actual_full[0][0],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full mean mismatch')
        self.assertAllClose(expected_full[1],
                            actual_full[1],
                            rtol=1e-02,
                            atol=1e-02,
                            msg='full variance mismatch')