def test_training_with_warm_start(self):
    """
    Check that ``fit`` honours a caller-supplied initial model.

    A trainer limited to a single L-BFGS iteration is fitted twice on the training split:
    once starting from ``self.custom_weights`` (warm start) and once with no initial model
    (cold start). The warm-started result must match ``self.custom_weights``; the
    cold-started result must not.
    """
    # Only one L-BFGS iteration is allowed, with lambda_l2 set to 0.0.
    single_step_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0, max_iter=1)
    training_inputs = dict(X=self.x_train, y=self.y_train, weights=None, offsets=None)

    # Warm start: the supplied weights are expected to be converged already
    # (presumably produced by the fixture - confirm), so one more step should not move them.
    warm_fit = single_step_trainer.fit(theta_initial=self.custom_weights, **training_inputs)
    coefficients_warm_start = warm_fit[0]
    self.assertAllClose(coefficients_warm_start,
                        self.custom_weights,
                        rtol=_TOLERANCE,
                        atol=_TOLERANCE,
                        msg='models mismatch')

    # Cold start: a single step from the default initial point should still be
    # far away from a model that was trained for many more iterations.
    cold_fit = single_step_trainer.fit(theta_initial=None, **training_inputs)
    coefficients_cold_start = cold_fit[0]
    self.assertNotAllClose(coefficients_cold_start,
                           self.custom_weights,
                           msg='models are too close')
def __init__(self, consumer_id, regularize_bias=False, lambda_l2=1.0, tolerance=1e-8, num_of_curvature_pairs=10,
             num_iterations=100):
    """
    Store the consumer ID and build the logistic regression trainer used by this consumer.

    :param consumer_id: Identifier kept on the instance as ``consumer_id``
    :param regularize_bias: Passed to the trainer as ``regularize_bias``
    :param lambda_l2: Passed to the trainer as ``lambda_l2``
    :param tolerance: Convergence tolerance; divided by machine epsilon before being passed as ``precision``
    :param num_of_curvature_pairs: Passed to the trainer as ``num_lbfgs_corrections``
    :param num_iterations: Passed to the trainer as ``max_iter``
    """
    # The trainer receives the tolerance as a multiple of float machine epsilon
    # (presumably the SciPy L-BFGS-B ``factr`` convention - TODO confirm).
    machine_epsilon = np.finfo(float).eps
    trainer_settings = {
        'regularize_bias': regularize_bias,
        'lambda_l2': lambda_l2,
        'precision': tolerance / machine_epsilon,
        'num_lbfgs_corrections': num_of_curvature_pairs,
        'max_iter': num_iterations,
    }

    self.consumer_id = consumer_id
    self.lr_trainer = BinaryLogisticRegressionTrainer(**trainer_settings)
    # Number of jobs handled so far by this consumer
    self.processed_counter = 0
def test_scoring_should_fail_if_not_trained(self):
    """
    Scoring with a trainer that has never been fitted must raise
    """
    # Replace the trainer on the test case with a brand-new, unfitted instance
    self.binary_lr_trainer = unfitted_trainer = BinaryLogisticRegressionTrainer()

    # No custom weights are supplied, so there is no model to score with
    with self.assertRaises(Exception):
        unfitted_trainer.predict_proba(X=self.x_test, offsets=None)
def test_scoring_should_succeed_if_custom_weights_provided(self):
    """
    Inference should succeed on an untrained model if custom weights are provided
    """
    # Reset trainer object so that it holds no fitted model of its own
    self.binary_lr_trainer = BinaryLogisticRegressionTrainer()

    validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                           offsets=None,
                                                           custom_theta=self.custom_weights)

    # One prediction per test row. A unittest assertion is used instead of a bare
    # ``assert`` so the check survives ``python -O`` and reports both values on failure.
    self.assertEqual(validation_pred.shape[0], self.x_test.shape[0])
def test_metrics_computation_should_fail_if_model_not_trained(self):
    """
    Computing metrics with a trainer that has never been fitted must raise
    """
    # Replace the trainer on the test case with a brand-new, unfitted instance
    self.binary_lr_trainer = unfitted_trainer = BinaryLogisticRegressionTrainer()

    # No custom weights are supplied, so there is no model to evaluate
    with self.assertRaises(Exception):
        unfitted_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)
def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
    """
    Metrics computation should succeed on an untrained model if custom weights are provided
    """
    # Reset trainer object so that it holds no fitted model of its own
    self.binary_lr_trainer = BinaryLogisticRegressionTrainer()

    validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                y=self.y_test,
                                                                offsets=None,
                                                                custom_theta=self.custom_weights)

    # AUC must lie in [0, 1]. unittest assertions are used instead of a bare ``assert``
    # so the check survives ``python -O`` and reports the offending value on failure.
    auc = validation_metrics['auc']
    self.assertGreaterEqual(auc, 0.0)
    self.assertLessEqual(auc, 1.0)
def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
    """
    Inference should fail if custom weights are neither a Numpy ndarray nor a Scipy sparse matrix
    """
    # Replace the trainer on the test case with a brand-new, unfitted instance
    self.binary_lr_trainer = unfitted_trainer = BinaryLogisticRegressionTrainer()

    with self.assertRaises(Exception):
        # A plain Python list is neither of the supported weight containers
        unfitted_trainer.predict_proba(X=self.x_test,
                                       offsets=None,
                                       custom_theta=self.custom_weights.tolist())
def setUp(self):
    """
    Load the pickled sample dataset, split it into train/test parts, and fit a baseline model

    Populates ``x_train``, ``x_test``, ``y_train``, ``y_test``, ``binary_lr_trainer`` and
    ``custom_weights`` (the first element of the ``fit`` result) on the test case.
    """
    # Since grid machines may or may not have access to internet, a pickled instance of the
    # popular open-source breast cancer dataset is used for testing.
    # The context manager closes the file handle; ``pickle.load(open(...))`` left it open.
    with open(sample_dataset_path + "/sklearn_data.p", "rb") as dataset_file:
        sample_dataset = pickle.load(dataset_file)

    # Fixed random_state keeps the 75/25 split identical across runs
    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(sample_dataset.data,
                                                                            sample_dataset.target,
                                                                            test_size=0.25,
                                                                            random_state=0)
    self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
    self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                     y=self.y_train,
                                                     weights=None,
                                                     offsets=None)[0]
def _predict(self, pool, input_path, metadata, tensor_metadata, output_file, schema_params, num_features,
             metadata_file, model_weights, use_local_index):
    """
    Run pooled inference for one input path and write the results to an avro file.

    :param pool: Passed through to ``_pooled_action``
    :param input_path: Input location; also used as the consumer name and in log messages
    :param metadata: Passed to ``get_inference_output_avro_schema``
    :param tensor_metadata: Its features are scanned for the sample weight column
    :param output_file: Destination handed to ``batched_write_avro``
    :param schema_params: Schema parameters; ``sample_weight`` names the weight column
    :param num_features: Passed to the consumer and to ``_pooled_action``
    :param metadata_file: Passed through to ``_pooled_action``
    :param model_weights: Passed through to ``_pooled_action``
    :param use_local_index: Passed to the inference consumer
    :return: None
    """
    logger.info(f"Start inference for {input_path}.")

    # Trainer object used purely as the scoring model
    inference_model = BinaryLogisticRegressionTrainer(regularize_bias=True,
                                                      lambda_l2=self.model_params.l2_reg_weight)
    consumer = InferenceJobConsumer(inference_model, num_features, schema_params, use_local_index,
                                    name=input_path)
    results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features,
                                  metadata_file, gen_index_map=False)

    # The output carries a weight field only if some feature is named after the sample weight column
    has_weight = False
    for feature in tensor_metadata.get_features():
        if schema_params.sample_weight == feature.name:
            has_weight = True
            break

    raw_schema = get_inference_output_avro_schema(
        metadata,
        has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
        schema_params=schema_params,
        has_weight=has_weight)
    output_schema = fastavro.parse_schema(raw_schema)

    # ``results`` is an iterable of iterables; flatten it lazily before writing
    flattened_results = itertools.chain.from_iterable(results)
    batched_write_avro(flattened_results, output_file, output_schema)
    logger.info(f"Inference complete: {input_path}.")
class TrainingJobConsumer:
    """
    Callable worker that takes entity-based random effect training jobs off a shared queue,
    fits one model per job, and stores each result under the job's entity ID
    """
    # A progress message is logged each time this many jobs have been completed
    _CONSUMER_LOGGING_FREQUENCY = 1000

    def __init__(self, consumer_id, regularize_bias=False, lambda_l2=1.0, tolerance=1e-8,
                 num_of_curvature_pairs=10, num_iterations=100):
        """
        Store the consumer ID and build the logistic regression trainer used by this consumer.

        :param consumer_id: Identifier used in this consumer's log messages
        :param regularize_bias: Passed to the trainer as ``regularize_bias``
        :param lambda_l2: Passed to the trainer as ``lambda_l2``
        :param tolerance: Convergence tolerance; divided by machine epsilon before being passed as ``precision``
        :param num_of_curvature_pairs: Passed to the trainer as ``num_lbfgs_corrections``
        :param num_iterations: Passed to the trainer as ``max_iter``
        """
        # The trainer receives the tolerance as a multiple of float machine epsilon
        # (presumably the SciPy L-BFGS-B ``factr`` convention - TODO confirm).
        machine_epsilon = np.finfo(float).eps
        trainer_settings = {
            'regularize_bias': regularize_bias,
            'lambda_l2': lambda_l2,
            'precision': tolerance / machine_epsilon,
            'num_lbfgs_corrections': num_of_curvature_pairs,
            'max_iter': num_iterations,
        }

        self.consumer_id = consumer_id
        self.lr_trainer = BinaryLogisticRegressionTrainer(**trainer_settings)
        # Number of jobs handled so far by this consumer
        self.processed_counter = 0

    def __call__(self, training_job_queue, training_results_dict, get_timeout_in_seconds=300):
        """
        Consume training jobs from the shared queue until a ``None`` sentinel is received

        :param training_job_queue: Shared multiprocessing job queue
        :param training_results_dict: Shared dictionary that receives one TrainingResult per entity ID
        :param get_timeout_in_seconds: Timeout (in seconds) for each blocking ``get`` on the job queue
        :return: None
        """
        logger.info("Kicking off training job consumer with ID : {}".format(self.consumer_id))

        def fetch_next_job():
            # Blocking get with a timeout; a timeout error from the queue is not handled here
            return training_job_queue.get(True, get_timeout_in_seconds)

        job = fetch_next_job()
        # ``None`` is the producer's signal that no more jobs will arrive
        while job is not None:
            fit_output = self.lr_trainer.fit(X=job.X, y=job.y, weights=job.weights, offsets=job.offsets)

            # Only the first element of the fit output is kept, keyed by entity ID
            training_results_dict[job.entity_id] = TrainingResult(
                training_result=fit_output[0],
                unique_global_indices=job.unique_global_indices)

            self.processed_counter += 1
            if self.processed_counter % TrainingJobConsumer._CONSUMER_LOGGING_FREQUENCY == 0:
                logger.info("Consumer job {} has completed {} training jobs so far".format(
                    self.consumer_id, self.processed_counter))

            job = fetch_next_job()

        logger.info("Terminating consumer {}".format(self.consumer_id))
def _train(self, pool, input_path, metadata_file, model_weights: dict, num_features, schema_params,
           output_model_file):
    """
    Train (or refresh) models through the worker pool, merge them into ``model_weights``, and export them.

    :param pool: Passed through to ``_pooled_action``
    :param input_path: Training data location; also used as the consumer name
    :param metadata_file: Passed through to ``_pooled_action``
    :param model_weights: Existing models; updated in place with the newly trained ones
    :param num_features: Passed to ``_pooled_action`` and ``_save_model``
    :param schema_params: Passed through to ``_pooled_action``
    :param output_model_file: Destination handed to ``_save_model``
    :return: The updated ``model_weights`` dictionary
    """
    logger.info(f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value.")

    params = self.model_params
    trainer_settings = dict(
        regularize_bias=params.regularize_bias,
        lambda_l2=params.l2_reg_weight,
        # Tolerance is handed over as a multiple of float machine epsilon
        precision=params.lbfgs_tolerance / np.finfo(float).eps,
        num_lbfgs_corrections=params.num_of_lbfgs_curvature_pairs,
        max_iter=params.num_of_lbfgs_iterations)
    lr_model = BinaryLogisticRegressionTrainer(**trainer_settings)

    consumer = TrainingJobConsumer(lr_model, name=input_path)
    results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features,
                                  metadata_file, params.enable_local_indexing)

    # Newly trained models overwrite prior ones that share a key; other prior models are kept
    model_weights.update(results)
    logger.info(f"{len(model_weights)} models in total after training/refreshing.")

    # Export is skipped when no feature file is configured
    feature_file = params.feature_file
    if not feature_file:
        logger.info("Both feature file and avro model output directory required to export model. Skipping export")
        return model_weights

    self._save_model(output_model_file,
                     model_coefficients=model_weights,
                     num_features=num_features,
                     feature_file=feature_file)
    return model_weights
def _train(self, pool, input_path, metadata_file, model_weights: dict, num_features, schema_params,
           output_model_file):
    """
    Train (or refresh) models through the worker pool, merge them into ``model_weights``, and save them.

    :param pool: Passed through to ``_pooled_action``
    :param input_path: Training data location; also used as the consumer name
    :param metadata_file: Passed through to ``_pooled_action``
    :param model_weights: Prior models; updated in place with the newly trained ones
    :param num_features: Passed to ``_pooled_action`` and ``_save_model``
    :param schema_params: Passed through to ``_pooled_action``
    :param output_model_file: Destination handed to ``_save_model``
    :return: The updated ``model_weights`` dictionary
    """
    logger.info(
        f"Start training with {f'loaded {len(model_weights)} previous models' if model_weights else 'zeros'} as the model initial value."
    )

    params = self.model_params
    trainer_settings = dict(
        regularize_bias=params.regularize_bias,
        lambda_l2=params.l2_reg_weight,
        # Tolerance is handed over as a multiple of float machine epsilon
        precision=params.lbfgs_tolerance / np.finfo(float).eps,
        num_lbfgs_corrections=params.num_of_lbfgs_curvature_pairs,
        max_iter=params.num_of_lbfgs_iterations,
        has_intercept=self.has_intercept)
    lr_model = BinaryLogisticRegressionTrainer(**trainer_settings)

    consumer = TrainingJobConsumer(lr_model,
                                   name=input_path,
                                   job_queue=self.job_queue,
                                   enable_local_indexing=params.enable_local_indexing,
                                   sparsity_threshold=params.sparsity_threshold,
                                   variance_mode=params.random_effect_variance_mode)

    # The shared queue must not hold leftovers from an earlier run
    assert self.job_queue.empty()
    results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features,
                                  metadata_file, params.enable_local_indexing)

    # Merging into the prior models (rather than replacing them) keeps, per the original
    # author's note, two kinds of prior content that the current data may not cover:
    #   (1) extra features present only in the prior model;
    #   (2) extra model IDs present only in the prior model.
    # This matters mainly for incremental learning and is unnecessary when prior and
    # current models share the same features / model IDs. To be revisited with more
    # advanced warm start.
    model_weights.update(results)
    logger.info(f"{len(model_weights)} models in total after training/refreshing.")

    # Persist the merged models to the model output location
    self._save_model(output_model_file,
                     model_coefficients=model_weights,
                     num_features=num_features,
                     feature_file=self.feature_file)
    return model_weights
def _predict(self, inference_dataset, model_coefficients, metadata, tensor_metadata, output_file,
             prediction_params):
    """
    Run random effect inference by delegating to a PhotonMLWriter.

    :param inference_dataset: Dataset to score; passed through to the writer
    :param model_coefficients: Model coefficients; passed through to the writer
    :param metadata: Passed through to the writer
    :param tensor_metadata: Passed through to the writer
    :param output_file: Destination passed through to the writer
    :param prediction_params: Prediction parameters; NOTE that this mapping is updated in place
        with ``self.model_params`` before being used as the writer's schema parameters
    :return: None
    """
    # Trainer object that serves as the scoring model
    l2_weight = self.model_params[constants.L2_REG_WEIGHT]
    lr_trainer = BinaryLogisticRegressionTrainer(regularize_bias=True, lambda_l2=l2_weight)

    # Model parameters are merged into the caller's prediction parameters
    prediction_params.update(self.model_params)
    inference_runner = PhotonMLWriter(schema_params=prediction_params)

    # The writer performs the actual inference and output
    inference_arguments = {
        'inference_dataset': inference_dataset,
        'model_coefficients': model_coefficients,
        'lr_model': lr_trainer,
        'metadata': metadata,
        'tensor_metadata': tensor_metadata,
        'output_file': output_file,
    }
    inference_runner.run_custom_scipy_re_inference(**inference_arguments)
def _predict(self, pool, input_path, metadata, tensor_metadata, output_file, schema_params, num_features,
             metadata_file, model_weights):
    """
    Run pooled inference for one input path and write the results to an avro file.

    :param pool: Passed through to ``_pooled_action``
    :param input_path: Input location; also used as the consumer name and in log messages
    :param metadata: Passed to ``get_inference_output_avro_schema``
    :param tensor_metadata: Its features are scanned for the weight column
    :param output_file: Destination handed to ``batched_write_avro``
    :param schema_params: Schema parameters; ``weight_column_name`` names the weight column
    :param num_features: Passed to the consumer and to ``_pooled_action``
    :param metadata_file: Passed through to ``_pooled_action``
    :param model_weights: Passed through to ``_pooled_action``
    :return: None
    """
    logger.info(f"Start inference for {input_path}.")

    # Trainer object used purely as the scoring model
    inference_model = BinaryLogisticRegressionTrainer(regularize_bias=True,
                                                      lambda_l2=self.model_params.l2_reg_weight,
                                                      has_intercept=self.has_intercept)
    consumer = InferenceJobConsumer(inference_model, num_features, schema_params,
                                    name=input_path, job_queue=self.job_queue)

    # The shared queue must not hold leftovers from an earlier run
    assert self.job_queue.empty()

    # Local indexing is disabled: prediction can work on sparse coefficients directly
    results = self._pooled_action(pool, consumer, input_path, schema_params, model_weights, num_features,
                                  metadata_file, enable_local_indexing=False)

    # The output carries a weight field only if some feature is named after the weight column
    has_weight = False
    for feature in tensor_metadata.get_features():
        if schema_params.weight_column_name == feature.name:
            has_weight = True
            break

    raw_schema = get_inference_output_avro_schema(
        metadata,
        has_logits_per_coordinate=True,  # Always true for custom scipy-based LR
        schema_params=schema_params,
        has_weight=has_weight)
    output_schema = fastavro.parse_schema(raw_schema)

    # ``results`` is an iterable of iterables; flatten it lazily before writing
    flattened_results = itertools.chain.from_iterable(results)
    batched_write_avro(flattened_results, output_file, output_schema)
    logger.info(f"Inference complete: {input_path}.")
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """

    def setUp(self):
        """
        Load the pickled sample dataset, split it into train/test parts, and fit a baseline model
        """
        # Since grid machines may or may not have access to internet, a pickled instance of the
        # popular open-source breast cancer dataset is used for testing.
        # The context manager closes the file handle; ``pickle.load(open(...))`` left it open.
        with open(sample_dataset_path + "/sklearn_data.p", "rb") as dataset_file:
            sample_dataset = pickle.load(dataset_file)

        # Fixed random_state keeps the 75/25 split identical across runs
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(sample_dataset.data,
                                                                                sample_dataset.target,
                                                                                test_size=0.25,
                                                                                random_state=0)
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # First element of the fit result; reused by the custom-weights tests
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0]

    def _assert_auc_in_unit_interval(self, metrics):
        """
        Assert that the 'auc' entry of a metrics mapping lies in [0, 1]
        """
        auc = metrics['auc']
        self.assertGreaterEqual(auc, 0.0)
        self.assertLessEqual(auc, 1.0)

    def _assert_one_prediction_per_row(self, predictions, features):
        """
        Assert that there are as many predictions as feature rows
        """
        self.assertEqual(predictions.shape[0], features.shape[0])

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train, y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train, offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_train, y=self.y_train, offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=sparse.csr_matrix(self.x_train),
                                                                  y=self.y_train,
                                                                  offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on (sparsified) sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

        # Validation metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(validation_metrics)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a Numpy ndarray nor a Scipy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                 offsets=None,
                                                 custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None,
                                                               custom_theta=self.custom_weights)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                    y=self.y_test,
                                                                    offsets=None,
                                                                    custom_theta=self.custom_weights)
        self._assert_auc_in_unit_interval(validation_metrics)
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """

    def setUp(self):
        """
        Load the pickled sample dataset, split it into train/test parts, and fit a baseline model
        """
        # Since grid machines may or may not have access to internet, a pickled instance of the
        # popular open-source breast cancer dataset is used for testing.
        # The context manager closes the file handle; ``pickle.load(open(...))`` left it open.
        with open(sample_dataset_path + "/sklearn_data.p", "rb") as dataset_file:
            sample_dataset = pickle.load(dataset_file)

        # Fixed random_state keeps the 75/25 split identical across runs
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            sample_dataset.data,
            sample_dataset.target,
            test_size=0.25,
            random_state=0)
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer(max_iter=500)
        # First element of the fit result; reused by the custom-weights and warm-start tests
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0]

    def _assert_auc_in_unit_interval(self, metrics):
        """
        Assert that the 'auc' entry of a metrics mapping lies in [0, 1]
        """
        auc = metrics['auc']
        self.assertGreaterEqual(auc, 0.0)
        self.assertLessEqual(auc, 1.0)

    def _assert_one_prediction_per_row(self, predictions, features):
        """
        Assert that there are as many predictions as feature rows
        """
        self.assertEqual(predictions.shape[0], features.shape[0])

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train, y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train, offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_train, y=self.y_train, offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=sparse.csr_matrix(self.x_train),
                                                                  y=self.y_train,
                                                                  offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on (sparsified) sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

        # Validation metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(validation_metrics)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a Numpy ndarray nor a Scipy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                 offsets=None,
                                                 custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None,
                                                               custom_theta=self.custom_weights)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                    y=self.y_test,
                                                                    offsets=None,
                                                                    custom_theta=self.custom_weights)
        self._assert_auc_in_unit_interval(validation_metrics)

    def test_training_with_warm_start(self):
        """
        Training with a user provided model for warm start.
        """
        # Get trainer object, but only train 1 L-BFGS step.
        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0, max_iter=1)

        # Warm start.
        coefficients_warm_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=self.custom_weights)[0]
        # The trained model should be close to the initial value
        # since the solution should have already converged.
        self.assertAllClose(coefficients_warm_start,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        # Cold start.
        coefficients_cold_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=None)[0]
        # The trained model should be far from the setUp model since only 1 step is trained here,
        # while the setUp model was trained with max_iter=500.
        self.assertNotAllClose(coefficients_cold_start,
                               self.custom_weights,
                               msg='models are too close')
class TestBinaryLogisticRegressionTrainer(tf.test.TestCase):
    """
    Test binary logistic regression trainer
    """

    def setUp(self):
        """
        Load the pickled sample dataset, split it into train/test parts, and fit a baseline model
        """
        # Since grid machines may or may not have access to internet, a pickled instance of the
        # popular open-source breast cancer dataset is used for testing.
        # The context manager closes the file handle; ``pickle.load(open(...))`` left it open.
        with open(sample_dataset_path + "/sklearn_data.p", "rb") as dataset_file:
            sample_dataset = pickle.load(dataset_file)

        # Fixed random_state keeps the 75/25 split identical across runs
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            sample_dataset.data,
            sample_dataset.target,
            test_size=0.25,
            random_state=0)
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer(max_iter=1000)
        self.binary_lr_trainer_without_bias = BinaryLogisticRegressionTrainer(max_iter=1000,
                                                                              has_intercept=False)
        # ``fit(...)[0][0]`` of the baseline trainer; reused by the custom-weights and warm-start tests
        self.custom_weights = self.binary_lr_trainer.fit(X=self.x_train,
                                                         y=self.y_train,
                                                         weights=None,
                                                         offsets=None)[0][0]

    def _assert_auc_in_unit_interval(self, metrics):
        """
        Assert that the 'auc' entry of a metrics mapping lies in [0, 1]
        """
        auc = metrics['auc']
        self.assertGreaterEqual(auc, 0.0)
        self.assertLessEqual(auc, 1.0)

    def _assert_one_prediction_per_row(self, predictions, features):
        """
        Assert that there are as many predictions as feature rows
        """
        self.assertEqual(predictions.shape[0], features.shape[0])

    def _check_fit_with_variance_computation(self, **intercept_kwargs):
        """
        Compare trainer output against ``compute_coefficients_and_variance`` for both variance modes

        :param intercept_kwargs: Extra keyword arguments (empty, or ``has_intercept=False``) forwarded
            unchanged to both the trainer constructor and the reference computation
        """
        # Generate the dataset
        # NOTE(review): the data is drawn without seeding here, so a failure may not reproduce.
        num_features = 10
        num_samples = 100
        X = np.random.randn(num_samples, num_features)
        y = np.random.randint(2, size=num_samples)
        weights = np.random.rand(num_samples)
        offsets = np.random.randn(num_samples)
        lambda_l2 = 0.0

        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=lambda_l2,
                                                            max_iter=1000,
                                                            regularize_bias=True,
                                                            **intercept_kwargs)
        variance_modes = (('simple', constants.SIMPLE), ('full', constants.FULL))

        # Reference values first, then the trainer's own results, each in simple -> full order
        expected = {
            label: compute_coefficients_and_variance(X=X,
                                                     y=y,
                                                     weights=weights,
                                                     offsets=offsets,
                                                     variance_mode=mode,
                                                     lambda_l2=lambda_l2,
                                                     **intercept_kwargs)
            for label, mode in variance_modes
        }
        actual = {
            label: binary_lr_trainer.fit(X=sparse.csr_matrix(X),
                                         y=y,
                                         weights=weights,
                                         offsets=offsets,
                                         variance_mode=mode)
            for label, mode in variance_modes
        }

        for label, _ in variance_modes:
            # The reference returns (mean, variance); the trainer's mean sits at [0][0]
            # and its variance at [1] of the fit result.
            self.assertAllClose(expected[label][0],
                                actual[label][0][0],
                                rtol=1e-02,
                                atol=1e-02,
                                msg=f'{label} mean mismatch')
            self.assertAllClose(expected[label][1],
                                actual[label][1],
                                rtol=1e-02,
                                atol=1e-02,
                                msg=f'{label} variance mismatch')

    def test_on_dense_dataset(self):
        """
        Test training on a dense dataset
        """
        # Train on sample data
        self.binary_lr_trainer.fit(X=self.x_train, y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=self.x_train, offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_train, y=self.y_train, offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_on_sparse_dataset(self):
        """
        Test training on a sparse dataset
        """
        # Train on sparsified sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer.predict_proba(X=sparse.csr_matrix(self.x_train), offsets=None)
        training_metrics = self.binary_lr_trainer.compute_metrics(X=sparse.csr_matrix(self.x_train),
                                                                  y=self.y_train,
                                                                  offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_on_sparse_dataset_without_bias(self):
        """
        Test training on a sparse dataset with a trainer created with has_intercept=False
        """
        # Train on sparsified sample data
        self.binary_lr_trainer_without_bias.fit(X=sparse.csr_matrix(self.x_train),
                                                y=self.y_train,
                                                weights=None,
                                                offsets=None)

        # Get predictions and metrics on the training data
        training_pred = self.binary_lr_trainer_without_bias.predict_proba(X=sparse.csr_matrix(self.x_train),
                                                                          offsets=None)
        training_metrics = self.binary_lr_trainer_without_bias.compute_metrics(X=sparse.csr_matrix(self.x_train),
                                                                               y=self.y_train,
                                                                               offsets=None)

        # Training metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(training_metrics)
        self._assert_one_prediction_per_row(training_pred, self.x_train)

    def test_scoring_on_validation_data(self):
        """
        Test inference and metrics computation
        """
        # Train on (sparsified) sample data
        self.binary_lr_trainer.fit(X=sparse.csr_matrix(self.x_train), y=self.y_train, weights=None, offsets=None)

        # Get predictions and metrics on the test data
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

        # Validation metrics must be within the expected range and prediction shape must match
        self._assert_auc_in_unit_interval(validation_metrics)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_scoring_should_fail_if_not_trained(self):
        """
        Inference should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test, offsets=None)

    def test_scoring_should_fail_if_custom_weights_not_of_known_type(self):
        """
        Inference should fail if custom weights are neither a Numpy ndarray nor a Scipy sparse matrix
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        # Run inference using a Python list, which is neither a numpy ndarray nor a scipy matrix
        with self.assertRaises(Exception):
            self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                 offsets=None,
                                                 custom_theta=self.custom_weights.tolist())

    def test_metrics_computation_should_fail_if_model_not_trained(self):
        """
        Metrics computation should fail on untrained model
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        with self.assertRaises(Exception):
            self.binary_lr_trainer.compute_metrics(X=self.x_test, y=self.y_test, offsets=None)

    def test_scoring_should_succeed_if_custom_weights_provided(self):
        """
        Inference should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_pred = self.binary_lr_trainer.predict_proba(X=self.x_test,
                                                               offsets=None,
                                                               custom_theta=self.custom_weights)
        self._assert_one_prediction_per_row(validation_pred, self.x_test)

    def test_metrics_computation_should_succeed_if_custom_weights_provided(self):
        """
        Metrics computation should succeed on untrained model if custom weights provided
        """
        # Reset trainer object
        self.binary_lr_trainer = BinaryLogisticRegressionTrainer()
        validation_metrics = self.binary_lr_trainer.compute_metrics(X=self.x_test,
                                                                    y=self.y_test,
                                                                    offsets=None,
                                                                    custom_theta=self.custom_weights)
        self._assert_auc_in_unit_interval(validation_metrics)

    def test_training_with_warm_start(self):
        """
        Training with a user provided model for warm start.
        """
        # Get trainer object, but only train 1 L-BFGS step.
        binary_lr_trainer = BinaryLogisticRegressionTrainer(lambda_l2=0.0, max_iter=1)

        # Warm start.
        coefficients_warm_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=self.custom_weights)[0][0]
        # The trained model should be close to the initial value
        # since the solution should have already converged.
        self.assertAllClose(coefficients_warm_start,
                            self.custom_weights,
                            rtol=_TOLERANCE,
                            atol=_TOLERANCE,
                            msg='models mismatch')

        # Cold start.
        coefficients_cold_start = binary_lr_trainer.fit(X=self.x_train,
                                                        y=self.y_train,
                                                        weights=None,
                                                        offsets=None,
                                                        theta_initial=None)[0][0]
        # The trained model should be far from the setUp model since only 1 step is trained here,
        # while the setUp model was trained with max_iter=1000.
        self.assertNotAllClose(coefficients_cold_start,
                               self.custom_weights,
                               msg='models are too close')

    def test_fit_with_variance_computation(self):
        """
        Test fit when the variance computation is required
        """
        self._check_fit_with_variance_computation()

    def test_fit_with_variance_computation_without_intercept(self):
        """
        Test fit when the variance computation is required but no intercept is used
        """
        self._check_fit_with_variance_computation(has_intercept=False)
def test_fit_with_variance_computation_without_intercept(self):
    """
    Compare the trainer's coefficients and variances with the reference computation
    for both variance modes when no intercept is used
    """
    # Random dataset; the draw order (X, y, weights, offsets) is kept as-is
    num_features, num_samples = 10, 100
    X = np.random.randn(num_samples, num_features)
    y = np.random.randint(2, size=num_samples)
    weights = np.random.rand(num_samples)
    offsets = np.random.randn(num_samples)
    lambda_l2 = 0.0

    trainer = BinaryLogisticRegressionTrainer(lambda_l2=lambda_l2,
                                              max_iter=1000,
                                              regularize_bias=True,
                                              has_intercept=False)

    # Reference results computed on the dense matrix
    reference_inputs = dict(X=X, y=y, weights=weights, offsets=offsets, lambda_l2=lambda_l2,
                            has_intercept=False)
    expected_simple = compute_coefficients_and_variance(variance_mode=constants.SIMPLE, **reference_inputs)
    expected_full = compute_coefficients_and_variance(variance_mode=constants.FULL, **reference_inputs)

    def fit_with(variance_mode):
        # Each fit receives its own freshly built CSR copy of X
        return trainer.fit(X=sparse.csr_matrix(X),
                           y=y,
                           weights=weights,
                           offsets=offsets,
                           variance_mode=variance_mode)

    actual_simple = fit_with(constants.SIMPLE)
    actual_full = fit_with(constants.FULL)

    # The reference returns (mean, variance); the trainer's mean sits at [0][0]
    # and its variance at [1] of the fit result.
    tolerances = dict(rtol=1e-02, atol=1e-02)
    self.assertAllClose(expected_simple[0], actual_simple[0][0], msg='simple mean mismatch', **tolerances)
    self.assertAllClose(expected_simple[1], actual_simple[1], msg='simple variance mismatch', **tolerances)
    self.assertAllClose(expected_full[0], actual_full[0][0], msg='full mean mismatch', **tolerances)
    self.assertAllClose(expected_full[1], actual_full[1], msg='full variance mismatch', **tolerances)