def test_create_deepnull_prediction(self, target_is_binary):
  size = 1000
  design_df = _create_df(target_is_binary=target_is_binary, size=size)
  design_df['FID'] = np.arange(size)
  design_df['IID'] = np.arange(size)
  design_df['unused_str_column'] = np.random.choice(list('abcdefg'), size)
  input_df = design_df.copy(deep=True)
  # Use a small network and a single epoch to keep the test fast.
  full_config = config.get_config(config.DEEPNULL)
  full_config.model_config.mlp_units = (32, 16)
  full_config.training_config.num_epochs = 1
  full_config.training_config.batch_size = 32
  with tempfile.TemporaryDirectory() as tmpdir:
    final_df, _, test_perf_df = train_eval.create_deepnull_prediction(
        input_df=input_df,
        target='label',
        target_is_binary=target_is_binary,
        covariates=['cov1', 'cov2'],
        full_config=full_config,
        prediction_column='label_deepnull',
        num_folds=3,
        seed=5,
        logdir=tmpdir,
        verbosity=0)
  # The input DataFrame must be returned unmodified, with the prediction
  # column appended as the only new column.
  pd.testing.assert_frame_equal(design_df, input_df)
  self.assertCountEqual(final_df.columns,
                        list(design_df.columns) + ['label_deepnull'])
  pd.testing.assert_frame_equal(design_df, final_df[design_df.columns])
  self.assertCountEqual(
      test_perf_df.columns,
      ['IID', 'label', 'label_deepnull', 'label_deepnull_eval_fold'])
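# The `_create_df` helper used above is defined elsewhere in the test module.
# For reference, a minimal sketch of such a helper, assuming it returns a
# DataFrame with 'cov1', 'cov2', and a 'label' column (the distributions and
# the dependence of 'label' on the covariates here are hypothetical):
def _create_df_sketch(target_is_binary: bool, size: int) -> pd.DataFrame:
  rng = np.random.RandomState(seed=0)
  cov1 = rng.normal(size=size)
  cov2 = rng.normal(size=size)
  # The target depends on both covariates plus noise, so a fit model should
  # recover some signal from them.
  signal = cov1 + 0.5 * cov2 + rng.normal(size=size)
  label = (signal > 0).astype(int) if target_is_binary else signal
  return pd.DataFrame({'cov1': cov1, 'cov2': cov2, 'label': label})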
def test_deepnull_model_compiles(self, cls):
  full_config = config.get_config(config.DEEPNULL)
  model = cls(
      target='target',
      covariates=['cov1', 'cov2'],
      full_config=full_config,
      fold_ix=0)
  self.assertIsInstance(model, model_lib._DeepNull)
def test_get_model(self, config_name, binary, expected):
  full_config = config.get_config(config_name)
  actual = model_lib.get_model(
      target='target',
      target_is_binary=binary,
      covariates=['cov1', 'cov2'],
      full_config=full_config,
      fold_ix=0,
      logdir='/tmp',
      seed=1)
  self.assertIsInstance(actual, expected)
def test_xgboost_model_fit_and_predict(self, cls, metric, target, expected):
  train_df, eval_df = _create_test_data()
  full_config = config.get_config(config.XGBOOST)
  model = cls(target=target, covariates=['cov1', 'cov2'],
              full_config=full_config)
  model.fit(train_df=train_df, eval_df=eval_df, verbosity=0)
  actual_df = model.predict(
      df=eval_df, prediction_column='xgboost_prediction')
  actual_metric = metric(eval_df[target], actual_df['xgboost_prediction'])
  self.assertEqual(actual_df.shape, (200, 2))
  self.assertEqual(list(actual_df.columns), ['IID', 'xgboost_prediction'])
  self.assertAlmostEqual(actual_metric, expected)
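# `_create_test_data` is another helper defined elsewhere in the test module;
# the assertions above imply it returns (train_df, eval_df) where eval_df has
# 200 rows and an 'IID' column. A minimal sketch under those assumptions (the
# 1000-row total and the 800/200 split are hypothetical):
def _create_test_data_sketch():
  df = _create_df_sketch(target_is_binary=False, size=1000)
  df['IID'] = np.arange(len(df))
  train_df = df.iloc[:800].reset_index(drop=True)
  eval_df = df.iloc[800:].reset_index(drop=True)
  return train_df, eval_df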
def main(argv: Sequence[str]) -> None:
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  tf.random.set_seed(_SEED.value)

  logging.info('Loading data from %s', _INPUT_TSV.value)
  input_df, binary_col_map = data.load_plink_or_bolt_file(
      path_or_buf=_INPUT_TSV.value, missing_value=_MISSING_VALUE.value)

  if FLAGS.model_config is None:
    full_config = config.get_config('deepnull')
  else:
    full_config = FLAGS.model_config
  logging.info('Training DeepNull model on %s with model %s', _TARGET.value,
               full_config)
  final_df, eval_metrics, _ = train_eval.create_deepnull_prediction(
      input_df=input_df,
      target=_TARGET.value,
      target_is_binary=_TARGET.value in binary_col_map,
      covariates=_COVARIATES.value,
      full_config=full_config,
      prediction_column=_PREDS_COL.value,
      num_folds=_NUM_FOLDS.value,
      seed=_SEED.value,
      logdir=_LOGDIR.value,
      # Verbosity level 2 prints one line per epoch during training.
      verbosity=2 if _VERBOSE.value else 0,
  )
  if not metrics.acceptable_model_performance(eval_metrics):
    logging.warning(
        'WARNING: data folds have substantially different performance. '
        'Consider retraining the model with a different seed.')

  logging.info('Writing trained results to %s', _OUTPUT_TSV.value)
  data.write_plink_or_bolt_file(
      input_df=final_df,
      path_or_buf=_OUTPUT_TSV.value,
      binary_column_mapping=binary_col_map,
      missing_value=_MISSING_VALUE.value,
      cast_ints=True)
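# Example invocation of this binary. The flag names below are inferred from
# the _INPUT_TSV/_OUTPUT_TSV/_TARGET/... flag holders referenced above and
# may differ from the actual flag definitions:
#
#   python -m deepnull.main \
#     --input_tsv=pheno.tsv \
#     --output_tsv=pheno.deepnull.tsv \
#     --target=height \
#     --covariates=age,sex \
#     --num_folds=5 \
#     --seed=42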
def test_deepnull_model_fit_and_predict(self, cls, metric, target, expected):
  train_df, eval_df = _create_test_data()
  full_config = config.get_config(config.DEEPNULL)
  full_config.model_config.mlp_units = (32, 16)
  full_config.training_config.num_epochs = 2
  full_config.training_config.batch_size = 200
  model = cls(
      target=target,
      covariates=['cov1', 'cov2'],
      full_config=full_config,
      fold_ix=0)
  tf.random.set_seed(42)
  model.fit(train_df=train_df, eval_df=eval_df, verbosity=0)
  actual_df = model.predict(
      df=eval_df, prediction_column='deepnull_prediction')
  actual_metric = metric(eval_df[target], actual_df['deepnull_prediction'])
  self.assertEqual(actual_df.shape, (200, 2))
  self.assertEqual(list(actual_df.columns), ['IID', 'deepnull_prediction'])
  self.assertAlmostEqual(actual_metric, expected)
def test_unsupported_model(self):
  bad = 'unsupported_model'
  with self.assertRaisesRegex(ValueError,
                              f'Config "{bad}" is not a supported model'):
    config.get_config(bad)
def test_supported_model(self):
  valid_config = config.get_config(config.DEEPNULL)
  self.assertEqual(valid_config.model_type, config.DEEPNULL)