def test_roundtrip_featureset(tmpdir):
    """Save a feature set (with labels and prediction probabilities) to disk,
    reload it, and verify everything round-trips unchanged."""
    fset_path = os.path.join(str(tmpdir), 'test.npz')
    for n_channels in [1, 3]:
        for labels in [['class1', 'class2'], []]:
            # `labels` is deliberately rebound to the labels actually
            # generated by the sample feature set.
            fset, labels = sample_featureset(3, n_channels, ['amplitude'],
                                             labels, names=['a', 'b', 'c'],
                                             meta_features=['meta1'])
            pred_probs = pd.DataFrame(np.random.random((len(fset), 2)),
                                      index=fset.index.values,
                                      columns=['class1', 'class2'])
            featurize.save_featureset(fset, fset_path, labels=labels,
                                      pred_probs=pred_probs)
            fset_loaded, data_loaded = featurize.load_featureset(fset_path)

            # Feature values, index, and columns survive the round trip.
            npt.assert_allclose(fset.values, fset_loaded.values)
            npt.assert_array_equal(fset.index, fset_loaded.index)
            npt.assert_array_equal(fset.columns, fset_loaded.columns)
            assert isinstance(fset_loaded, pd.DataFrame)

            # So do the labels and prediction probabilities.
            npt.assert_array_equal(labels, data_loaded['labels'])
            npt.assert_allclose(pred_probs, data_loaded['pred_probs'])
            npt.assert_array_equal(pred_probs.columns,
                                   data_loaded['pred_probs'].columns)
def create_test_featureset(project, label_type='class'):
    """Create and yield test labeled featureset, then delete.

    Parameters
    ----------
    project : `models.Project` instance
        The project under which to create test feature set.
    label_type : {'class', 'regr', 'none'}, optional
        String indicating whether data are labeled with class names
        ('class') for classification, numerical values for regression
        ('regr'), or unlabeled ('none'). Defaults to 'class'.
    """
    # Pick targets matching the requested label type.
    if label_type == 'class':
        targets = ['Mira', 'Classical_Cepheid']
    elif label_type == 'regr':
        targets = [2.2, 3.4, 4.4, 2.2, 3.1]
    elif label_type == 'none':
        targets = []

    features_to_use = CADENCE_FEATS + GENERAL_FEATS + LOMB_SCARGLE_FEATS
    data = fixtures.sample_featureset(5, 1, features_to_use, targets)

    # Persist the feature data under a unique filename.
    fset_path = pjoin(cfg['paths']['features_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    data.to_netcdf(fset_path, engine=cfg['xr_engine'])

    f, created = m.File.create_or_get(uri=fset_path)
    fset = m.Featureset.create(name='test_featureset',
                               file=f,
                               project=project,
                               features_list=features_to_use,
                               custom_features_script=None,
                               finished=datetime.datetime.now())
    fset.save()
    try:
        yield fset
    finally:
        # Always remove the DB record, even if the consuming test fails.
        fset.delete_instance()
def test_fit_multichannel():
    """Test model building helper function for multi-channel feature data."""
    fset = sample_featureset(10, 3,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    clf = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    clf = build_model.build_model_from_featureset(fset, clf)
    assert isinstance(clf, RandomForestClassifier)
def test_model_regression():
    """Test model prediction function: regression"""
    fset = sample_featureset(10, 1, ["amplitude"], [0.1, 0.5])
    regressor = build_model.build_model_from_featureset(
        fset, model_type="RandomForestRegressor")
    predictions = predict.model_predictions(fset, regressor)
    # One prediction per sample, and predictions are floats.
    assert all(predictions.name == fset.name)
    assert predictions.prediction.values.dtype == np.dtype("float")
def test_model_regression():
    """Test model prediction function: regression"""
    fset = sample_featureset(10, 1, ['amplitude'], [0.1, 0.5])
    rfr = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    preds = predict.model_predictions(fset, rfr)
    # Predictions align with sample names and are floating point.
    assert all(preds.name == fset.name)
    assert preds.prediction.values.dtype == np.dtype('float')
def test_score_model():
    """Test calculation of model training score."""
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    clf = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    clf = build_model.build_model_from_featureset(fset, clf)
    training_score = build_model.score_model(clf, fset)
    assert isinstance(training_score, float)
def test_model_regression():
    """Test model prediction function: regression.

    Starts from a classification-style sample feature set, then overwrites
    the targets with random floats so the data exercises the regression path.
    (Docstring previously said "classification", which was incorrect.)
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    # Replace class labels with random floats -> regression problem.
    fset.target.values = np.random.random(len(fset.target.values))
    model = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    preds = predict.model_predictions(fset, model)
    assert all(preds.name == fset.name)
    assert preds.prediction.values.dtype == np.dtype('float')
def test_from_netcdf():
    """Test that a Featureset written to netCDF loads back with the same
    data variables and coordinates.

    Fix: the original used `tempfile.mkdtemp()` and never removed the
    directory, leaking a temp dir per test run; `TemporaryDirectory`
    guarantees cleanup.
    """
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    with tempfile.TemporaryDirectory() as data_dir:
        path = pjoin(data_dir, 'test.nc')
        fset.to_netcdf(path)
        loaded = featureset.from_netcdf(path)
        assert isinstance(loaded, Featureset)
        assert set(fset.data_vars) == set(loaded.data_vars)
        assert set(fset.coords) == set(loaded.coords)
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model"""
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    grid_model = build_model.build_model_from_featureset(
        fset,
        model_type="RandomForestClassifier",
        params_to_optimize={"n_estimators": [10, 50, 100]},
        cv=2,
    )
    probs = predict.model_predictions(fset, grid_model)
    n_samples = len(fset.name)
    n_classes = len(np.unique(fset.target))
    # Probability matrix: one row per sample, one column per class.
    assert all(probs.name == fset.name)
    assert probs.prediction.values.shape == (n_samples, n_classes)
    assert probs.prediction.values.dtype == np.dtype("float")
def test_indexing():
    """Test indexing overloading (__getattr__).

    Fix: the docstring was placed after the first statement, making it a
    no-op string literal rather than the function's docstring; moved to
    the top of the function.
    """
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    # Positional indexing delegates to isel(...).
    assert all(fset[0] == fset.isel(name=0))
    assert all(fset[0:2] == fset.isel(name=[0, 1]))
    # Label indexing delegates to sel(...).
    assert all(fset['a'] == fset.sel(name='a'))
    assert all(fset[['a', 'b']] == fset.sel(name=['a', 'b']))
    # Feature-name indexing returns the underlying data variable.
    npt.assert_allclose(fset['amplitude'].values.ravel(),
                        fset.data_vars['amplitude'].values.ravel())
def test_model_classification():
    """Test model prediction function: classification"""
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    clf = build_model.build_model_from_featureset(
        fset, model_type="RandomForestClassifier")

    # Probability predictions: one column per class.
    probs = predict.model_predictions(fset, clf)
    assert all(probs.name == fset.name)
    assert probs.prediction.values.shape == (len(fset.name),
                                             len(np.unique(fset.target)))
    assert probs.prediction.values.dtype == np.dtype("float")

    # Hard class predictions: one label per sample.
    classes = predict.model_predictions(fset, clf, return_probs=False)
    assert all(classes.name == fset.name)
    assert classes.prediction.values.shape == (len(fset.name),)
    assert isinstance(classes.prediction.values[0], (str, bytes))
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model"""
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    optimized = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier',
        params_to_optimize={"n_estimators": [10, 50, 100]}, cv=2)
    preds = predict.model_predictions(fset, optimized)
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert all(preds.name == fset.name)
    assert preds.prediction.values.shape == expected_shape
    assert preds.prediction.values.dtype == np.dtype('float')
def test_impute():
    """Test imputation of missing Featureset values."""
    fset, labels = sample_featureset(5, 1, ['amplitude'],
                                     ['class1', 'class2'],
                                     names=['a', 'b', 'c', 'd', 'e'],
                                     meta_features=['meta1'])

    # With clean data, imputation is a no-op.
    imputed = featurize.impute_featureset(fset)
    npt.assert_allclose(fset.amplitude.values, imputed.amplitude.values)
    assert isinstance(imputed, pd.DataFrame)

    # Corrupt the first two amplitude entries.
    fset.amplitude.values[0] = np.inf
    fset.amplitude.values[1] = np.nan
    good_amps = fset.amplitude.values[2:]
    finite_vals = fset.values.T.ravel()[2:]

    # Default 'constant' fill: -2 * max |finite value|.
    imputed = featurize.impute_featureset(fset, strategy='constant',
                                          value=None)
    npt.assert_allclose(-2 * np.nanmax(np.abs(finite_vals)),
                        imputed.amplitude.values[0:2])

    # Explicit constant fill value.
    imputed = featurize.impute_featureset(fset, strategy='constant',
                                          value=-1e4)
    npt.assert_allclose(-1e4, imputed.amplitude.values[0:2])

    # Mean / median / mode fills use only the finite entries and must not
    # perturb the entries that were already valid.
    imputed = featurize.impute_featureset(fset, strategy='mean')
    npt.assert_allclose(np.mean(good_amps), imputed.amplitude.values[0:2])
    npt.assert_allclose(good_amps, imputed.amplitude.values[2:])

    imputed = featurize.impute_featureset(fset, strategy='median')
    npt.assert_allclose(np.median(good_amps), imputed.amplitude.values[0:2])
    npt.assert_allclose(good_amps, imputed.amplitude.values[2:])

    imputed = featurize.impute_featureset(fset, strategy='most_frequent')
    npt.assert_allclose(scipy.stats.mode(good_amps).mode.item(),
                        imputed.amplitude.values[0:2])
    npt.assert_allclose(good_amps, imputed.amplitude.values[2:])

    # In-place imputation mutates the original frame.
    featurize.impute_featureset(fset, strategy='constant', value=-1e4,
                                inplace=True)
    npt.assert_allclose(-1e4, fset.amplitude.values[0:2])

    # Unknown strategies are rejected.
    with pytest.raises(NotImplementedError):
        featurize.impute_featureset(fset, strategy='blah')
def test_model_classification():
    """Test model prediction function: classification"""
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    model = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')

    n_samples = len(fset.name)
    n_classes = len(np.unique(fset.target))

    # return_probs defaults to True -> per-class probability matrix.
    probs = predict.model_predictions(fset, model)
    assert all(probs.name == fset.name)
    assert probs.prediction.values.shape == (n_samples, n_classes)
    assert probs.prediction.values.dtype == np.dtype('float')

    # return_probs=False -> a single predicted label per sample.
    hard = predict.model_predictions(fset, model, return_probs=False)
    assert all(hard.name == fset.name)
    assert hard.prediction.values.shape == (n_samples,)
    assert isinstance(hard.prediction.values[0], (str, bytes))
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization"""
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    base_model = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    model_options = {"criterion": "gini", "bootstrap": True}
    params_to_optimize = {"n_estimators": [10, 50, 100],
                          "min_samples_split": [2, 5],
                          "max_features": ["auto", 3]}
    fitted = build_model.build_model_from_featureset(
        fset, base_model, None, model_options, params_to_optimize)
    # Grid search wraps the estimator and exposes the best fit.
    assert hasattr(fitted, "best_params_")
    assert hasattr(fitted, "predict_proba")
    assert isinstance(fitted, GridSearchCV)
    assert isinstance(fitted.best_estimator_, RandomForestClassifier)
def test_invalid_feature_values():
    """Test proper exception handling for invalid feature values"""
    fset = sample_featureset(10, 1, ['x_valid', 'x_inf', 'x_nan'],
                             ['class1', 'class2'])
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan
    clf = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    try:
        clf = build_model.build_model_from_featureset(fset, clf)
    except ValueError as err:
        # The error must name only the offending features.
        message = str(err)
        assert 'x_valid' not in message
        assert 'x_inf' in message
        assert 'x_nan' in message
    else:
        raise AssertionError("Exception should have been raised for invalid data.")
    # Dropping the bad features makes the fit succeed.
    clf = build_model.build_model_from_featureset(
        fset.drop(['x_inf', 'x_nan']), clf)
    assert isinstance(clf, RandomForestClassifier)
def add_file(featureset, create, value, *args, **kwargs):
    """Factory post-hook: generate and attach a saved feature-set file to
    `featureset`, choosing labels from the featureset's name."""
    if not create:
        return
    # Infer label type from the featureset name.
    if 'class' in featureset.name:
        labels = ['Mira', 'Classical_Cepheid']
    elif 'regr' in featureset.name:
        labels = [2.2, 3.4, 4.4, 2.2, 3.1]
    else:
        labels = []
    fset_data, fset_labels = sample_featureset(5, 1,
                                               featureset.features_list,
                                               labels)
    fset_path = pjoin(TMP_DIR, '{}.npz'.format(str(uuid.uuid4())))
    featurize.save_featureset(fset_data, fset_path, labels=fset_labels)
    featureset.file_uri = fset_path
    DBSession().commit()
def test_fit_optimize():
    """Test hyperparameter optimization.

    Fix: corrected docstring typo ("hypeparameter").
    """
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    model_options = {"criterion": "gini", "bootstrap": True}
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier'](
        **model_options)
    feature_df = build_model.rectangularize_featureset(fset)
    params_to_optimize = {"n_estimators": [10, 50, 100],
                          "min_samples_split": [2, 5],
                          "max_features": ["auto", 3]}
    model = build_model.fit_model_optimize_hyperparams(feature_df,
                                                       fset['target'], model,
                                                       params_to_optimize)
    assert hasattr(model, "best_params_")
    assert hasattr(model, "predict_proba")
    assert isinstance(model, GridSearchCV)
    assert isinstance(model.best_estimator_, RandomForestClassifier)
def test_fit_optimize():
    """Test hyperparameter optimization.

    Fix: corrected docstring typo ("hypeparameter").
    """
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    model_options = {"criterion": "gini", "bootstrap": True}
    model = build_model.MODELS_TYPE_DICT['RandomForestClassifier'](
        **model_options)
    feature_df = fset.to_dataframe()
    params_to_optimize = {"n_estimators": [10, 50, 100],
                          "min_samples_split": [2, 5],
                          "max_features": ["auto", 3]}
    model = build_model.fit_model_optimize_hyperparams(feature_df,
                                                       fset['target'], model,
                                                       params_to_optimize)
    assert hasattr(model, "best_params_")
    assert hasattr(model, "predict_proba")
    assert isinstance(model, GridSearchCV)
    assert isinstance(model.best_estimator_, RandomForestClassifier)
def test_impute():
    """Test imputation of missing Featureset values."""
    fset, labels = sample_featureset(5, 1, ['amplitude'],
                                     ['class1', 'class2'],
                                     names=['a', 'b', 'c', 'd', 'e'],
                                     meta_features=['meta1'])

    # Clean data passes through unchanged.
    result = featurize.impute_featureset(fset)
    npt.assert_allclose(fset.amplitude.values, result.amplitude.values)
    assert isinstance(result, pd.DataFrame)

    # Introduce one inf and one NaN at the front.
    fset.amplitude.values[0] = np.inf
    fset.amplitude.values[1] = np.nan
    valid_amps = fset.amplitude.values[2:]
    valid_all = fset.values.T.ravel()[2:]

    # strategy='constant', value=None -> fill with -2 * max |valid value|.
    result = featurize.impute_featureset(fset, strategy='constant',
                                         value=None)
    npt.assert_allclose(-2 * np.nanmax(np.abs(valid_all)),
                        result.amplitude.values[0:2])

    # strategy='constant' with an explicit fill value.
    result = featurize.impute_featureset(fset, strategy='constant',
                                         value=-1e4)
    npt.assert_allclose(-1e4, result.amplitude.values[0:2])

    # Statistic-based strategies: fill with the statistic of valid values
    # and leave valid entries untouched.
    for strategy, stat in [('mean', np.mean(valid_amps)),
                           ('median', np.median(valid_amps)),
                           ('most_frequent',
                            scipy.stats.mode(valid_amps).mode.item())]:
        result = featurize.impute_featureset(fset, strategy=strategy)
        npt.assert_allclose(stat, result.amplitude.values[0:2])
        npt.assert_allclose(valid_amps, result.amplitude.values[2:])

    # inplace=True mutates the input frame directly.
    featurize.impute_featureset(fset, strategy='constant', value=-1e4,
                                inplace=True)
    npt.assert_allclose(-1e4, fset.amplitude.values[0:2])

    # Unsupported strategies raise.
    with pytest.raises(NotImplementedError):
        featurize.impute_featureset(fset, strategy='blah')
def test_impute():
    """Test imputation of missing Featureset values."""
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'],
                             meta_features=['meta1'])

    # Clean data: imputation leaves values untouched.
    imputed = fset.impute()
    npt.assert_allclose(fset.amplitude.values, imputed.amplitude.values)
    assert isinstance(imputed, Featureset)

    # Corrupt the first two entries of the amplitude channel.
    fset.amplitude.values[0, 0] = np.inf
    fset.amplitude.values[0, 1] = np.nan
    masked = Featureset(fset.where(abs(fset) < np.inf))
    good_vals = fset.amplitude.values[0, 2:]

    # Default constant fill: -2 * max |finite value| over all variables.
    imputed = fset.impute(strategy='constant', value=None)
    abs_values = np.abs(np.array([v.values.ravel()
                                  for v in masked.data_vars.values()]))
    npt.assert_allclose(-2 * np.nanmax(abs_values),
                        imputed.amplitude.values[0, 0:2])
    assert isinstance(imputed, Featureset)

    # Explicit constant fill value.
    imputed = fset.impute(strategy='constant', value=-1e4)
    npt.assert_allclose(-1e4, imputed.amplitude.values[0, 0:2])
    assert isinstance(imputed, Featureset)

    # Mean fill.
    imputed = fset.impute(strategy='mean')
    npt.assert_allclose(np.mean(good_vals),
                        imputed.amplitude.values[0, 0:2])
    npt.assert_allclose(good_vals, imputed.amplitude.values[0, 2:])
    assert isinstance(imputed, Featureset)

    # Median fill.
    imputed = fset.impute(strategy='median')
    npt.assert_allclose(np.median(good_vals),
                        imputed.amplitude.values[0, 0:2])
    npt.assert_allclose(good_vals, imputed.amplitude.values[0, 2:])
    assert isinstance(imputed, Featureset)

    # Mode fill.
    imputed = fset.impute(strategy='most_frequent')
    npt.assert_allclose(scipy.stats.mode(good_vals).mode.item(),
                        imputed.amplitude.values[0, 0:2])
    npt.assert_allclose(good_vals, imputed.amplitude.values[0, 2:])
    assert isinstance(imputed, Featureset)
def test_to_dataframe():
    """Converting a Featureset to a DataFrame keeps feature values and
    drops the target column."""
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    frame = fset.to_dataframe()
    npt.assert_allclose(fset['amplitude'].values.ravel(),
                        frame['amplitude'])
    assert 'target' not in frame
def test_repr():
    """Testing Featureset printing."""
    fset = sample_featureset(10, 3,
                             ['amplitude', 'maximum', 'minimum'],
                             ['class1', 'class2'])
    # Smoke test: repr() must not raise on a multi-channel feature set.
    repr(fset)