示例#1
0
def test_roundtrip_featureset(tmpdir):
    """Check that save_featureset/load_featureset round-trips all data."""
    save_path = os.path.join(str(tmpdir), 'test.npz')
    for channel_count in [1, 3]:
        for label_set in [['class1', 'class2'], []]:
            feature_set, label_set = sample_featureset(
                3, channel_count, ['amplitude'], label_set,
                names=['a', 'b', 'c'], meta_features=['meta1'])

            # Synthetic per-class probabilities keyed by sample name.
            probabilities = pd.DataFrame(
                np.random.random((len(feature_set), 2)),
                index=feature_set.index.values,
                columns=['class1', 'class2'])

            featurize.save_featureset(feature_set, save_path,
                                      labels=label_set,
                                      pred_probs=probabilities)
            loaded_fset, loaded_data = featurize.load_featureset(save_path)
            assert isinstance(loaded_fset, pd.DataFrame)
            npt.assert_allclose(feature_set.values, loaded_fset.values)
            npt.assert_array_equal(feature_set.index, loaded_fset.index)
            npt.assert_array_equal(feature_set.columns, loaded_fset.columns)
            npt.assert_array_equal(label_set, loaded_data['labels'])
            npt.assert_allclose(probabilities, loaded_data['pred_probs'])
            npt.assert_array_equal(probabilities.columns,
                                   loaded_data['pred_probs'].columns)
示例#2
0
def create_test_featureset(project, label_type='class'):
    """Create and yield test labeled featureset, then delete.

    Parameters
    ----------
    project : `models.Project` instance
        The project under which to create test feature set.
    label_type  : {'class', 'regr', 'none'}, optional
        String indicating whether data are labeled with class names ('class')
        for classification, numerical values for regression ('regr'), or
        unlabeled ('none'). Defaults to 'class'.

    Raises
    ------
    ValueError
        If `label_type` is not one of 'class', 'regr', or 'none'.

    """
    if label_type == 'class':
        targets = ['Mira', 'Classical_Cepheid']
    elif label_type == 'regr':
        targets = [2.2, 3.4, 4.4, 2.2, 3.1]
    elif label_type == 'none':
        targets = []
    else:
        # Previously an unrecognized label_type fell through and raised a
        # confusing NameError on `targets` below; fail fast instead.
        raise ValueError("label_type must be one of "
                         "'class', 'regr', or 'none'")
    features_to_use = (CADENCE_FEATS + GENERAL_FEATS + LOMB_SCARGLE_FEATS)
    fset_data = fixtures.sample_featureset(5, 1, features_to_use, targets)
    # Write the sample data to a uniquely-named netCDF file on disk.
    fset_path = pjoin(cfg['paths']['features_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    fset_data.to_netcdf(fset_path, engine=cfg['xr_engine'])
    f, created = m.File.create_or_get(uri=fset_path)
    fset = m.Featureset.create(name='test_featureset', file=f, project=project,
                               features_list=features_to_use,
                               custom_features_script=None,
                               finished=datetime.datetime.now())
    fset.save()
    try:
        yield fset
    finally:
        # Always remove the DB record, even if the consuming test fails.
        fset.delete_instance()
示例#3
0
def test_fit_multichannel():
    """Test model building helper function for multi-channel feature data."""
    features = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 3, features, ['class1', 'class2'])
    fitted = build_model.build_model_from_featureset(
        fset, build_model.MODELS_TYPE_DICT['RandomForestClassifier']())
    assert isinstance(fitted, RandomForestClassifier)
示例#4
0
def test_fit_multichannel():
    """Test model building helper function for multi-channel feature data."""
    feature_set = sample_featureset(10, 3,
                                    ['amplitude', 'maximum', 'minimum',
                                     'median'],
                                    ['class1', 'class2'])
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    classifier = build_model.build_model_from_featureset(feature_set,
                                                         classifier)
    assert isinstance(classifier, RandomForestClassifier)
示例#5
0
def create_test_featureset(project, label_type='class'):
    """Create and yield test labeled featureset, then delete.

    Parameters
    ----------
    project : `models.Project` instance
        The project under which to create test feature set.
    label_type  : {'class', 'regr', 'none'}, optional
        String indicating whether data are labeled with class names ('class')
        for classification, numerical values for regression ('regr'), or
        unlabeled ('none'). Defaults to 'class'.

    Raises
    ------
    ValueError
        If `label_type` is not one of 'class', 'regr', or 'none'.

    """
    if label_type == 'class':
        targets = ['Mira', 'Classical_Cepheid']
    elif label_type == 'regr':
        targets = [2.2, 3.4, 4.4, 2.2, 3.1]
    elif label_type == 'none':
        targets = []
    else:
        # Previously an unrecognized label_type fell through and raised a
        # confusing NameError on `targets` below; fail fast instead.
        raise ValueError("label_type must be one of "
                         "'class', 'regr', or 'none'")
    features_to_use = (CADENCE_FEATS + GENERAL_FEATS + LOMB_SCARGLE_FEATS)
    fset_data = fixtures.sample_featureset(5, 1, features_to_use, targets)
    # Write the sample data to a uniquely-named netCDF file on disk.
    fset_path = pjoin(cfg['paths']['features_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    fset_data.to_netcdf(fset_path, engine=cfg['xr_engine'])
    f, created = m.File.create_or_get(uri=fset_path)
    fset = m.Featureset.create(name='test_featureset', file=f, project=project,
                               features_list=features_to_use,
                               custom_features_script=None,
                               finished=datetime.datetime.now())
    fset.save()
    try:
        yield fset
    finally:
        # Always remove the DB record, even if the consuming test fails.
        fset.delete_instance()
示例#6
0
def test_model_regression():
    """Test model prediction function: regression"""
    feature_set = sample_featureset(10, 1, ["amplitude"], [0.1, 0.5])
    regressor = build_model.build_model_from_featureset(
        feature_set, model_type="RandomForestRegressor")
    predictions = predict.model_predictions(feature_set, regressor)
    assert all(predictions.name == feature_set.name)
    assert predictions.prediction.values.dtype == np.dtype("float")
示例#7
0
def test_model_regression():
    """Test model prediction function: regression"""
    fset = sample_featureset(10, 1, ['amplitude'], [0.1, 0.5])
    # Regression targets are floats, so predictions must be float-typed.
    rf = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    result = predict.model_predictions(fset, rf)
    assert all(result.name == fset.name)
    assert result.prediction.values.dtype == np.dtype('float')
示例#8
0
def test_score_model():
    """Test calculation of model training score."""
    features = ['amplitude', 'maximum', 'minimum', 'median']
    fset = sample_featureset(10, 1, features, ['class1', 'class2'])
    clf = build_model.build_model_from_featureset(
        fset, build_model.MODELS_TYPE_DICT['RandomForestClassifier']())
    assert isinstance(build_model.score_model(clf, fset), float)
示例#9
0
def test_score_model():
    """Test calculation of model training score."""
    feature_set = sample_featureset(
        10, 1, ['amplitude', 'maximum', 'minimum', 'median'],
        ['class1', 'class2'])
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    classifier = build_model.build_model_from_featureset(feature_set,
                                                         classifier)
    training_score = build_model.score_model(classifier, feature_set)
    assert isinstance(training_score, float)
示例#10
0
def test_model_regression():
    """Test model prediction function: regression.

    NOTE(review): the docstring previously said 'classification', but the
    class labels are overwritten with random floats below and a
    RandomForestRegressor is fit, so this exercises the regression path.
    """
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    # Replace the class labels with random floats so the data are
    # suitable for regression.
    fset.target.values = np.random.random(len(fset.target.values))
    model = build_model.build_model_from_featureset(
        fset, model_type='RandomForestRegressor')
    preds = predict.model_predictions(fset, model)
    assert (all(preds.name == fset.name))
    assert (preds.prediction.values.dtype == np.dtype('float'))
示例#11
0
def test_from_netcdf():
    """Round-trip a Featureset through netCDF serialization."""
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    out_dir = tempfile.mkdtemp()
    nc_path = pjoin(out_dir, 'test.nc')
    fset.to_netcdf(nc_path)
    reloaded = featureset.from_netcdf(nc_path)
    assert isinstance(reloaded, Featureset)
    # Variables and coordinates survive the round trip.
    assert set(fset.data_vars) == set(reloaded.data_vars)
    assert set(fset.coords) == set(reloaded.coords)
示例#12
0
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model"""
    fset = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    optimized = build_model.build_model_from_featureset(
        fset,
        model_type="RandomForestClassifier",
        params_to_optimize={"n_estimators": [10, 50, 100]},
        cv=2)
    preds = predict.model_predictions(fset, optimized)
    assert all(preds.name == fset.name)
    # One probability column per class, one row per sample.
    expected_shape = (len(fset.name), len(np.unique(fset.target)))
    assert preds.prediction.values.shape == expected_shape
    assert preds.prediction.values.dtype == np.dtype("float")
示例#13
0
def test_indexing():
    """Test indexing overloading (__getattr__)."""
    # NOTE: the docstring above was previously placed after the first
    # statement, where Python treats it as a no-op expression rather
    # than the function's docstring.
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    # Positional, slice, single-label, and label-list indexing all
    # delegate to isel/sel on the 'name' dimension.
    assert all(fset[0] == fset.isel(name=0))
    assert all(fset[0:2] == fset.isel(name=[0, 1]))
    assert all(fset['a'] == fset.sel(name='a'))
    assert all(fset[['a', 'b']] == fset.sel(name=['a', 'b']))
    # Feature-name indexing returns the underlying data variable.
    npt.assert_allclose(fset['amplitude'].values.ravel(),
                        fset.data_vars['amplitude'].values.ravel())
示例#14
0
def test_model_classification():
    """Test model prediction function: classification"""
    feature_set = sample_featureset(10, 1, ["amplitude"], ["class1", "class2"])
    clf = build_model.build_model_from_featureset(
        feature_set, model_type="RandomForestClassifier")

    # Probability predictions: one float column per class.
    probs = predict.model_predictions(feature_set, clf)
    assert all(probs.name == feature_set.name)
    n_classes = len(np.unique(feature_set.target))
    assert probs.prediction.values.shape == (len(feature_set.name), n_classes)
    assert probs.prediction.values.dtype == np.dtype("float")

    # Hard class predictions: one label per sample.
    labels = predict.model_predictions(feature_set, clf, return_probs=False)
    assert all(labels.name == feature_set.name)
    assert labels.prediction.values.shape == (len(feature_set.name),)
    assert isinstance(labels.prediction.values[0], (str, bytes))
示例#15
0
def test_predict_optimized_model():
    """Test main predict function (classification) w/ optimized model"""
    feature_set = sample_featureset(10, 1, ['amplitude'],
                                    ['class1', 'class2'])
    search_grid = {"n_estimators": [10, 50, 100]}
    tuned = build_model.build_model_from_featureset(
        feature_set, model_type='RandomForestClassifier',
        params_to_optimize=search_grid, cv=2)
    result = predict.model_predictions(feature_set, tuned)
    assert all(result.name == feature_set.name)
    n_classes = len(np.unique(feature_set.target))
    assert result.prediction.values.shape == (len(feature_set.name),
                                              n_classes)
    assert result.prediction.values.dtype == np.dtype('float')
示例#16
0
def test_impute():
    """Test imputation of missing Featureset values."""
    fset, labels = sample_featureset(
        5, 1, ['amplitude'], ['class1', 'class2'],
        names=['a', 'b', 'c', 'd', 'e'], meta_features=['meta1'])

    # With no missing values, imputation is a no-op.
    result = featurize.impute_featureset(fset)
    assert isinstance(result, pd.DataFrame)
    npt.assert_allclose(fset.amplitude.values, result.amplitude.values)

    # Corrupt the first two amplitude entries.
    fset.amplitude.values[0] = np.inf
    fset.amplitude.values[1] = np.nan
    valid_amps = fset.amplitude.values[2:]
    remaining = fset.values.T.ravel()[2:]

    # Default constant: -2 * max(|finite values|).
    result = featurize.impute_featureset(fset, strategy='constant',
                                         value=None)
    npt.assert_allclose(-2 * np.nanmax(np.abs(remaining)),
                        result.amplitude.values[0:2])

    # Explicit constant fill value.
    result = featurize.impute_featureset(fset, strategy='constant',
                                         value=-1e4)
    npt.assert_allclose(-1e4, result.amplitude.values[0:2])

    # Mean of the valid entries.
    result = featurize.impute_featureset(fset, strategy='mean')
    npt.assert_allclose(np.mean(valid_amps), result.amplitude.values[0:2])
    npt.assert_allclose(valid_amps, result.amplitude.values[2:])

    # Median of the valid entries.
    result = featurize.impute_featureset(fset, strategy='median')
    npt.assert_allclose(np.median(valid_amps), result.amplitude.values[0:2])
    npt.assert_allclose(valid_amps, result.amplitude.values[2:])

    # Mode of the valid entries.
    result = featurize.impute_featureset(fset, strategy='most_frequent')
    npt.assert_allclose(scipy.stats.mode(valid_amps).mode.item(),
                        result.amplitude.values[0:2])
    npt.assert_allclose(valid_amps, result.amplitude.values[2:])

    # In-place imputation mutates the original featureset.
    featurize.impute_featureset(fset, strategy='constant', value=-1e4,
                                inplace=True)
    npt.assert_allclose(-1e4, fset.amplitude.values[0:2])

    # Unknown strategies are rejected.
    with pytest.raises(NotImplementedError):
        featurize.impute_featureset(fset, strategy='blah')
示例#17
0
def test_model_classification():
    """Test model prediction function: classification"""
    fset = sample_featureset(10, 1, ['amplitude'], ['class1', 'class2'])
    fitted = build_model.build_model_from_featureset(
        fset, model_type='RandomForestClassifier')

    # Default: per-class probabilities, float dtype.
    prob_preds = predict.model_predictions(fset, fitted)
    assert all(prob_preds.name == fset.name)
    class_count = len(np.unique(fset.target))
    assert prob_preds.prediction.values.shape == (len(fset.name), class_count)
    assert prob_preds.prediction.values.dtype == np.dtype('float')

    # return_probs=False: a single predicted label per sample.
    label_preds = predict.model_predictions(fset, fitted, return_probs=False)
    assert all(label_preds.name == fset.name)
    assert label_preds.prediction.values.shape == (len(fset.name),)
    assert isinstance(label_preds.prediction.values[0], (str, bytes))
示例#18
0
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization"""
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    base = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    fixed_params = {"criterion": "gini", "bootstrap": True}
    search_space = {"n_estimators": [10, 50, 100],
                    "min_samples_split": [2, 5],
                    "max_features": ["auto", 3]}
    fitted = build_model.build_model_from_featureset(fset, base, None,
                                                     fixed_params,
                                                     search_space)
    # Grid search should wrap the classifier and expose its interface.
    assert isinstance(fitted, GridSearchCV)
    assert isinstance(fitted.best_estimator_, RandomForestClassifier)
    assert hasattr(fitted, "best_params_")
    assert hasattr(fitted, "predict_proba")
示例#19
0
def test_fit_existing_model_optimize():
    """Test model building helper function - with param. optimization"""
    feature_set = sample_featureset(
        10, 1, ['amplitude', 'maximum', 'minimum', 'median'],
        ['class1', 'class2'])
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    options = {"criterion": "gini", "bootstrap": True}
    grid = {"n_estimators": [10, 50, 100],
            "min_samples_split": [2, 5],
            "max_features": ["auto", 3]}
    classifier = build_model.build_model_from_featureset(
        feature_set, classifier, None, options, grid)
    assert hasattr(classifier, "best_params_")
    assert hasattr(classifier, "predict_proba")
    assert isinstance(classifier, GridSearchCV)
    assert isinstance(classifier.best_estimator_, RandomForestClassifier)
示例#20
0
def test_invalid_feature_values():
    """Test proper exception handling for invalid feature values"""
    fset = sample_featureset(10, 1, ['x_valid', 'x_inf', 'x_nan'],
                             ['class1', 'class2'])
    fset.x_inf.values[0, 0] = np.inf
    fset.x_nan.values[0, 0] = np.nan
    clf = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    try:
        clf = build_model.build_model_from_featureset(fset, clf)
    except ValueError as err:
        # Only the offending features should be reported.
        message = str(err)
        assert 'x_valid' not in message
        assert 'x_inf' in message
        assert 'x_nan' in message
    else:
        raise AssertionError("Exception should have been raised for invalid data.")

    # Dropping the bad features makes the fit succeed.
    clf = build_model.build_model_from_featureset(
        fset.drop(['x_inf', 'x_nan']), clf)
    assert isinstance(clf, RandomForestClassifier)
示例#21
0
    def add_file(featureset, create, value, *args, **kwargs):
        """Generate and attach a saved sample featureset file on creation."""
        if not create:
            return

        # Choose targets matching the featureset's declared problem type.
        if 'class' in featureset.name:
            targets = ['Mira', 'Classical_Cepheid']
        elif 'regr' in featureset.name:
            targets = [2.2, 3.4, 4.4, 2.2, 3.1]
        else:
            targets = []
        data, target_labels = sample_featureset(5, 1,
                                                featureset.features_list,
                                                targets)
        out_path = pjoin(TMP_DIR, '{}.npz'.format(str(uuid.uuid4())))
        featurize.save_featureset(data, out_path, labels=target_labels)
        featureset.file_uri = out_path
        DBSession().commit()
示例#22
0
def test_invalid_feature_values():
    """Test proper exception handling for invalid feature values"""
    feature_set = sample_featureset(10, 1, ['x_valid', 'x_inf', 'x_nan'],
                                    ['class1', 'class2'])
    # Inject one non-finite value into each of the two "bad" features.
    feature_set.x_inf.values[0, 0] = np.inf
    feature_set.x_nan.values[0, 0] = np.nan
    rf = build_model.MODELS_TYPE_DICT['RandomForestClassifier']()
    try:
        rf = build_model.build_model_from_featureset(feature_set, rf)
    except ValueError as exc:
        assert 'x_valid' not in str(exc)
        assert 'x_inf' in str(exc)
        assert 'x_nan' in str(exc)
    else:
        raise AssertionError("Exception should have been raised for invalid data.")

    cleaned = feature_set.drop(['x_inf', 'x_nan'])
    rf = build_model.build_model_from_featureset(cleaned, rf)
    assert isinstance(rf, RandomForestClassifier)
示例#23
0
def test_fit_optimize():
    """Test hyperparameter optimization via fit_model_optimize_hyperparams."""
    fset = sample_featureset(10, 1,
                             ['amplitude', 'maximum', 'minimum', 'median'],
                             ['class1', 'class2'])
    options = {"criterion": "gini", "bootstrap": True}
    base_model = build_model.MODELS_TYPE_DICT['RandomForestClassifier'](
        **options)
    feature_df = build_model.rectangularize_featureset(fset)
    search_grid = {"n_estimators": [10, 50, 100],
                   "min_samples_split": [2, 5],
                   "max_features": ["auto", 3]}
    fitted = build_model.fit_model_optimize_hyperparams(feature_df,
                                                        fset['target'],
                                                        base_model,
                                                        search_grid)
    assert hasattr(fitted, "best_params_")
    assert hasattr(fitted, "predict_proba")
    assert isinstance(fitted, GridSearchCV)
    assert isinstance(fitted.best_estimator_, RandomForestClassifier)
示例#24
0
def test_fit_optimize():
    """Test hyperparameter optimization via fit_model_optimize_hyperparams."""
    feature_set = sample_featureset(
        10, 1, ['amplitude', 'maximum', 'minimum', 'median'],
        ['class1', 'class2'])
    rf_options = {"criterion": "gini", "bootstrap": True}
    classifier = build_model.MODELS_TYPE_DICT['RandomForestClassifier'](
        **rf_options)
    frame = feature_set.to_dataframe()
    grid = {"n_estimators": [10, 50, 100],
            "min_samples_split": [2, 5],
            "max_features": ["auto", 3]}
    classifier = build_model.fit_model_optimize_hyperparams(
        frame, feature_set['target'], classifier, grid)
    assert isinstance(classifier, GridSearchCV)
    assert isinstance(classifier.best_estimator_, RandomForestClassifier)
    assert hasattr(classifier, "best_params_")
    assert hasattr(classifier, "predict_proba")
示例#25
0
def test_impute():
    """Test imputation of missing Featureset values."""
    fset, labels = sample_featureset(5, 1, ['amplitude'],
                                     ['class1', 'class2'],
                                     names=['a', 'b', 'c', 'd', 'e'],
                                     meta_features=['meta1'])

    # No missing data: output equals input.
    filled = featurize.impute_featureset(fset)
    npt.assert_allclose(fset.amplitude.values, filled.amplitude.values)
    assert isinstance(filled, pd.DataFrame)

    # Introduce one inf and one NaN amplitude.
    fset.amplitude.values[0] = np.inf
    fset.amplitude.values[1] = np.nan
    good_amps = fset.amplitude.values[2:]
    good_all = fset.values.T.ravel()[2:]

    # strategy='constant', value=None uses -2 * max(|finite values|).
    filled = featurize.impute_featureset(fset, strategy='constant',
                                         value=None)
    npt.assert_allclose(-2 * np.nanmax(np.abs(good_all)),
                        filled.amplitude.values[0:2])

    # strategy='constant' with an explicit value.
    filled = featurize.impute_featureset(fset, strategy='constant',
                                         value=-1e4)
    npt.assert_allclose(-1e4, filled.amplitude.values[0:2])

    for strategy, expected in [('mean', np.mean(good_amps)),
                               ('median', np.median(good_amps)),
                               ('most_frequent',
                                scipy.stats.mode(good_amps).mode.item())]:
        filled = featurize.impute_featureset(fset, strategy=strategy)
        # Missing entries take the summary statistic; others are untouched.
        npt.assert_allclose(expected, filled.amplitude.values[0:2])
        npt.assert_allclose(good_amps, filled.amplitude.values[2:])

    # inplace=True mutates the original featureset.
    featurize.impute_featureset(fset, strategy='constant', value=-1e4,
                                inplace=True)
    npt.assert_allclose(-1e4, fset.amplitude.values[0:2])

    with pytest.raises(NotImplementedError):
        featurize.impute_featureset(fset, strategy='blah')
示例#26
0
def test_impute():
    """Test imputation of missing Featureset values."""
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'], meta_features=['meta1'])

    # No missing values: imputation should leave the data untouched.
    filled = fset.impute()
    assert isinstance(filled, Featureset)
    npt.assert_allclose(fset.amplitude.values, filled.amplitude.values)

    # Introduce one inf and one NaN amplitude entry.
    fset.amplitude.values[0, 0] = np.inf
    fset.amplitude.values[0, 1] = np.nan
    finite_only = Featureset(fset.where(abs(fset) < np.inf))
    good_values = fset.amplitude.values[0, 2:]

    # Default constant: -2 * max(|finite values|) across all variables.
    filled = fset.impute(strategy='constant', value=None)
    finite_magnitudes = np.abs(np.array([v.values.ravel() for v in
                                         finite_only.data_vars.values()]))
    npt.assert_allclose(-2 * np.nanmax(finite_magnitudes),
                        filled.amplitude.values[0, 0:2])
    assert isinstance(filled, Featureset)

    # Explicit constant fill value.
    filled = fset.impute(strategy='constant', value=-1e4)
    npt.assert_allclose(-1e4, filled.amplitude.values[0, 0:2])
    assert isinstance(filled, Featureset)

    # Mean imputation; valid entries are preserved.
    filled = fset.impute(strategy='mean')
    npt.assert_allclose(np.mean(good_values),
                        filled.amplitude.values[0, 0:2])
    npt.assert_allclose(good_values, filled.amplitude.values[0, 2:])
    assert isinstance(filled, Featureset)

    # Median imputation; valid entries are preserved.
    filled = fset.impute(strategy='median')
    npt.assert_allclose(np.median(good_values),
                        filled.amplitude.values[0, 0:2])
    npt.assert_allclose(good_values, filled.amplitude.values[0, 2:])
    assert isinstance(filled, Featureset)

    # Mode imputation; valid entries are preserved.
    filled = fset.impute(strategy='most_frequent')
    npt.assert_allclose(scipy.stats.mode(good_values).mode.item(),
                        filled.amplitude.values[0, 0:2])
    npt.assert_allclose(good_values, filled.amplitude.values[0, 2:])
    assert isinstance(filled, Featureset)
示例#27
0
def test_roundtrip_featureset(tmpdir):
    """Saving then loading a featureset should preserve all its contents."""
    out_file = os.path.join(str(tmpdir), 'test.npz')
    for channels in [1, 3]:
        for class_labels in [['class1', 'class2'], []]:
            fset, class_labels = sample_featureset(
                3, channels, ['amplitude'], class_labels,
                names=['a', 'b', 'c'], meta_features=['meta1'])

            # Random per-class probabilities indexed by sample name.
            probs = pd.DataFrame(np.random.random((len(fset), 2)),
                                 index=fset.index.values,
                                 columns=['class1', 'class2'])

            featurize.save_featureset(fset, out_file, labels=class_labels,
                                      pred_probs=probs)
            re_fset, extra = featurize.load_featureset(out_file)
            assert isinstance(re_fset, pd.DataFrame)
            npt.assert_allclose(fset.values, re_fset.values)
            npt.assert_array_equal(fset.index, re_fset.index)
            npt.assert_array_equal(fset.columns, re_fset.columns)
            npt.assert_array_equal(class_labels, extra['labels'])
            npt.assert_allclose(probs, extra['pred_probs'])
            npt.assert_array_equal(probs.columns,
                                   extra['pred_probs'].columns)
示例#28
0
def test_to_dataframe():
    """Conversion to DataFrame keeps features but drops the target."""
    fset = sample_featureset(3, 1, ['amplitude'], ['class1', 'class2'],
                             labels=['a', 'b', 'c'])
    frame = fset.to_dataframe()
    npt.assert_allclose(fset['amplitude'].values.ravel(), frame['amplitude'])
    assert 'target' not in frame
示例#29
0
def test_repr():
    """Testing Featureset printing."""
    feature_set = sample_featureset(10, 3,
                                    ['amplitude', 'maximum', 'minimum'],
                                    ['class1', 'class2'])
    # Smoke test: building the repr must not raise.
    repr(feature_set)