Пример #1
0
def test_with_feature_attachment():
    """Check that attaching stored features yields a higher final metric.

    Columns of the generated frame are moved one by one into a temporary
    feature directory until a single column is left in ``X``. The baseline
    experiment runs on that remaining column only; the second experiment
    additionally attaches features 0-3 from the feature directory.
    """
    X, y = make_classification_df(n_num_features=5, class_sep=0.7)
    params = {'objective': 'binary', 'max_depth': 8}

    with get_temp_directory() as temp_feature_path:
        # Snapshot the column names first: X is mutated inside the loop.
        for feature_id, column in enumerate(list(X.columns)):
            # Keep the last remaining column in X as the base feature.
            if X.shape[1] == 1:
                break
            save_feature(X[[column]], feature_id, directory=temp_feature_path)
            X.drop(columns=column, inplace=True)

        X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

        with get_temp_directory() as baseline_dir:
            baseline = run_experiment(params, X_train, y_train, X_test,
                                      logging_directory=baseline_dir)

        with get_temp_directory() as attached_dir:
            attached = run_experiment(params, X_train, y_train, X_test,
                                      logging_directory=attached_dir,
                                      feature_list=[0, 1, 2, 3],
                                      feature_directory=temp_feature_path)

        assert attached.metrics[-1] > baseline.metrics[-1]
Пример #2
0
def test_feature_exists():
    """Saving under an existing feature name with ``overwrite=False`` must raise."""
    frame = pd.DataFrame({'a': [1, 2, 3, 4, 5] + [None] * 5})

    with get_temp_directory() as tmp:
        # The first write stores feature 0; the second one must refuse to replace it.
        fs.save_feature(frame[['a']], 0, directory=tmp)

        with pytest.raises(RuntimeError):
            fs.save_feature(frame, 0, overwrite=False, directory=tmp)
Пример #3
0
def test_save_feature():
    """Saving feature ``0`` must create the file ``0.f`` in the target directory."""
    frame = pd.DataFrame({'a': np.arange(100)})

    with get_temp_directory() as tmp:
        fs.save_feature(frame, 0, tmp)

        expected_file = os.path.join(tmp, '0.f')
        assert os.path.exists(expected_file)
Пример #4
0
def test_load_feature():
    """A saved single-column frame must be loaded back unchanged."""
    original = pd.DataFrame({'a': np.arange(100)})

    with get_temp_directory() as tmp:
        fs.save_feature(original, 0, tmp)
        restored = fs.load_feature(0, tmp)

        assert_frame_equal(original, restored)
Пример #5
0
def test_load_features():
    """Loading features 0 and 1 onto base column ``a`` must rebuild the full frame."""
    values = np.arange(100)
    full = pd.DataFrame({
        'a': values.astype(float),
        'b': values.astype(int),
        'c': values.astype(int),
    })

    with get_temp_directory() as tmp:
        # Columns 'b' and 'c' are stored as separate features 0 and 1.
        fs.save_feature(full[['b']], 0, tmp)
        fs.save_feature(full[['c']], 1, tmp)

        combined = fs.load_features(full[['a']], [0, 1], tmp)
        assert_frame_equal(full, combined)
Пример #6
0
def test_load_feature_ignore_all_columns():
    """Ignoring every stored column (plus an unknown name) must leave no columns."""
    values = np.arange(100)
    stored = pd.DataFrame({
        'a': values.astype(float),
        'b': values.astype(int),
        'c': values.astype(int),
    })

    with get_temp_directory() as tmp:
        fs.save_feature(stored, 0, tmp)

        # 'X' is not a stored column; it is listed alongside the real ones.
        df_loaded = fs.load_feature(0, tmp, ignore_columns=['a', 'b', 'c', 'X'])

        expected = stored.drop(columns=['a', 'b', 'c'])
        assert_frame_equal(df_loaded, expected)
Пример #7
0
def test_various_dtypes():
    """A frame mixing float and signed/unsigned integer dtypes must round-trip unchanged."""
    column_dtypes = {
        'a': float,
        'b': int,
        'c': np.uint8,
        'd': np.uint16,
        'e': np.uint32,
        'f': np.int8,
        'g': np.int16,
        'h': np.int32,
        'i': np.int64,
    }
    original = pd.DataFrame({
        name: np.arange(100).astype(dtype) for name, dtype in column_dtypes.items()
    })

    with get_temp_directory() as tmp:
        fs.save_feature(original, 0, tmp)
        restored = fs.load_feature(0, tmp)

        assert_frame_equal(original, restored)
Пример #8
0
def test_load_features_no_base():
    """With no base frame, loaded columns must follow the order of the feature list."""
    values = np.arange(100)
    source = pd.DataFrame({
        'a': values.astype(float),
        'b': values.astype(int),
        'c': values.astype(int),
    })

    with get_temp_directory() as tmp:
        # Feature names mix integers and a string on purpose.
        fs.save_feature(source[['b']], 0, tmp)
        fs.save_feature(source[['c']], 1, tmp)
        fs.save_feature(source[['a']], '2', tmp)

        loaded = fs.load_features(None, [0, 1, '2'], tmp)
        assert list(loaded.columns) == ['b', 'c', 'a']
Пример #9
0
def test_invalid_feature():
    """Saving with a reference target must reject frames containing column ``a``.

    Column ``a`` has values in its first five rows and ``None`` in the last
    five, while the target ``y`` has five entries. Saving ``a`` (alone or
    together with ``b``) is expected to raise ``RuntimeError``; saving the
    fully populated column ``b`` alone is expected to succeed.
    """
    # Use a locally seeded generator so the fixture is identical on every run
    # and does not depend on (or disturb) the global NumPy random state.
    rng = np.random.RandomState(0)
    df = pd.DataFrame({
        'a': [1, 2, 3, 4, 5] + [None] * 5,
        'b': rng.randint(0, 10, size=10)
    })
    y = pd.Series([1, 0, 1, 0, 1])

    with get_temp_directory() as tmp:
        with pytest.raises(RuntimeError):
            fs.save_feature(df[['a']], 0, reference_target_variable=y, directory=tmp)
        with pytest.raises(RuntimeError):
            fs.save_feature(df, 0, reference_target_variable=y, directory=tmp)

        # ok
        fs.save_feature(df[['b']], 0, reference_target_variable=y, directory=tmp)
Пример #10
0
def test_load_features_duplicate_col_name():
    """Clashing column names get a feature-name suffix only with ``rename_duplicate=True``."""
    values = np.arange(100)
    source = pd.DataFrame({
        'a': values.astype(float),
        'b': values.astype(int),
        'c': values.astype(int),
    })

    # Expected column lists for each setting of ``rename_duplicate``.
    expectations = (
        (True, ['a', 'b', 'b_1', 'c', 'b_X', 'a_X']),
        (False, ['a', 'b', 'b', 'c', 'b', 'a']),
    )

    with get_temp_directory() as tmp:
        # The three features overlap on column names 'a' and 'b'.
        fs.save_feature(source[['a', 'b']], 0, tmp)
        fs.save_feature(source[['b', 'c']], 1, tmp)
        fs.save_feature(source[['b', 'a']], 'X', tmp)

        for rename, expected_columns in expectations:
            loaded = fs.load_features(None, [0, 1, 'X'], tmp, rename_duplicate=rename)
            assert list(loaded.columns) == expected_columns