Пример #1
0
def test_refit(data_raw, data_raw_2):
    """Check that calling ``fit`` a second time re-learns the levels.

    The same transformer is fitted and applied to ``data_raw`` and then
    fitted and applied again to ``data_raw_2``; each output is compared
    against the frame expected for that dataset alone.
    """

    def _float32_frame(columns):
        # One float32 Series per (name, values) pair, joined column-wise.
        return pd.concat(
            [pd.Series(values, dtype='float32', name=label)
             for label, values in columns],
            axis=1)

    etl = DataFrameETL(
        cols_to_drop=['pid', 'age', 'djinn_type'],
        cols_to_expand=['fruits', 'animal'],
        dataframe_output=True,
        dummy_na='expanded',
    )

    # First fit / transform round.
    etl.fit(data_raw)
    first_out = etl.transform(data_raw)
    first_expected = _float32_frame([
        ('fruits_1.0', [1., 0., 0.]),
        ('fruits_3.0', [0., 0., 1.]),
        ('fruits_NaN', [0., 1., 0.]),
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ])
    assert first_out.equals(first_expected)

    # Second round on the other dataset: the 'fruits' levels differ.
    etl.fit(data_raw_2)
    second_out = etl.transform(data_raw_2)
    second_expected = _float32_frame([
        ('fruits_-99999.0', [1., 0., 0.]),
        ('fruits_1.0', [0., 0., 1.]),
        ('fruits_NaN', [0., 1., 0.]),
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ])
    assert second_out.equals(second_expected)
Пример #2
0
def test_na_in_transform_but_not_fit_all():
    """Check ``dummy_na='all'`` when NaNs show up only at transform time.

    The fit data contains no missing values.  The expected output has no
    NaN indicator columns: the row whose ``djinn_type`` is missing is all
    zeros across the ``djinn_type`` levels, and the unexpanded ``fruits``
    column keeps its NaN.
    """
    train = pd.concat(
        [
            pd.Series(['marid', 'effrit', 'sila'],
                      dtype='object', name='djinn_type'),
            pd.Series([1.0, 2.0, 3.0], dtype='float', name='fruits'),
        ],
        axis=1)

    etl = DataFrameETL(cols_to_expand=['djinn_type'],
                       dummy_na='all',
                       dataframe_output=True)
    etl.fit(train)

    # Put NaNs in the first row of djinn_type and fruits for transforming.
    to_transform = pd.concat(
        [
            pd.Series([np.nan, 'effrit', 'sila'],
                      dtype='object', name='djinn_type'),
            pd.Series([np.nan, 2.0, 3.0], dtype='float', name='fruits'),
        ],
        axis=1)
    result = etl.transform(to_transform)

    expected_columns = [
        ('djinn_type_effrit', [0., 1., 0.]),
        ('djinn_type_marid', [0., 0., 0.]),
        ('djinn_type_sila', [0., 0., 1.]),
        ('fruits', [np.nan, 2.0, 3.0]),
    ]
    expected = pd.concat(
        [pd.Series(values, dtype='float32', name=label)
         for label, values in expected_columns],
        axis=1)

    assert result.equals(expected)
Пример #3
0
def test_transform_reuse_transformer(data_raw, data_raw_2,
                                     dataframe_2_expected):
    """Fit on ``data_raw``, then transform ``data_raw_2`` with that fit.

    The output must equal the ``dataframe_2_expected`` fixture.
    """
    settings = {
        'cols_to_expand': ['pid', 'fruits'],
        'cols_to_drop': ['djinn_type', 'age', 'animal'],
        'dummy_na': True,
        'dataframe_output': True,
    }
    etl = DataFrameETL(**settings)
    etl.fit(data_raw)

    # Transform a dataset other than the one used for fitting.
    result = etl.transform(data_raw_2)
    assert result.equals(dataframe_2_expected)
Пример #4
0
def test_transform_two_levels(data_few_levels, few_levels_expected):
    """Expand ``data_few_levels`` with ``dummy_na='expanded'`` and
    ``fill_value=99.``; the output must match ``few_levels_expected`` in
    both shape and content.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na='expanded',
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)

    # Shape first, so a column-count mismatch is reported directly.
    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
Пример #5
0
def test_transform_dataframe(data_raw, dataframe_expected):
    """Transform ``data_raw`` with ``dataframe_output=True`` and compare
    the result with the ``dataframe_expected`` fixture.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='expanded',
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Shape first, so a column-count mismatch is reported directly.
    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
Пример #6
0
def test_transform(data_raw, dataframe_expected):
    """Transform ``data_raw`` without requesting dataframe output and
    compare the result with ``dataframe_expected`` converted to an array.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='expanded',
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Compare against the array form of the expected dataframe.
    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
Пример #7
0
def test_transform_preserve_col_order(data_raw, data_raw_2,
                                      dataframe_2_expected):
    """Check the output is unaffected by input column order.

    The transformer is fitted on ``data_raw``; ``data_raw_2`` is then
    passed to ``transform`` with its columns rearranged, and the output
    must still equal ``dataframe_2_expected``.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits'],
        cols_to_drop=['djinn_type', 'age', 'animal'],
        dummy_na='expanded',
        dataframe_output=True,
    )
    etl.fit(data_raw)

    # Rearrange the columns of the second dataset before transforming.
    new_order = ['fruits', 'age', 'djinn_type', 'pid', 'animal']
    reordered = data_raw_2[new_order]

    result = etl.transform(reordered)
    assert result.equals(dataframe_2_expected)
Пример #8
0
def test_transform_no_level_overlap():
    """Check that ``transform`` warns when a column shares no levels with fit.

    The transformer is fitted while 'age' holds strings; the column is then
    cast to ``int`` before ``transform`` is called.  Exactly one warning must
    be recorded, and its message must contain the "No overlap between
    levels" text for column 'age'.
    """
    df = pd.concat([
        # ``np.nan`` (lowercase): the ``np.NaN`` alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        pd.Series([1.0, np.nan, 3.0], dtype='float', name='fruits'),
        pd.Series(["2000", "2500", "3000"], dtype='object', name='age')
    ],
                   axis=1)

    expander = DataFrameETL(cols_to_expand=['age'], dummy_na=False)
    expander.fit(df)

    # the dtype mismatch will cause incorrect categorical expansion;
    # DataFrameETL should issue a warning
    df['age'] = df['age'].astype('int')

    with warnings.catch_warnings(record=True) as w:
        # 'always' so the warning is recorded even if it was raised before
        warnings.simplefilter('always')
        expander.transform(df)
        assert len(w) == 1
        msg = ("No overlap between levels in column 'age' and "
               "levels seen during fit")
        assert msg in w[0].message.args[0]
Пример #9
0
def test_transform_two_levels_no_dummy(data_few_levels, few_levels_expected):
    """Expand ``data_few_levels`` with ``dummy_na=False``.

    The output must match ``few_levels_expected`` once its three NaN
    indicator columns have been removed.
    """
    etl = DataFrameETL(
        cols_to_expand=['pid', 'fruits', 'animal'],
        dummy_na=False,
        fill_value=99.,
        dataframe_output=True,
    )
    etl.fit(data_few_levels)
    result = etl.transform(data_few_levels)

    # Remove the NaN indicator columns from the expected frame, in place.
    for nan_col in ('pid_NaN_Sentinel', 'fruits_NaN', 'animal_NaN'):
        del few_levels_expected[nan_col]

    assert result.shape == few_levels_expected.shape
    assert result.equals(few_levels_expected)
Пример #10
0
def test_transform_dataframe_no_dummy(data_raw, dataframe_expected):
    """Transform ``data_raw`` to a dataframe with ``dummy_na=False``.

    The output must match ``dataframe_expected`` once its three NaN
    indicator columns have been removed.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
        dataframe_output=True,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Remove the NaN indicator columns from the expected frame, in place.
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        del dataframe_expected[nan_col]

    assert result.shape == dataframe_expected.shape
    assert result.equals(dataframe_expected)
Пример #11
0
def test_transform_no_dummy(data_raw, dataframe_expected):
    """Transform ``data_raw`` with ``dummy_na=False`` and no dataframe output.

    The result is compared with the array form of ``dataframe_expected``
    after its three NaN indicator columns have been removed.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na=False,
    )
    etl.fit(data_raw)
    result = etl.transform(data_raw)

    # Remove the NaN indicator columns from the expected frame, in place.
    for nan_col in ('djinn_type_NaN', 'fruits_NaN', 'animal_NaN'):
        del dataframe_expected[nan_col]

    expected = np.asarray(dataframe_expected)
    assert result.shape == expected.shape
    assert_almost_equal(result, expected)
Пример #12
0
def test_pickle(data_raw):
    """Check that a fitted transformer survives a pickle round trip.

    The output of ``fit_transform`` is kept as the reference; the
    transformer is then pickled and unpickled, and the restored copy must
    produce the same array from ``transform``.
    """
    etl = DataFrameETL(
        cols_to_drop=['pid'],
        cols_to_expand=['djinn_type', 'fruits', 'animal'],
        dummy_na='all',
    )
    reference = etl.fit_transform(data_raw)

    # Serialise the fitted transformer and restore it from the bytes.
    payload = pickle.dumps(etl)
    restored = pickle.loads(payload)

    # Transform with the restored transformer.
    result = restored.transform(data_raw)
    assert result.shape == reference.shape
    assert_almost_equal(result, reference)
Пример #13
0
def test_expand_all_na(data_raw):
    """Check ``dummy_na='all'`` on ``data_raw``.

    The output should only carry NaN indicator columns for columns that
    actually contained NaNs during fit, including the unexpanded
    ``fruits`` column.
    """
    etl = DataFrameETL(
        cols_to_expand=['djinn_type', 'animal'],
        cols_to_drop=['pid', 'age'],
        dummy_na='all',
        dataframe_output=True,
    )
    etl.fit(data_raw)

    # Of the unexpanded columns, only 'fruits' is recorded as having NaNs.
    assert etl._unexpanded_nans == {'fruits': True}

    expected_columns = [
        ('djinn_type_effrit', [0., 1., 0.]),
        ('djinn_type_marid', [1., 0., 0.]),
        ('djinn_type_sila', [0., 0., 1.]),
        ('fruits', [1., np.nan, 3.]),
        ('fruits_NaN', [0., 1., 0.]),
        ('animal_cat', [1., 0., 0.]),
        ('animal_dog', [0., 1., 0.]),
        ('animal_NaN', [0., 0., 1.]),
    ]
    expected = pd.concat(
        [pd.Series(values, dtype='float32', name=label)
         for label, values in expected_columns],
        axis=1)

    result = etl.transform(data_raw)
    assert result.equals(expected)
Пример #14
0
def test_transform_notfitted(data_raw):
    """Calling ``transform`` before ``fit`` must raise ``NotFittedError``."""
    unfitted = DataFrameETL()
    # Only the transform call belongs inside the raises context.
    with pytest.raises(NotFittedError):
        unfitted.transform(data_raw)