def test_datetime_featurizer_fit_transform():
    """Check ``fit_transform`` when only the ``"year"`` feature is requested.

    The input mixes two numerical columns with two datetime columns whose
    dates all fall in 2020.  The test asserts that the output columns are the
    numerical columns followed by one ``<column>_year`` column per datetime
    column, that every extracted year is 2020, and that
    ``get_feature_names()`` is empty.
    """
    n_rows = 20
    frame = pd.DataFrame({
        'Numerical 1': range(n_rows),
        'Date Col 1': pd.date_range('2020-05-19', periods=n_rows, freq='D'),
        'Date Col 2': pd.date_range('2020-02-03', periods=n_rows, freq='W'),
        'Numerical 2': [0] * n_rows,
    })

    featurizer = DateTimeFeaturizer(features_to_extract=["year"])
    transformed = featurizer.fit_transform(frame)

    expected_columns = [
        'Numerical 1',
        'Numerical 2',
        'Date Col 1_year',
        'Date Col 2_year',
    ]
    assert list(transformed.columns) == expected_columns

    # Both date ranges stay inside 2020, so each year column is constant.
    expected_years = pd.Series([2020] * n_rows)
    for year_column in ("Date Col 1_year", "Date Col 2_year"):
        assert transformed[year_column].equals(expected_years)

    assert featurizer.get_feature_names() == {}
def test_datetime_featurizer_encodes_as_ints():
    """Check that extracted datetime features are encoded as integers.

    Four stages are exercised:

    1. A default featurizer yields ``Int64`` year/month/day-of-week/hour
       columns and reports name-to-code mappings for month and day of week.
    2. With ``encode_as_categories=True`` the month and day-of-week columns
       become categorical while the values and reported mappings stay the
       same.
    3. Calling ``fit_transform`` again on the first featurizer with new data
       yields output and mappings that match the new data.
    4. A featurizer restricted to ``"year"`` and ``"hour"`` reports no
       feature-name mappings.
    """
    def nullable_ints(values):
        # All integer-encoded expectations use pandas' nullable Int64 dtype.
        return pd.Series(values, dtype="Int64")

    timestamps = pd.DataFrame({"date": [
        "2016-04-10 16:10:09",
        "2017-03-15 13:32:05",
        "2018-07-10 07:15:10",
        "2019-08-19 20:20:20",
        "2020-01-03 06:45:12",
    ]})
    month_codes = [3, 2, 6, 7, 0]
    weekday_codes = [0, 3, 2, 1, 5]
    expected_ints = pd.DataFrame({
        "date_year": nullable_ints([2016, 2017, 2018, 2019, 2020]),
        "date_month": nullable_ints(month_codes),
        "date_day_of_week": nullable_ints(weekday_codes),
        "date_hour": nullable_ints([16, 13, 7, 20, 6]),
    })
    expected_names = {
        'date_month': {
            'April': 3, 'March': 2, 'July': 6, 'August': 7, 'January': 0,
        },
        'date_day_of_week': {
            'Sunday': 0, 'Wednesday': 3, 'Tuesday': 2, 'Monday': 1, 'Friday': 5,
        },
    }

    featurizer = DateTimeFeaturizer()
    int_encoded = featurizer.fit_transform(timestamps)
    assert_frame_equal(expected_ints, int_encoded.to_dataframe())
    assert featurizer.get_feature_names() == expected_names

    # Switching encode_as_categories to True must only change the dtypes of
    # the month / day-of-week columns, never their values.
    categorical_featurizer = DateTimeFeaturizer(encode_as_categories=True)
    cat_encoded = categorical_featurizer.fit_transform(timestamps)
    expected_cats = expected_ints.assign(
        date_month=pd.Categorical(month_codes),
        date_day_of_week=pd.Categorical(weekday_codes),
    )
    assert_frame_equal(expected_cats, cat_encoded.to_dataframe())
    assert categorical_featurizer.get_feature_names() == expected_names

    # Re-use the very first featurizer on new data to confirm that
    # sequential fit_transform calls on one instance behave as expected.
    dates_only = pd.DataFrame({"date": ["2020-04-10", "2017-03-15", "2019-08-19"]})
    refit_encoded = featurizer.fit_transform(dates_only)
    expected_refit = pd.DataFrame({
        "date_year": nullable_ints([2020, 2017, 2019]),
        "date_month": nullable_ints([3, 2, 7]),
        "date_day_of_week": nullable_ints([5, 3, 1]),
        "date_hour": nullable_ints([0, 0, 0]),
    })
    assert_frame_equal(expected_refit, refit_encoded.to_dataframe())
    assert featurizer.get_feature_names() == {
        'date_month': {'April': 3, 'March': 2, 'August': 7},
        'date_day_of_week': {'Friday': 5, 'Wednesday': 3, 'Monday': 1},
    }

    # Extracting only year and hour leaves no name-to-code mappings.
    year_hour_featurizer = DateTimeFeaturizer(features_to_extract=["year", "hour"])
    year_hour_featurizer.fit_transform(dates_only)
    assert year_hour_featurizer.get_feature_names() == {}