def test_extra_woodwork_params(es):
    """Adding a Woodwork-initialized frame ignores schema params and warns."""
    new_es = EntitySet()
    sessions_copy = es["sessions"].ww.copy()

    # Baseline schema before the add.
    assert sessions_copy.ww.index == "id"
    assert sessions_copy.ww.time_index is None
    assert isinstance(sessions_copy.ww.logical_types["id"], Integer)

    warning_msg = (
        "A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: "
        "index, time_index, logical_types, make_index, semantic_tags, already_sorted"
    )
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(
            dataframe_name="sessions",
            dataframe=sessions_copy,
            index="filepath",
            time_index="customer_id",
            logical_types={"id": Categorical},
            make_index=True,
            already_sorted=True,
            semantic_tags={"id": "new_tag"},
        )

    # The ignored parameters must not have touched the existing schema.
    assert sessions_copy.ww.index == "id"
    assert sessions_copy.ww.time_index is None
    assert isinstance(sessions_copy.ww.logical_types["id"], Integer)
    assert "new_tag" not in sessions_copy.ww.semantic_tags
def test_init_es_with_relationships(pd_df):
    """EntitySet built with a relationships list wires parent and child columns."""
    child_df = pd.DataFrame({"id": [0, 1, 2, 3], "first_table_id": [1, 2, 2, 1]})
    pd_df.ww.init(name="first_table", index="id")
    child_df.ww.init(name="second_table", index="id")

    es = EntitySet(
        "es",
        dataframes={"first_table": (pd_df, ), "second_table": (child_df, )},
        relationships=[("first_table", "id", "second_table", "first_table_id")],
    )

    assert len(es.relationships) == 1
    parent_names = [name for name, _ in es.get_forward_dataframes("second_table")]
    assert parent_names[0] == "first_table"

    rel = es.relationships[0]
    assert "foreign_key" in rel.child_column.ww.semantic_tags
    assert "index" in rel.parent_column.ww.semantic_tags
def test_init_es_with_relationships(pd_df):
    """Relationship tuples passed to the constructor create one relationship."""
    df2 = pd.DataFrame({'id': [0, 1, 2, 3], 'first_table_id': [1, 2, 2, 1]})
    pd_df.ww.init(name='first_table', index='id')
    df2.ww.init(name='second_table', index='id')

    es = EntitySet(
        'es',
        dataframes={'first_table': (pd_df, ), 'second_table': (df2, )},
        relationships=[('first_table', 'id', 'second_table', 'first_table_id')],
    )

    assert len(es.relationships) == 1

    # The child table's only forward (parent) dataframe is the first table.
    forward = [name for name, _ in es.get_forward_dataframes('second_table')]
    assert forward[0] == 'first_table'

    relationship = es.relationships[0]
    assert 'foreign_key' in relationship.child_column.ww.semantic_tags
    assert 'index' in relationship.parent_column.ww.semantic_tags
def test_extra_woodwork_params(es):
    """Schema-defining parameters are ignored (with a warning) for initialized frames."""
    new_es = EntitySet()
    ww_df = es['sessions'].ww.copy()

    # Pre-conditions established by the fixture's schema.
    assert ww_df.ww.index == 'id'
    assert ww_df.ww.time_index is None
    assert isinstance(ww_df.ww.logical_types['id'], Integer)

    warning_msg = (
        'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: '
        'index, time_index, logical_types, make_index, semantic_tags, already_sorted'
    )
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(
            dataframe_name='sessions',
            dataframe=ww_df,
            index='filepath',
            time_index='customer_id',
            logical_types={'id': Categorical},
            make_index=True,
            already_sorted=True,
            semantic_tags={'id': 'new_tag'},
        )

    # Nothing about the existing schema may have changed.
    assert ww_df.ww.index == 'id'
    assert ww_df.ww.time_index is None
    assert isinstance(ww_df.ww.logical_types['id'], Integer)
    assert 'new_tag' not in ww_df.ww.semantic_tags
def test_dataframe_without_name(es):
    """An uninitialized frame added without a name raises a ValueError."""
    new_es = EntitySet()
    unnamed_df = es['sessions'].copy()
    # .copy() (not .ww.copy()) drops the Woodwork schema.
    assert unnamed_df.ww.schema is None

    error = 'Cannot add dataframe to EntitySet without a name. Please provide a value for the dataframe_name parameter.'
    with pytest.raises(ValueError, match=error):
        new_es.add_dataframe(unnamed_df)
def test_woodwork_dataframe_same_name_parameter(es):
    """dataframe_name matching the Woodwork schema name is accepted silently."""
    new_es = EntitySet()
    named_df = es['sessions'].ww.copy()
    named_df.ww._schema.name = 'df_name'
    assert named_df.ww.name == 'df_name'

    new_es.add_dataframe(named_df, dataframe_name='df_name')
    assert new_es['df_name'].ww.name == 'df_name'
def test_woodwork_dataframe_without_name_errors(es):
    """A Woodwork frame whose schema name is None cannot be added to an EntitySet."""
    new_es = EntitySet()
    nameless_df = es['sessions'].ww.copy()
    nameless_df.ww._schema.name = None
    assert nameless_df.ww.name is None

    error = 'Cannot add a Woodwork DataFrame to EntitySet without a name'
    with pytest.raises(ValueError, match=error):
        new_es.add_dataframe(nameless_df)
def test_woodwork_dataframe_same_name_parameter(es):
    """No warning or error when dataframe_name equals the existing schema name."""
    new_es = EntitySet()
    sessions_copy = es["sessions"].ww.copy()
    sessions_copy.ww._schema.name = "df_name"
    assert sessions_copy.ww.name == "df_name"

    new_es.add_dataframe(sessions_copy, dataframe_name="df_name")
    assert new_es["df_name"].ww.name == "df_name"
def test_add_secondary_time_index(dates_df):
    """The secondary time index column is appended to its own column list."""
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="dates_backwards",
    )
    es = EntitySet("es")
    es.add_dataframe(
        dates_df,
        secondary_time_index={"repeating_dates": ["random_order", "special"]},
    )

    # add_dataframe stores the mapping in metadata with the index column added.
    assert dates_df.ww.metadata["secondary_time_index"] == {
        "repeating_dates": ["random_order", "special", "repeating_dates"]
    }
def test_dataframe_with_name_parameter(es):
    """dataframe_name initializes Woodwork with that name on a raw frame."""
    new_es = EntitySet()
    raw_df = es["sessions"][["id"]]
    # Column selection returns a frame without a Woodwork schema.
    assert raw_df.ww.schema is None

    new_es.add_dataframe(
        raw_df,
        dataframe_name="df_name",
        index="id",
        logical_types={"id": "Integer"},
    )
    assert new_es["df_name"].ww.name == "df_name"
def test_dataframe_with_name_parameter(es):
    """An uninitialized frame takes its Woodwork name from dataframe_name."""
    new_es = EntitySet()
    id_only_df = es['sessions'][['id']]
    assert id_only_df.ww.schema is None

    new_es.add_dataframe(
        id_only_df,
        dataframe_name='df_name',
        index='id',
        logical_types={'id': 'Integer'},
    )
    assert new_es['df_name'].ww.name == 'df_name'
def test_add_secondary_time_index(dates_df):
    """Secondary time index metadata includes the index column itself."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='dates_backwards',
    )
    es = EntitySet('es')
    es.add_dataframe(
        dates_df,
        secondary_time_index={'repeating_dates': ['random_order', 'special']},
    )

    expected = {'repeating_dates': ['random_order', 'special', 'repeating_dates']}
    assert dates_df.ww.metadata['secondary_time_index'] == expected
def test_woodwork_dataframe_ignore_conflicting_name_parameter_warning(es):
    """A conflicting dataframe_name is ignored with a warning; the schema name wins."""
    new_es = EntitySet()
    named_df = es['sessions'].ww.copy()
    named_df.ww._schema.name = 'df_name'
    assert named_df.ww.name == 'df_name'

    warning = 'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: dataframe_name'
    with pytest.warns(UserWarning, match=warning):
        new_es.add_dataframe(named_df, dataframe_name='conflicting_name')

    # The frame is stored under its Woodwork name, not the conflicting parameter.
    assert new_es['df_name'].ww.name == 'df_name'
def test_time_type_check_order(dates_df):
    """Time-type validation runs before the secondary time index is recorded."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='random_order',
    )
    es = EntitySet('es')

    error = 'dates_table time index is Datetime type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(
            dates_df,
            secondary_time_index={'repeating_dates': ['random_order', 'special']},
        )

    # The failed add must leave the frame's metadata untouched.
    assert 'secondary_time_index' not in dates_df.ww.metadata
def test_init_es_with_multiple_dataframes(pd_df):
    """Constructor accepts a mix of Woodwork-initialized and raw dataframes."""
    plain_df = pd.DataFrame({"id": [0, 1, 2, 3], "first_table_id": [1, 2, 2, 1]})
    pd_df.ww.init(name="first_table", index="id")

    es = EntitySet(
        "es",
        dataframes={
            # Already-initialized frame: just the frame in the tuple.
            "first_table": (pd_df, ),
            # Raw frame: (df, index, time_index, logical_types, semantic_tags).
            "second_table": (
                plain_df,
                "id",
                None,
                None,
                {"first_table_id": "foreign_key"},
            ),
        },
    )

    assert len(es.dataframe_dict) == 2
    assert es["first_table"].ww.schema is not None
    assert es["second_table"].ww.schema is not None
def test_time_type_check_order(dates_df):
    """A time-type mismatch aborts add_dataframe before metadata is written."""
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="random_order",
    )
    es = EntitySet("es")

    error = "dates_table time index is Datetime type which differs from other entityset time indexes"
    secondary = {"repeating_dates": ["random_order", "special"]}
    with pytest.raises(TypeError, match=error):
        es.add_dataframe(dates_df, secondary_time_index=secondary)

    assert "secondary_time_index" not in dates_df.ww.metadata
def read_entityset(path, load_data=True):
    """Reconstruct an EntitySet serialized at *path*.

    Reads ``metadata.json`` from the directory and delegates to
    ``EntitySet.from_metadata``. When ``load_data`` is False, ``data_root``
    is passed as None so only the schema/metadata is loaded, not the data.
    """
    # Imported locally to avoid a circular import at module load time.
    from featuretools.entityset.entityset import EntitySet

    data_root = os.path.abspath(os.path.expanduser(path))
    metadata_file = os.path.join(data_root, 'metadata.json')
    with open(metadata_file) as f:
        metadata = json.load(f)
    return EntitySet.from_metadata(
        metadata,
        data_root=data_root if load_data else None,
    )
def test_change_es_dataframe_schema(df):
    """Woodwork schema changes made through the EntitySet are visible."""
    df.ww.init(index="id", name="table")
    es = EntitySet("es", dataframes={"table": (df, )})
    assert es["table"].ww.index == "id"

    # Mutating the schema via the ww accessor updates the stored frame.
    es["table"].ww.set_index("category")
    assert es["table"].ww.index == "category"
def test_change_es_dataframe_schema(df):
    """set_index on a stored dataframe changes the index seen by the EntitySet."""
    df.ww.init(index='id', name='table')
    es = EntitySet('es', dataframes={'table': (df, )})

    assert es['table'].ww.index == 'id'
    es['table'].ww.set_index('category')
    assert es['table'].ww.index == 'category'
def test_init_es_with_dataframe(df):
    """A (dataframe, index) tuple initializes Woodwork with inferred types."""
    es = EntitySet('es', dataframes={'table': (df, 'id')})

    assert es.id == 'es'
    assert len(es.dataframe_dict) == 1
    # The EntitySet stores the same object, not a copy.
    assert es['table'] is df

    table = es['table']
    assert table.ww.schema is not None
    assert isinstance(table.ww.logical_types['id'], Integer)
    assert isinstance(table.ww.logical_types['category'], Categorical)
def test_init_es_with_dataframe(df):
    """Constructing with a raw frame plus index infers logical types."""
    es = EntitySet("es", dataframes={"table": (df, "id")})

    assert es.id == "es"
    assert len(es.dataframe_dict) == 1
    assert es["table"] is df
    assert es["table"].ww.schema is not None

    logical_types = es["table"].ww.logical_types
    assert isinstance(logical_types["id"], Integer)
    assert isinstance(logical_types["category"], Categorical)
def test_int_double_time_type(dates_df):
    """Integer primary and Double secondary time indexes may coexist."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='random_order',
        logical_types={'random_order': 'Integer', 'special': 'Double'},
    )
    es = EntitySet('es')
    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df, secondary_time_index={'special': ['dates_backwards']})

    table = es['dates_table']
    assert isinstance(table.ww.logical_types['random_order'], Integer)
    assert isinstance(table.ww.logical_types['special'], Double)
    assert table.ww.time_index == 'random_order'
    assert 'special' in table.ww.metadata['secondary_time_index']
def test_normalize_ww_init():
    """normalize_dataframe keeps the source schema name and names the new frame."""
    es = EntitySet()
    base_df = pd.DataFrame({
        'id': [1, 2, 3, 4],
        'col': ['a', 'b', 'c', 'd'],
        'df2_id': [1, 1, 2, 2],
        'df2_col': [True, False, True, True],
    })
    base_df.ww.init(index='id', name='test_name')
    es.add_dataframe(dataframe=base_df)

    assert es['test_name'].ww.name == 'test_name'
    assert es['test_name'].ww.schema.name == 'test_name'

    es.normalize_dataframe(
        'test_name', 'new_df', 'df2_id', additional_columns=['df2_col'],
    )

    # Original frame's name is preserved; the normalized frame gets its own.
    assert es['test_name'].ww.name == 'test_name'
    assert es['test_name'].ww.schema.name == 'test_name'
    assert es['new_df'].ww.name == 'new_df'
    assert es['new_df'].ww.schema.name == 'new_df'
def test_replace_dataframe_different_dataframe_types():
    """Replacing a dask-backed frame with a pandas frame raises TypeError."""
    dask_es = EntitySet(id="dask_es")
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25'),
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)

    logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage,
    }
    semantic_tags = {'user': '******'}
    dask_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_dask,
        index="id",
        time_index="time",
        logical_types=logical_types,
        semantic_tags=semantic_tags,
    )

    # A pandas frame cannot replace a dask-backed one.
    with pytest.raises(TypeError, match='Incorrect DataFrame type used'):
        dask_es.replace_dataframe('sessions', sessions)
def test_replace_dataframe_data_transformation(latlong_df):
    """LatLong columns are re-transformed when the dataframe is replaced."""
    initial_df = latlong_df.copy()
    initial_df.ww.init(
        name='latlongs',
        index='string_tuple',
        logical_types={col_name: 'LatLong' for col_name in initial_df.columns},
    )
    es = EntitySet()
    es.add_dataframe(dataframe=initial_df)

    df = to_pandas(es['latlongs'])
    expected_val = (1, 2)
    if ks and isinstance(es['latlongs'], ks.DataFrame):
        # Koalas represents latlong values as lists rather than tuples.
        expected_val = [1, 2]
    for col in latlong_df.columns:
        assert df[col].iloc[0] == expected_val

    es.replace_dataframe('latlongs', latlong_df)
    df = to_pandas(es['latlongs'])
    expected_val = (3, 4)
    if ks and isinstance(es['latlongs'], ks.DataFrame):
        expected_val = [3, 4]
    for col in latlong_df.columns:
        assert df[col].iloc[-1] == expected_val
def test_normalize_ww_init():
    """Both frames carry correct Woodwork names after normalize_dataframe."""
    es = EntitySet()
    source_df = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "col": ["a", "b", "c", "d"],
        "df2_id": [1, 1, 2, 2],
        "df2_col": [True, False, True, True],
    })
    source_df.ww.init(index="id", name="test_name")
    es.add_dataframe(dataframe=source_df)

    assert es["test_name"].ww.name == "test_name"
    assert es["test_name"].ww.schema.name == "test_name"

    es.normalize_dataframe(
        "test_name", "new_df", "df2_id", additional_columns=["df2_col"],
    )

    for frame_name in ("test_name", "new_df"):
        assert es[frame_name].ww.name == frame_name
        assert es[frame_name].ww.schema.name == frame_name
def test_normalize_dataframe():
    """Normalizing splits columns into a parent table and tags the foreign key."""
    base_df = pd.DataFrame({
        'id': range(4),
        'full_name': ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner'],
        'email': [
            '*****@*****.**',
            np.nan,
            '*****@*****.**',
            '*****@*****.**',
        ],
        'phone_number': ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555'],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })
    base_df.ww.init(name='first_table', index='id', time_index='signup_date')

    es = EntitySet('es')
    es.add_dataframe(base_df)
    es.normalize_dataframe(
        'first_table',
        'second_table',
        'age',
        additional_columns=['phone_number', 'full_name'],
        make_time_index=True,
    )

    assert len(es.dataframe_dict) == 2
    # The normalization column becomes a foreign key in the child table.
    assert 'foreign_key' in es['first_table'].ww.semantic_tags['age']
def test_replace_dataframe():
    """replace_dataframe swaps the data while preserving the Woodwork schema."""
    data = pd.DataFrame({
        'id': range(4),
        'full_name': ['Mr. John Doe', 'Doe, Mrs. Jane', 'James Brown', 'Ms. Paige Turner'],
        'email': [
            '*****@*****.**',
            np.nan,
            '*****@*****.**',
            '*****@*****.**',
        ],
        'phone_number': ['5555555555', '555-555-5555', '1-(555)-555-5555', '555-555-5555'],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })
    data.ww.init(name='table', index='id')

    es = EntitySet('es')
    es.add_dataframe(data)
    original_schema = es['table'].ww.schema

    replacement = data.iloc[2:]
    es.replace_dataframe('table', replacement)

    assert len(es['table']) == 2
    assert es['table'].ww.schema == original_schema
def test_int_double_time_type(dates_df):
    """Numeric time indexes of different logical types are both accepted."""
    dates_df.ww.init(
        name="dates_table",
        index="backwards_order",
        time_index="random_order",
        logical_types={"random_order": "Integer", "special": "Double"},
    )
    es = EntitySet("es")
    # Both random_order and special are numeric, but they are different logical types
    es.add_dataframe(dates_df, secondary_time_index={"special": ["dates_backwards"]})

    dates_table = es["dates_table"]
    assert isinstance(dates_table.ww.logical_types["random_order"], Integer)
    assert isinstance(dates_table.ww.logical_types["special"], Double)
    assert dates_table.ww.time_index == "random_order"
    assert "special" in dates_table.ww.metadata["secondary_time_index"]
def test_init_es_with_woodwork_table_same_name(df):
    """A pre-initialized Woodwork frame keeps its own schema in the EntitySet."""
    df.ww.init(index='id', name='table')
    es = EntitySet('es', dataframes={'table': (df, )})

    assert es.id == 'es'
    assert len(es.dataframe_dict) == 1
    assert es['table'] is df

    table = es['table']
    assert table.ww.schema is not None
    assert table.ww.index == 'id'
    assert table.ww.time_index is None
    assert isinstance(table.ww.logical_types['id'], Integer)
    assert isinstance(table.ww.logical_types['category'], Categorical)