def test_replace_dataframe_different_dataframe_types():
    """Replacing a Dask-backed dataframe with a pandas one raises TypeError."""
    timestamps = [
        pd.to_datetime(day)
        for day in ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    ]
    pandas_sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": timestamps,
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    dask_sessions = dd.from_pandas(pandas_sessions, npartitions=2)

    entityset = EntitySet(id="dask_es")
    entityset.add_dataframe(
        dataframe_name="sessions",
        dataframe=dask_sessions,
        index="id",
        time_index="time",
        logical_types={
            "id": Integer,
            "user": Integer,
            "time": Datetime,
            "strings": NaturalLanguage,
        },
        semantic_tags={'user': '******'},
    )

    # "sessions" was registered from a Dask dataframe, so handing back the
    # original pandas dataframe is expected to be rejected.
    with pytest.raises(TypeError, match='Incorrect DataFrame type used'):
        entityset.replace_dataframe('sessions', pandas_sessions)
def test_replace_dataframe():
    """Replacing a dataframe with a row subset keeps the Woodwork schema."""
    columns = {
        'id': range(4),
        'full_name': [
            'Mr. John Doe',
            'Doe, Mrs. Jane',
            'James Brown',
            'Ms. Paige Turner',
        ],
        'email': [
            '*****@*****.**',
            np.nan,
            '*****@*****.**',
            '*****@*****.**',
        ],
        'phone_number': [
            '5555555555',
            '555-555-5555',
            '1-(555)-555-5555',
            '555-555-5555',
        ],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    }
    table_df = pd.DataFrame(columns)
    table_df.ww.init(name='table', index='id')

    entityset = EntitySet('es')
    entityset.add_dataframe(table_df)

    # Capture the schema before the swap so it can be compared afterwards.
    schema_before = entityset['table'].ww.schema

    # Keep only the last two rows as the replacement data.
    replacement = table_df.iloc[2:]
    entityset.replace_dataframe('table', replacement)

    replaced = entityset['table']
    assert len(replaced) == 2
    assert replaced.ww.schema == schema_before
def test_replace_dataframe_data_transformation(latlong_df):
    """Check LatLong values read from the EntitySet before and after a replace.

    The first row of every column is checked after the initial add, and the
    last row of every column is checked after ``replace_dataframe``.
    """
    typed_copy = latlong_df.copy()
    typed_copy.ww.init(
        name='latlongs',
        index='string_tuple',
        logical_types=dict.fromkeys(typed_copy.columns, 'LatLong'),
    )

    entityset = EntitySet()
    entityset.add_dataframe(dataframe=typed_copy)

    def _assert_latlong_at(position, pair):
        """Assert every column holds ``pair`` at row ``position``."""
        as_pandas = to_pandas(entityset['latlongs'])
        # For Koalas-backed dataframes the expected value is a list, not a tuple.
        if ks and isinstance(entityset['latlongs'], ks.DataFrame):
            wanted = list(pair)
        else:
            wanted = pair
        for column_name in latlong_df.columns:
            assert as_pandas[column_name].iloc[position] == wanted

    _assert_latlong_at(0, (1, 2))

    entityset.replace_dataframe('latlongs', latlong_df)

    _assert_latlong_at(-1, (3, 4))
def test_replace_dataframe():
    """Swap in a two-row slice and confirm length and schema of the result."""
    full_names = [
        "Mr. John Doe",
        "Doe, Mrs. Jane",
        "James Brown",
        "Ms. Paige Turner",
    ]
    emails = [
        "*****@*****.**",
        np.nan,
        "*****@*****.**",
        "*****@*****.**",
    ]
    phone_numbers = [
        "5555555555",
        "555-555-5555",
        "1-(555)-555-5555",
        "555-555-5555",
    ]
    ages = pd.Series([33, None, 33, 57], dtype="Int64")
    registered_flags = pd.Series([True, False, True, None], dtype="boolean")

    source_df = pd.DataFrame({
        "id": range(4),
        "full_name": full_names,
        "email": emails,
        "phone_number": phone_numbers,
        "age": ages,
        "signup_date": [pd.to_datetime("2020-09-01")] * 4,
        "is_registered": registered_flags,
    })
    source_df.ww.init(name="table", index="id")

    es = EntitySet("es")
    es.add_dataframe(source_df)

    # Schema as registered, recorded before the data is replaced.
    expected_schema = es["table"].ww.schema

    # Replacement data: rows from position 2 onward (two rows).
    trimmed_df = source_df.iloc[2:]
    es.replace_dataframe("table", trimmed_df)

    current = es["table"]
    assert len(current) == 2
    assert current.ww.schema == expected_schema