def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated
            using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to
            the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet
    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    last_time_index = []
    for entity in description['entities'].values():
        entity['loading_info']['params'].update(kwargs)
        # If path is None, an empty dataframe will be created for entity.
        description_to_entity(entity, entityset, path=path)
        if entity['properties']['last_time_index']:
            last_time_index.append(entity['id'])

    for relationship in description['relationships']:
        relationship = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship)

    if len(last_time_index):
        entityset.add_last_time_indexes(updated_entities=last_time_index)

    return entityset
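A minimal round-trip sketch for the entity-based version above. It assumes an existing EntitySet `es` and that `entityset_to_description` is importable from the serialize module the docstring points at; those names are taken from the docstring rather than confirmed here. Because the description is built in memory, `path` is None, so the reconstructed entities carry schema only.

# Round-trip sketch (assumed helper: serialize.entityset_to_description, per the docstring)
from featuretools.entityset.serialize import entityset_to_description

description = entityset_to_description(es)       # no 'path' key: description built in memory
es_copy = description_to_entityset(description)  # entities recreated with empty dataframes
assert es_copy.id == es.id
assert set(e.id for e in es_copy.entities) == set(e.id for e in es.entities)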
def test_add_last_time_indexes(): pd_es = EntitySet(id="pd_es") dask_es = EntitySet(id="dask_es") sessions = pd.DataFrame({"id": [0, 1, 2, 3], "user": [1, 2, 1, 3], "time": [pd.to_datetime('2019-01-10'), pd.to_datetime('2019-02-03'), pd.to_datetime('2019-01-01'), pd.to_datetime('2017-08-25')], "strings": ["I am a string", "23", "abcdef ghijk", ""]}) sessions_dask = dd.from_pandas(sessions, npartitions=2) sessions_vtypes = { "id": ft.variable_types.Id, "user": ft.variable_types.Id, "time": ft.variable_types.DatetimeTimeIndex, "strings": ft.variable_types.NaturalLanguage } transactions = pd.DataFrame({"id": [0, 1, 2, 3, 4, 5], "session_id": [0, 0, 1, 2, 2, 3], "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13], "time": [pd.to_datetime('2019-01-10 03:53'), pd.to_datetime('2019-01-10 04:12'), pd.to_datetime('2019-02-03 10:34'), pd.to_datetime('2019-01-01 12:35'), pd.to_datetime('2019-01-01 12:49'), pd.to_datetime('2017-08-25 04:53')]}) transactions_dask = dd.from_pandas(transactions, npartitions=2) transactions_vtypes = { "id": ft.variable_types.Id, "session_id": ft.variable_types.Id, "amount": ft.variable_types.Numeric, "time": ft.variable_types.DatetimeTimeIndex, } pd_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions, index="id", time_index="time") dask_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_dask, index="id", time_index="time", variable_types=sessions_vtypes) pd_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions, index="id", time_index="time") dask_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_dask, index="id", time_index="time", variable_types=transactions_vtypes) new_rel = Relationship(pd_es["sessions"]["id"], pd_es["transactions"]["session_id"]) dask_rel = Relationship(dask_es["sessions"]["id"], dask_es["transactions"]["session_id"]) pd_es = pd_es.add_relationship(new_rel) dask_es = dask_es.add_relationship(dask_rel) assert pd_es['sessions'].last_time_index is None assert dask_es['sessions'].last_time_index is None pd_es.add_last_time_indexes() dask_es.add_last_time_indexes() pd.testing.assert_series_equal(pd_es['sessions'].last_time_index.sort_index(), dask_es['sessions'].last_time_index.compute(), check_names=False)
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated
            using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to
            the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet
    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path, validate=False, **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
def datetime_es(): cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) transactions_df = pd.DataFrame( { "id": [1, 2, 3, 4, 5], "card_id": [1, 1, 5, 1, 5], "transaction_time": pd.to_datetime( [ "2011-2-28 04:00", "2012-2-28 05:00", "2012-2-29 06:00", "2012-3-1 08:00", "2014-4-1 10:00", ] ), "fraud": [True, False, False, False, True], } ) datetime_es = EntitySet(id="fraud_data") datetime_es = datetime_es.add_dataframe( dataframe_name="transactions", dataframe=transactions_df, index="id", time_index="transaction_time", ) datetime_es = datetime_es.add_dataframe( dataframe_name="cards", dataframe=cards_df, index="id" ) datetime_es = datetime_es.add_relationship("cards", "id", "transactions", "card_id") datetime_es.add_last_time_indexes() return datetime_es
def datetime_es(): cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]}) transactions_df = pd.DataFrame({ "id": [1, 2, 3, 4, 5], "card_id": [1, 1, 5, 1, 5], "transaction_time": pd.to_datetime([ '2011-2-28 04:00', '2012-2-28 05:00', '2012-2-29 06:00', '2012-3-1 08:00', '2014-4-1 10:00' ]), "fraud": [True, False, False, False, True] }) datetime_es = EntitySet(id="fraud_data") datetime_es = datetime_es.entity_from_dataframe( entity_id="transactions", dataframe=transactions_df, index="id", time_index="transaction_time") datetime_es = datetime_es.entity_from_dataframe(entity_id="cards", dataframe=cards_df, index="id") relationship = Relationship(datetime_es["cards"]["id"], datetime_es["transactions"]["card_id"]) datetime_es = datetime_es.add_relationship(relationship) datetime_es.add_last_time_indexes() return datetime_es
def description_to_entityset(description, **kwargs): """Deserialize entityset from data description. Args: description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description` kwargs (keywords): Additional keyword arguments to pass as keywords arguments to the underlying deserialization method. Returns: entityset (EntitySet) : Instance of :class:`.EntitySet`. """ check_schema_version(description, "entityset") from featuretools.entityset import EntitySet # If data description was not read from disk, path is None. path = description.get("path") entityset = EntitySet(description["id"]) for df in description["dataframes"].values(): if path is not None: data_path = os.path.join(path, "data", df["name"]) format = description.get("format") if format is not None: kwargs["format"] = format if format == "parquet" and df["loading_info"][ "table_type"] == "pandas": kwargs["filename"] = df["name"] + ".parquet" dataframe = read_woodwork_table(data_path, validate=False, **kwargs) else: dataframe = empty_dataframe(df) entityset.add_dataframe(dataframe) for relationship in description["relationships"]: rel = Relationship.from_dictionary(relationship, entityset) entityset.add_relationship(relationship=rel) return entityset
def test_operations_invalidate_metadata(es): new_es = EntitySet(id="test") # test metadata gets created on access assert new_es._data_description is None assert new_es.metadata is not None # generated after access assert new_es._data_description is not None if not isinstance(es['customers'].df, pd.DataFrame): customers_vtypes = es["customers"].variable_types customers_vtypes['signup_date'] = variable_types.Datetime else: customers_vtypes = None new_es.entity_from_dataframe("customers", es["customers"].df, index=es["customers"].index, variable_types=customers_vtypes) if not isinstance(es['sessions'].df, pd.DataFrame): sessions_vtypes = es["sessions"].variable_types else: sessions_vtypes = None new_es.entity_from_dataframe("sessions", es["sessions"].df, index=es["sessions"].index, variable_types=sessions_vtypes) assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None r = Relationship(new_es["customers"]["id"], new_es["sessions"]["customer_id"]) new_es = new_es.add_relationship(r) assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None new_es = new_es.normalize_entity("customers", "cohort", "cohort") assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None new_es.add_last_time_indexes() assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None # automatically adding interesting values not supported in Dask or Koalas if any(isinstance(entity.df, pd.DataFrame) for entity in new_es.entities): new_es.add_interesting_values() assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None
def test_operations_invalidate_metadata(es): new_es = EntitySet(id="test") # test metadata gets created on access assert new_es._data_description is None assert new_es.metadata is not None # generated after access assert new_es._data_description is not None if not isinstance(es['customers'], pd.DataFrame): customers_ltypes = es["customers"].ww.logical_types customers_ltypes['signup_date'] = Datetime else: customers_ltypes = None new_es.add_dataframe(es["customers"], "customers", index=es["customers"].index, logical_types=customers_ltypes) if not isinstance(es['sessions'], pd.DataFrame): sessions_ltypes = es["sessions"].ww.logical_types else: sessions_ltypes = None new_es.add_dataframe(es["sessions"], "sessions", index=es["sessions"].index, logical_types=sessions_ltypes) assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None new_es = new_es.add_relationship("customers", "id", "sessions", "customer_id") assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None new_es = new_es.normalize_dataframe("customers", "cohort", "cohort") assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None new_es.add_last_time_indexes() assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None # automatically adding interesting values not supported in Dask or Koalas if new_es.dataframe_type == Library.PANDAS.value: new_es.add_interesting_values() assert new_es._data_description is None assert new_es.metadata is not None assert new_es._data_description is not None
def test_add_last_time_indexes(): pd_es = EntitySet(id="pd_es") dask_es = EntitySet(id="dask_es") sessions = pd.DataFrame({ "id": [0, 1, 2, 3], "user": [1, 2, 1, 3], "time": [ pd.to_datetime('2019-01-10'), pd.to_datetime('2019-02-03'), pd.to_datetime('2019-01-01'), pd.to_datetime('2017-08-25') ], "strings": ["I am a string", "23", "abcdef ghijk", ""] }) sessions_dask = dd.from_pandas(sessions, npartitions=2) sessions_logical_types = { "id": Integer, "user": Integer, "time": Datetime, "strings": NaturalLanguage } transactions = pd.DataFrame({ "id": [0, 1, 2, 3, 4, 5], "session_id": [0, 0, 1, 2, 2, 3], "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13], "time": [ pd.to_datetime('2019-01-10 03:53'), pd.to_datetime('2019-01-10 04:12'), pd.to_datetime('2019-02-03 10:34'), pd.to_datetime('2019-01-01 12:35'), pd.to_datetime('2019-01-01 12:49'), pd.to_datetime('2017-08-25 04:53') ] }) transactions_dask = dd.from_pandas(transactions, npartitions=2) transactions_logical_types = { "id": Integer, "session_id": Integer, "time": Datetime, "amount": Double } pd_es.add_dataframe(dataframe_name="sessions", dataframe=sessions, index="id", time_index="time") dask_es.add_dataframe(dataframe_name="sessions", dataframe=sessions_dask, index="id", time_index="time", logical_types=sessions_logical_types) pd_es.add_dataframe(dataframe_name="transactions", dataframe=transactions, index="id", time_index="time") dask_es.add_dataframe(dataframe_name="transactions", dataframe=transactions_dask, index="id", time_index="time", logical_types=transactions_logical_types) pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id") dask_es = dask_es.add_relationship("sessions", "id", "transactions", "session_id") assert 'foreign_key' in pd_es['transactions'].ww.semantic_tags[ 'session_id'] assert 'foreign_key' in dask_es['transactions'].ww.semantic_tags[ 'session_id'] assert pd_es['sessions'].ww.metadata.get('last_time_index') is None assert dask_es['sessions'].ww.metadata.get('last_time_index') is None pd_es.add_last_time_indexes() dask_es.add_last_time_indexes() pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index') ks_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index') assert pd_lti_name == ks_lti_name pd.testing.assert_series_equal( pd_es['sessions'][pd_lti_name].sort_index(), dask_es['sessions'][ks_lti_name].compute().sort_index(), check_names=False)
def test_add_last_time_indexes(): pd_es = EntitySet(id="pd_es") spark_es = EntitySet(id="spark_es") sessions = pd.DataFrame( { "id": [0, 1, 2, 3], "user": [1, 2, 1, 3], "time": [ pd.to_datetime("2019-01-10"), pd.to_datetime("2019-02-03"), pd.to_datetime("2019-01-01"), pd.to_datetime("2017-08-25"), ], "strings": ["I am a string", "23", "abcdef ghijk", ""], } ) sessions_spark = ps.from_pandas(sessions) sessions_logical_types = { "id": Integer, "user": Integer, "strings": NaturalLanguage, "time": Datetime, } transactions = pd.DataFrame( { "id": [0, 1, 2, 3, 4, 5], "session_id": [0, 0, 1, 2, 2, 3], "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13], "time": [ pd.to_datetime("2019-01-10 03:53"), pd.to_datetime("2019-01-10 04:12"), pd.to_datetime("2019-02-03 10:34"), pd.to_datetime("2019-01-01 12:35"), pd.to_datetime("2019-01-01 12:49"), pd.to_datetime("2017-08-25 04:53"), ], } ) transactions_spark = ps.from_pandas(transactions) transactions_logical_types = { "id": Integer, "session_id": Integer, "amount": Double, "time": Datetime, } pd_es.add_dataframe( dataframe_name="sessions", dataframe=sessions, index="id", time_index="time" ) spark_es.add_dataframe( dataframe_name="sessions", dataframe=sessions_spark, index="id", time_index="time", logical_types=sessions_logical_types, ) pd_es.add_dataframe( dataframe_name="transactions", dataframe=transactions, index="id", time_index="time", ) spark_es.add_dataframe( dataframe_name="transactions", dataframe=transactions_spark, index="id", time_index="time", logical_types=transactions_logical_types, ) pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id") spark_es = spark_es.add_relationship("sessions", "id", "transactions", "session_id") assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"] assert "foreign_key" in spark_es["transactions"].ww.semantic_tags["session_id"] assert pd_es["sessions"].ww.metadata.get("last_time_index") is None assert spark_es["sessions"].ww.metadata.get("last_time_index") is None pd_es.add_last_time_indexes() spark_es.add_last_time_indexes() pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index") spark_lti_name = spark_es["sessions"].ww.metadata.get("last_time_index") assert pd_lti_name == spark_lti_name pd.testing.assert_series_equal( pd_es["sessions"][pd_lti_name].sort_index(), spark_es["sessions"][spark_lti_name].to_pandas().sort_index(), check_names=False, )