def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path, validate=False, **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
def datetime_es():
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "card_id": [1, 1, 5, 1, 5],
            "transaction_time": pd.to_datetime(
                [
                    "2011-2-28 04:00",
                    "2012-2-28 05:00",
                    "2012-2-29 06:00",
                    "2012-3-1 08:00",
                    "2014-4-1 10:00",
                ]
            ),
            "fraud": [True, False, False, False, True],
        }
    )

    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time",
    )

    datetime_es = datetime_es.add_dataframe(
        dataframe_name="cards", dataframe=cards_df, index="id"
    )

    datetime_es = datetime_es.add_relationship("cards", "id", "transactions", "card_id")
    datetime_es.add_last_time_indexes()
    return datetime_es
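# Illustrative sketch (not from the original source): how a fixture like
# datetime_es() above might be exercised with a cutoff time DataFrame. The
# cutoff values, primitive choices, and the helper name below are assumptions
# for illustration only; in the original file datetime_es is presumably a
# pytest fixture, while here it is called as a plain function.
def example_dfs_with_cutoff_on_datetime_es():
    import pandas as pd
    import featuretools as ft

    es = datetime_es()

    # One row per card instance; only transactions at or before each cutoff
    # time are used when computing features (default include_cutoff_time=True).
    cutoff_time = pd.DataFrame(
        {
            "id": [1, 5],
            "time": [pd.Timestamp("2012-03-01"), pd.Timestamp("2012-03-01")],
        }
    )

    fm, _ = ft.dfs(
        entityset=es,
        target_dataframe_name="cards",
        agg_primitives=["count"],
        trans_primitives=[],
        cutoff_time=cutoff_time,
    )
    return fm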
def make_ecommerce_entityset(with_integer_time_index=False):
    """ Makes an entityset with the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
        S   P       Sessions, Products
         \\ /       .
          L         Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    dataframe_names = dataframes.keys()
    es_id = 'ecommerce'
    if with_integer_time_index:
        es_id += "_int_time_index"

    logical_types = make_logical_types(
        with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for df_name in dataframe_names:
        time_index = time_indexes.get(df_name, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        df = dataframes[df_name]
        es.add_dataframe(df,
                         dataframe_name=df_name,
                         index='id',
                         logical_types=logical_types[df_name],
                         semantic_tags=semantic_tags[df_name],
                         time_index=ti_name,
                         secondary_time_index=secondary)

    es.normalize_dataframe('customers', 'cohorts', 'cohort',
                           additional_columns=['cohort_name'],
                           make_time_index=True,
                           new_dataframe_time_index='cohort_end')

    es.add_relationships([(u'régions', 'id', 'customers', u'région_id'),
                          (u'régions', 'id', 'stores', u'région_id'),
                          ('customers', 'id', 'sessions', 'customer_id'),
                          ('sessions', 'id', 'log', 'session_id'),
                          ('products', 'id', 'log', 'product_id')])

    return es
def test_single_table_dask_entityset_with_instance_ids():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]
    instance_ids = [0, 1, 3]

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "values": [1, 12, -34, 27],
            "dates": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk", ""],
        }
    )

    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage}
    dask_es.add_dataframe(
        dataframe_name="data", dataframe=values_dd, index="id", logical_types=ltypes
    )

    dask_fm, _ = ft.dfs(
        entityset=dask_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data", dataframe=df, index="id", logical_types=ltypes
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    # Make sure both indexes are sorted the same
    dask_fm = dask_fm.compute().astype({"id": "int64"})
    pd.testing.assert_frame_equal(
        fm, dask_fm.set_index("id").loc[fm.index], check_dtype=False
    )
def test_single_table_spark_entityset_with_instance_ids():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]
    instance_ids = [0, 1, 3]

    spark_es = EntitySet(id="spark_es")
    df = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "values": [1, 12, -34, 27],
            "dates": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk", ""],
        }
    )

    values_dd = ps.from_pandas(df)
    ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage}
    spark_es.add_dataframe(
        dataframe_name="data", dataframe=values_dd, index="id", logical_types=ltypes
    )

    spark_fm, _ = ft.dfs(
        entityset=spark_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data", dataframe=df, index="id", logical_types=ltypes
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    spark_fm = spark_fm.to_pandas().astype({"id": "int64"})
    spark_computed_fm = spark_fm.set_index("id").loc[fm.index]
    # Spark dtypes are different for categorical - set the pandas fm to have the
    # same dtypes before comparing
    pd.testing.assert_frame_equal(fm.astype(spark_computed_fm.dtypes), spark_computed_fm)
def test_single_table_dask_entityset_cutoff_time_df():
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({"id": [0, 1, 2],
                       "values": [1, 12, -34],
                       "dates": [pd.to_datetime('2019-01-10'),
                                 pd.to_datetime('2019-02-03'),
                                 pd.to_datetime('2019-01-01')],
                       "strings": ["I am a string", "23", "abcdef ghijk"]})
    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {
        "values": IntegerNullable,
        "dates": Datetime,
        "strings": NaturalLanguage
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    ids = [0, 1, 2, 0]
    times = [pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-15 04:00")]
    labels = [True, False, True, False]
    cutoff_times = pd.DataFrame({"id": ids, "time": times, "labels": labels},
                                columns=["id", "time", "labels"])

    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        cutoff_time=cutoff_times)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   cutoff_time=cutoff_times)

    # Because row ordering with Dask is not guaranteed, we need to sort on two columns to make sure that values
    # for instance id 0 are compared correctly. Also, make sure the boolean column has the same dtype.
    fm = fm.sort_values(['id', 'labels'])
    dask_fm = dask_fm.compute().set_index('id').sort_values(['id', 'labels'])
    dask_fm['IS_WEEKEND(dates)'] = dask_fm['IS_WEEKEND(dates)'].astype(fm['IS_WEEKEND(dates)'].dtype)
    pd.testing.assert_frame_equal(fm, dask_fm)
def test_single_table_dask_entityset_dates_not_sorted():
    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "values": [1, 12, -34, 27],
            "dates": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
        }
    )

    primitives_list = ["absolute", "is_weekend", "year", "day"]
    values_dd = dd.from_pandas(df, npartitions=1)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    dask_fm, _ = ft.dfs(
        entityset=dask_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        max_depth=1,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        max_depth=1,
    )

    dask_fm = dask_fm.compute().astype({"id": "int64"})
    pd.testing.assert_frame_equal(
        fm, dask_fm.set_index("id").loc[fm.index], check_dtype=False
    )
def test_all_ww_logical_types():
    logical_types = list_logical_types()['type_string'].to_list()
    dataframe = pd.DataFrame(columns=logical_types)
    es = EntitySet()
    ltype_dict = {ltype: ltype for ltype in logical_types}
    ltype_dict['ordinal'] = Ordinal(order=[])
    es.add_dataframe(dataframe=dataframe,
                     dataframe_name='all_types',
                     index='integer',
                     logical_types=ltype_dict)
    description = serialize.entityset_to_description(es)
    _es = deserialize.description_to_entityset(description)
    assert es.__eq__(_es, deep=True)
def test_operations_invalidate_metadata(es):
    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert new_es._data_description is None
    assert new_es.metadata is not None  # generated after access
    assert new_es._data_description is not None

    if not isinstance(es['customers'], pd.DataFrame):
        customers_ltypes = es["customers"].ww.logical_types
        customers_ltypes['signup_date'] = Datetime
    else:
        customers_ltypes = None
    new_es.add_dataframe(es["customers"],
                         "customers",
                         index=es["customers"].index,
                         logical_types=customers_ltypes)

    if not isinstance(es['sessions'], pd.DataFrame):
        sessions_ltypes = es["sessions"].ww.logical_types
    else:
        sessions_ltypes = None
    new_es.add_dataframe(es["sessions"],
                         "sessions",
                         index=es["sessions"].index,
                         logical_types=sessions_ltypes)

    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.add_relationship("customers", "id", "sessions", "customer_id")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.normalize_dataframe("customers", "cohort", "cohort")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es.add_last_time_indexes()
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    # automatically adding interesting values not supported in Dask or Koalas
    if new_es.dataframe_type == Library.PANDAS.value:
        new_es.add_interesting_values()
        assert new_es._data_description is None
        assert new_es.metadata is not None
        assert new_es._data_description is not None
def test_add_dataframe_with_make_index():
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    spark_df = ps.from_pandas(df)
    spark_es = EntitySet(id="spark_es")
    ltypes = {"values": "Integer"}
    spark_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=spark_df,
        make_index=True,
        index="new_index",
        logical_types=ltypes,
    )

    expected_df = pd.DataFrame({"values": values, "new_index": range(len(values))})
    pd.testing.assert_frame_equal(expected_df, spark_es["new_dataframe"].to_pandas())
def test_single_table_dask_entityset_with_instance_ids():
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']
    instance_ids = [0, 1, 3]

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({"id": [0, 1, 2, 3],
                       "values": [1, 12, -34, 27],
                       "dates": [pd.to_datetime('2019-01-10'),
                                 pd.to_datetime('2019-02-03'),
                                 pd.to_datetime('2019-01-01'),
                                 pd.to_datetime('2017-08-25')],
                       "strings": ["I am a string", "23", "abcdef ghijk", ""]})

    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        logical_types=ltypes)

    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        instance_ids=instance_ids)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)

    # Make sure both indexes are sorted the same
    pd.testing.assert_frame_equal(fm, dask_fm.compute().set_index('id').loc[fm.index])
def test_single_table_ks_entityset_ids_not_sorted():
    primitives_list = [
        'absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words'
    ]

    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [2, 0, 1, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    values_dd = ks.from_pandas(df)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }
    ks_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        logical_types=ltypes)

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_dataframe_name="data",
                      trans_primitives=primitives_list)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list)

    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    # Koalas dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing
    pd.testing.assert_frame_equal(fm.astype(ks_computed_fm.dtypes), ks_computed_fm)
def test_add_dataframe_with_make_index():
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    dask_df = dd.from_pandas(df, npartitions=2)
    dask_es = EntitySet(id="dask_es")
    logical_types = {"values": Integer}
    dask_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=dask_df,
        make_index=True,
        index="new_index",
        logical_types=logical_types)

    expected_df = pd.DataFrame({
        "values": values,
        "new_index": range(len(values))
    })
    pd.testing.assert_frame_equal(expected_df, dask_es['new_dataframe'].compute())
def test_single_table_ks_entityset_dates_not_sorted():
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ]
    })
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']
    values_dd = ks.from_pandas(df)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
    }
    ks_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_dataframe_name="data",
                      trans_primitives=primitives_list,
                      max_depth=1)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   max_depth=1)

    ks_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm.astype(ks_fm.dtypes), ks_fm)
def description_to_entityset(description, **kwargs): """Deserialize entityset from data description. Args: description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description` kwargs (keywords): Additional keyword arguments to pass as keywords arguments to the underlying deserialization method. Returns: entityset (EntitySet) : Instance of :class:`.EntitySet`. """ check_schema_version(description, "entityset") from featuretools.entityset import EntitySet # If data description was not read from disk, path is None. path = description.get("path") entityset = EntitySet(description["id"]) for df in description["dataframes"].values(): if path is not None: data_path = os.path.join(path, "data", df["name"]) format = description.get("format") if format is not None: kwargs["format"] = format if format == "parquet" and df["loading_info"][ "table_type"] == "pandas": kwargs["filename"] = df["name"] + ".parquet" dataframe = read_woodwork_table(data_path, validate=False, **kwargs) else: dataframe = empty_dataframe(df) entityset.add_dataframe(dataframe) for relationship in description["relationships"]: rel = Relationship.from_dictionary(relationship, entityset) entityset.add_relationship(relationship=rel) return entityset
def test_add_dataframe(pd_es):
    dask_es = EntitySet(id="dask_es")
    log_dask = dd.from_pandas(pd_es["log"], npartitions=2)
    dask_es = dask_es.add_dataframe(
        dataframe_name="log_dask",
        dataframe=log_dask,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]))
    pd.testing.assert_frame_equal(pd_es["log"], dask_es["log_dask"].compute(), check_like=True)
def test_with_custom_ww_logical_type():
    class CustomLogicalType(LogicalType):
        pass

    ww_type_system.add_type(CustomLogicalType)
    columns = ['integer', 'natural_language', 'custom_logical_type']
    dataframe = pd.DataFrame(columns=columns)
    es = EntitySet()
    ltype_dict = {
        'integer': 'integer',
        'natural_language': 'natural_language',
        'custom_logical_type': CustomLogicalType,
    }
    es.add_dataframe(dataframe=dataframe,
                     dataframe_name='custom_type',
                     index='integer',
                     logical_types=ltype_dict)
    description = serialize.entityset_to_description(es)
    _es = deserialize.description_to_entityset(description)
    assert isinstance(_es['custom_type'].ww.logical_types['custom_logical_type'],
                      CustomLogicalType)
    assert es.__eq__(_es, deep=True)
def test_add_dataframe_from_ks_df(pd_es):
    cleaned_df = pd_to_ks_clean(pd_es["log"])
    log_ks = ks.from_pandas(cleaned_df)
    ks_es = EntitySet(id="ks_es")
    ks_es = ks_es.add_dataframe(
        dataframe_name="log_ks",
        dataframe=log_ks,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]))
    pd.testing.assert_frame_equal(cleaned_df, ks_es["log_ks"].to_pandas(), check_like=True)
def test_add_dataframe_from_spark_df(pd_es):
    cleaned_df = pd_to_spark_clean(pd_es["log"])
    log_spark = ps.from_pandas(cleaned_df)
    spark_es = EntitySet(id="spark_es")
    spark_es = spark_es.add_dataframe(
        dataframe_name="log_spark",
        dataframe=log_spark,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]),
    )
    pd.testing.assert_frame_equal(
        cleaned_df, spark_es["log_spark"].to_pandas(), check_like=True
    )
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    spark_es = EntitySet(id="spark_es")

    sessions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "user": [1, 2, 1, 3],
            "time": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk", ""],
        }
    )
    sessions_spark = ps.from_pandas(sessions)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "strings": NaturalLanguage,
        "time": Datetime,
    }

    transactions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3, 4, 5],
            "session_id": [0, 0, 1, 2, 2, 3],
            "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
            "time": [
                pd.to_datetime("2019-01-10 03:53"),
                pd.to_datetime("2019-01-10 04:12"),
                pd.to_datetime("2019-02-03 10:34"),
                pd.to_datetime("2019-01-01 12:35"),
                pd.to_datetime("2019-01-01 12:49"),
                pd.to_datetime("2017-08-25 04:53"),
            ],
        }
    )
    transactions_spark = ps.from_pandas(transactions)
    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "amount": Double,
        "time": Datetime,
    }

    pd_es.add_dataframe(
        dataframe_name="sessions", dataframe=sessions, index="id", time_index="time"
    )
    spark_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_spark,
        index="id",
        time_index="time",
        logical_types=sessions_logical_types,
    )

    pd_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions,
        index="id",
        time_index="time",
    )
    spark_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_spark,
        index="id",
        time_index="time",
        logical_types=transactions_logical_types,
    )

    pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id")
    spark_es = spark_es.add_relationship("sessions", "id", "transactions", "session_id")

    assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"]
    assert "foreign_key" in spark_es["transactions"].ww.semantic_tags["session_id"]

    assert pd_es["sessions"].ww.metadata.get("last_time_index") is None
    assert spark_es["sessions"].ww.metadata.get("last_time_index") is None

    pd_es.add_last_time_indexes()
    spark_es.add_last_time_indexes()

    pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index")
    spark_lti_name = spark_es["sessions"].ww.metadata.get("last_time_index")
    assert pd_lti_name == spark_lti_name
    pd.testing.assert_series_equal(
        pd_es["sessions"][pd_lti_name].sort_index(),
        spark_es["sessions"][spark_lti_name].to_pandas().sort_index(),
        check_names=False,
    )
def make_ecommerce_entityset(with_integer_time_index=False):
    """Makes an entityset with the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
        S   P       Sessions, Products
         \\ /       .
          L         Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index
    )
    dataframe_names = dataframes.keys()
    es_id = "ecommerce"
    if with_integer_time_index:
        es_id += "_int_time_index"

    logical_types = make_logical_types(with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for df_name in dataframe_names:
        time_index = time_indexes.get(df_name, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index["name"]
            secondary = time_index["secondary"]
        df = dataframes[df_name]
        es.add_dataframe(
            df,
            dataframe_name=df_name,
            index="id",
            logical_types=logical_types[df_name],
            semantic_tags=semantic_tags[df_name],
            time_index=ti_name,
            secondary_time_index=secondary,
        )

    es.normalize_dataframe(
        "customers",
        "cohorts",
        "cohort",
        additional_columns=["cohort_name"],
        make_time_index=True,
        new_dataframe_time_index="cohort_end",
    )

    es.add_relationships(
        [
            ("régions", "id", "customers", "région_id"),
            ("régions", "id", "stores", "région_id"),
            ("customers", "id", "sessions", "customer_id"),
            ("sessions", "id", "log", "session_id"),
            ("products", "id", "log", "product_id"),
        ]
    )

    return es
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage
    }

    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [
            pd.to_datetime('2019-01-10 03:53'),
            pd.to_datetime('2019-01-10 04:12'),
            pd.to_datetime('2019-02-03 10:34'),
            pd.to_datetime('2019-01-01 12:35'),
            pd.to_datetime('2019-01-01 12:49'),
            pd.to_datetime('2017-08-25 04:53')
        ]
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "time": Datetime,
        "amount": Double
    }

    pd_es.add_dataframe(dataframe_name="sessions", dataframe=sessions, index="id", time_index="time")
    dask_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_dask,
        index="id",
        time_index="time",
        logical_types=sessions_logical_types)

    pd_es.add_dataframe(dataframe_name="transactions", dataframe=transactions, index="id", time_index="time")
    dask_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_dask,
        index="id",
        time_index="time",
        logical_types=transactions_logical_types)

    pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id")
    dask_es = dask_es.add_relationship("sessions", "id", "transactions", "session_id")

    assert 'foreign_key' in pd_es['transactions'].ww.semantic_tags['session_id']
    assert 'foreign_key' in dask_es['transactions'].ww.semantic_tags['session_id']

    assert pd_es['sessions'].ww.metadata.get('last_time_index') is None
    assert dask_es['sessions'].ww.metadata.get('last_time_index') is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index')
    dask_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index')
    assert pd_lti_name == dask_lti_name
    pd.testing.assert_series_equal(
        pd_es['sessions'][pd_lti_name].sort_index(),
        dask_es['sessions'][dask_lti_name].compute().sort_index(),
        check_names=False)
def test_single_table_spark_entityset_cutoff_time_df():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]

    spark_es = EntitySet(id="spark_es")
    df = pd.DataFrame(
        {
            "id": [0, 1, 2],
            "values": [1, 12, -34],
            "dates": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk"],
        }
    )
    values_dd = ps.from_pandas(df)
    ltypes = {
        "values": IntegerNullable,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }
    spark_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    ids = [0, 1, 2, 0]
    times = [
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-15 04:00"),
    ]
    labels = [True, False, True, False]
    cutoff_times = pd.DataFrame(
        {"id": ids, "time": times, "labels": labels},
        columns=["id", "time", "labels"],
    )

    spark_fm, _ = ft.dfs(
        entityset=spark_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        cutoff_time=cutoff_times,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        cutoff_time=cutoff_times,
    )

    # Because row ordering with Spark is not guaranteed, we need to sort on two
    # columns to make sure that values for instance id 0 are compared correctly.
    # Also, make sure the index column has the same dtype.
    fm = fm.sort_values(["id", "labels"])
    spark_fm = spark_fm.to_pandas().astype({"id": "int64"})
    spark_fm = spark_fm.set_index("id").sort_values(["id", "labels"])
    for column in fm.columns:
        if fm[column].dtype.name == "category":
            fm[column] = fm[column].astype("Int64").astype("string")
    pd.testing.assert_frame_equal(fm.astype(spark_fm.dtypes), spark_fm, check_dtype=False)