def make_ecommerce_entityset(with_integer_time_index=False):
    """ Makes an entityset with the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log

    Every dataframe returned by ``make_ecommerce_dataframes`` is loaded as an
    entity indexed by ``'id'``; a ``cohorts`` entity is then normalized out
    of ``customers`` and the relationships between entities are added.

    Args:
        with_integer_time_index (bool): Forwarded to the dataframe,
            variable-type and time-index helpers. When truthy the entityset
            id also gets the ``"_int_time_index"`` suffix.

    Returns:
        EntitySet: the populated entityset.
    """
    frames = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    es_id = ('ecommerce' + "_int_time_index"
             if with_integer_time_index else 'ecommerce')

    vtypes_by_entity = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_index_specs = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for entity_id, frame in frames.items():
        # Entities without an entry in the time-index spec get no time index.
        spec = time_index_specs.get(entity_id, None)
        if spec is None:
            ti_name, secondary = None, None
        else:
            ti_name, secondary = spec['name'], spec['secondary']

        es.entity_from_dataframe(entity_id,
                                 frame,
                                 index='id',
                                 variable_types=vtypes_by_entity[entity_id],
                                 time_index=ti_name,
                                 secondary_time_index=secondary)

    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    # (parent entity, child entity, child column referencing the parent 'id')
    links = [
        (u'régions', 'customers', u'région_id'),
        (u'régions', 'stores', u'région_id'),
        ('customers', 'sessions', 'customer_id'),
        ('sessions', 'log', 'session_id'),
        ('products', 'log', 'product_id'),
    ]
    es.add_relationships([
        Relationship(es[parent]['id'], es[child][child_column])
        for parent, child, child_column in links
    ])

    return es
def test_sets_time_when_adding_entity():
    """Check how an EntitySet's ``time_type`` reacts to added entities.

    The test asserts that ``time_type`` is unset on an empty entityset,
    equals ``NumericTimeIndex`` after adding an entity with an integer time
    index, is unchanged by ``normalize_entity``, and that adding an entity
    with a datetime or string time index raises ``TypeError`` with the
    expected message.
    """
    transactions = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    signup_dates = [
        datetime(2002, 5, 1),
        datetime(2006, 3, 20),
        datetime(2011, 11, 11),
    ]
    dated_accounts = pd.DataFrame({"id": [3, 4, 5],
                                   "signup_date": signup_dates})
    text_accounts = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # A freshly created entityset has no time type yet.
    fraud_es = EntitySet("fraud")
    assert getattr(fraud_es, "time_type", None) is None

    # The first time-indexed entity determines the time type.
    fraud_es.entity_from_dataframe("transactions",
                                   transactions,
                                   index="id",
                                   time_index="transaction_time")
    expected_type = variable_types.NumericTimeIndex
    assert fraud_es.time_type == expected_type

    # Normalizing out a new entity must leave the time type alone.
    fraud_es.normalize_entity("transactions",
                              "cards",
                              "card_id",
                              make_time_index=True)
    assert fraud_es.time_type == expected_type

    # A datetime time index conflicts with the numeric one already set.
    mismatch_msg = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=mismatch_msg):
        fraud_es.entity_from_dataframe("accounts",
                                       dated_accounts,
                                       index="id",
                                       time_index="signup_date")

    # A column of plain strings cannot serve as a time index.
    not_numeric_msg = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=not_numeric_msg):
        fraud_es.entity_from_dataframe("accounts",
                                       text_accounts,
                                       index="id",
                                       time_index="signup_date")
def test_sets_time_when_adding_entity(self):
    """Check how an EntitySet's ``time_type`` reacts to added entities.

    Asserts that ``time_type`` is unset on an empty entityset, becomes
    ``NumericTimeIndex`` once an entity with an integer time index is added,
    survives ``normalize_entity`` unchanged, and that adding an entity whose
    time index is datetime or string data raises ``TypeError``.
    """
    transactions = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    dated_accounts = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [
            datetime(2002, 5, 1),
            datetime(2006, 3, 20),
            datetime(2011, 11, 11),
        ],
    })
    text_accounts = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # Nothing has been added yet, so no time type is recorded.
    fraud_es = EntitySet("fraud")
    assert getattr(fraud_es, "time_type", None) is None

    # Integer time index -> numeric time type.
    fraud_es.entity_from_dataframe("transactions",
                                   transactions,
                                   index="id",
                                   time_index="transaction_time")
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # Splitting out "cards" must not alter the time type.
    fraud_es.normalize_entity("transactions",
                              "cards",
                              "card_id",
                              make_time_index=True)
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # Both a datetime and a string time index are rejected.
    for rejected_accounts in (dated_accounts, text_accounts):
        with pytest.raises(TypeError):
            fraud_es.entity_from_dataframe("accounts",
                                           rejected_accounts,
                                           index="id",
                                           time_index="signup_date")
def test_sets_time_when_adding_entity():
    """Verify that the entityset time type is set once and then enforced.

    Steps exercised:
      1. an empty entityset exposes no ``time_type``;
      2. adding an entity with an integer time index sets it to
         ``NumericTimeIndex``;
      3. ``normalize_entity`` keeps it unchanged;
      4. adding an entity with a datetime time index raises ``TypeError``;
      5. adding an entity with a string time index raises ``TypeError``.
    """
    account_ids = [3, 4, 5]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": account_ids,
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": account_ids,
        "signup_date": ["element", "exporting", "editable"],
    })

    # Step 1: start from an empty entityset.
    entityset = EntitySet("fraud")
    assert getattr(entityset, "time_type", None) is None

    # Step 2: the first time index decides the entityset-wide time type.
    entityset.entity_from_dataframe(
        "transactions",
        transactions_df,
        index="id",
        time_index="transaction_time",
    )
    assert entityset.time_type == variable_types.NumericTimeIndex

    # Step 3: a derived entity does not change it.
    entityset.normalize_entity(
        "transactions",
        "cards",
        "card_id",
        make_time_index=True,
    )
    assert entityset.time_type == variable_types.NumericTimeIndex

    # Step 4: datetime time index clashes with the numeric one.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=error_text):
        entityset.entity_from_dataframe(
            "accounts",
            accounts_df,
            index="id",
            time_index="signup_date",
        )

    # Step 5: strings are not a usable time index.
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=error_text):
        entityset.entity_from_dataframe(
            "accounts",
            accounts_df_string,
            index="id",
            time_index="signup_date",
        )
def test_make_time_index_keeps_original_sorting():
    """Check that normalizing with ``make_time_index`` keeps the row order.

    A ``trips`` entity is built from 1000 rows that share one flight time and
    whose ids are supplied in descending order. The test asserts that the
    entity's ``trip_id`` column reads 0..999 after loading, and still reads
    0..999 after a ``flights`` entity is normalized out of it.
    """
    n_trips = 1000
    n_first_flight = 350

    trips = {
        # ids handed over in reverse: 999, 998, ..., 0
        'trip_id': list(range(n_trips - 1, -1, -1)),
        # every trip carries the same timestamp
        'flight_time': [datetime(1997, 4, 1)] * n_trips,
        'flight_id': [1] * n_first_flight + [2] * (n_trips - n_first_flight),
    }
    expected_order = list(range(n_trips))

    trips_frame = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=trips_frame,
                             index="trip_id",
                             time_index='flight_time')
    assert (es['trips'].df['trip_id'] == expected_order).all()

    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    # The base entity must come out of normalization in the same order.
    assert (es['trips'].df['trip_id'] == expected_order).all()
def test_sets_time_when_adding_entity(self):
    """Verify that the entityset time type is set once and then enforced.

    The test asserts that an empty entityset exposes no ``time_type``, that
    adding an entity with an integer time index sets it to
    ``NumericTimeIndex``, that ``normalize_entity`` leaves it unchanged, and
    that entities with a datetime or a string time index are refused with
    ``TypeError``.
    """
    account_ids = [3, 4, 5]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": account_ids,
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": account_ids,
        "signup_date": ["element", "exporting", "editable"],
    })

    entityset = EntitySet("fraud")
    # No entity added yet: the attribute is absent or None.
    assert getattr(entityset, "time_type", None) is None

    entityset.entity_from_dataframe(
        "transactions",
        transactions_df,
        index="id",
        time_index="transaction_time",
    )
    # The integer time index makes the entityset numeric-timed.
    assert entityset.time_type == variable_types.NumericTimeIndex

    entityset.normalize_entity(
        "transactions",
        "cards",
        "card_id",
        make_time_index=True,
    )
    # Creating "cards" keeps the time type as it was.
    assert entityset.time_type == variable_types.NumericTimeIndex

    # Datetime time index: wrong time type for this entityset.
    with pytest.raises(TypeError):
        entityset.entity_from_dataframe(
            "accounts",
            accounts_df,
            index="id",
            time_index="signup_date",
        )

    # String time index: not a time type at all.
    with pytest.raises(TypeError):
        entityset.entity_from_dataframe(
            "accounts",
            accounts_df_string,
            index="id",
            time_index="signup_date",
        )
def test_operations_invalidate_metadata(es):
    """Check that mutating an EntitySet clears its cached metadata.

    After each operation (adding entities, adding a relationship,
    normalizing an entity, adding last time indexes and, when a pandas-backed
    entity exists, adding interesting values) the test asserts that
    ``_data_description`` is ``None`` and that reading ``metadata`` fills it
    in again.

    Args:
        es: source entityset (a fixture) whose ``customers`` and ``sessions``
            entities are copied into a new entityset.
    """

    def assert_metadata_regenerates(entityset):
        # The cache must be empty, then be filled by accessing `metadata`.
        assert entityset._data_description is None
        assert entityset.metadata is not None  # generated after access
        assert entityset._data_description is not None

    new_es = EntitySet(id="test")
    # Even a brand-new entityset builds its metadata lazily on access.
    assert_metadata_regenerates(new_es)

    # Explicit variable types are only passed for non-pandas dataframes.
    if isinstance(es['customers'].df, pd.DataFrame):
        customers_vtypes = None
    else:
        customers_vtypes = es["customers"].variable_types
        customers_vtypes['signup_date'] = variable_types.Datetime
    new_es.entity_from_dataframe("customers",
                                 es["customers"].df,
                                 index=es["customers"].index,
                                 variable_types=customers_vtypes)

    if isinstance(es['sessions'].df, pd.DataFrame):
        sessions_vtypes = None
    else:
        sessions_vtypes = es["sessions"].variable_types
    new_es.entity_from_dataframe("sessions",
                                 es["sessions"].df,
                                 index=es["sessions"].index,
                                 variable_types=sessions_vtypes)
    assert_metadata_regenerates(new_es)

    customer_sessions = Relationship(new_es["customers"]["id"],
                                     new_es["sessions"]["customer_id"])
    new_es = new_es.add_relationship(customer_sessions)
    assert_metadata_regenerates(new_es)

    new_es = new_es.normalize_entity("customers", "cohort", "cohort")
    assert_metadata_regenerates(new_es)

    new_es.add_last_time_indexes()
    assert_metadata_regenerates(new_es)

    # automatically adding interesting values not supported in Dask or Koalas
    has_pandas_entity = any(isinstance(entity.df, pd.DataFrame)
                            for entity in new_es.entities)
    if has_pandas_entity:
        new_es.add_interesting_values()
        assert_metadata_regenerates(new_es)