def test_multiple_children_both_missing(self, entityset, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when a session is in neither child.

    An extra session row is added to ``sessions`` and ``wishlist_log`` is
    attached as a second child entity; the resulting last time index of
    ``sessions`` is compared value by value with ``true_sessions_lti``.
    """
    # Add a row to sessions so there is a session with no events.
    entityset['sessions'].update_data(extra_session_df)

    entityset.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
    )
    entityset.add_relationship(
        Relationship(entityset['sessions']['id'],
                     entityset['wishlist_log']['session_id'])
    )
    entityset.add_last_time_indexes()
    sessions = entityset['sessions']

    # wishlist has 2 newer events, and the extra session is expected to be NaT.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
    true_sessions_lti[6] = pd.NaT

    assert len(sessions.last_time_index) == 7
    actual_lti = sessions.last_time_index.sort_index()
    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
def test_add_relationship_errors_on_dtype_mismatch(es):
    """Adding a relationship across mismatched dtypes raises ValueError.

    A copy of the ``log`` dataframe is registered as ``log2`` and its
    ``session_id`` is linked to ``regions.id``; the error message is checked
    verbatim.
    """
    vt = variable_types
    log2_types = {
        'id': vt.Categorical,
        'session_id': vt.Id,
        'product_id': vt.Id,
        'datetime': vt.Datetime,
        'value': vt.Numeric,
        'value_2': vt.Numeric,
        'latlong': vt.LatLong,
        'latlong2': vt.LatLong,
        'value_many_nans': vt.Numeric,
        'priority_level': vt.Ordinal,
        'purchased': vt.Boolean,
        'comments': vt.Text,
    }
    es.entity_from_dataframe(
        entity_id='log2',
        dataframe=es['log'].df.copy(),
        index='id',
        variable_types=log2_types,
        time_index='datetime',
        encoding='utf-8',
    )

    expected_message = ("Unable to add relationship because id in "
                        "regions is Pandas dtype object and "
                        "session_id in log2 is Pandas dtype int64.")
    with pytest.raises(ValueError) as excinfo:
        es.add_relationship(
            Relationship(es['regions']['id'], es['log2']['session_id'])
        )

    # Checked after the context manager exits so the assertion actually runs.
    assert str(excinfo.value) == expected_message
def test_multiple_children_left_missing(self, entityset, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when a session is only in the right child.

    An extra session row is added to ``sessions`` and a matching row
    (``session_id`` 6) is added to the wishlist data, so that session has a
    child instance in ``wishlist_log``. The last time index of ``sessions``
    is then compared value by value with ``true_sessions_lti``.
    """
    sessions = entityset['sessions']

    # Add a row to sessions so not all session instances are in log.
    sessions.update_data(extra_session_df)

    # Add a row to the wishlist df so the new session instance is in wishlist_log.
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste',
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # like append it returns a new frame and keeps the existing index labels.
    df = pd.concat([wishlist_df, row])

    entityset.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=df,
        index='id',
        make_index=True,
        time_index='datetime',
    )
    relationship = Relationship(entityset['sessions']['id'],
                                entityset['wishlist_log']['session_id'])
    entityset.add_relationship(relationship)
    entityset.add_last_time_indexes()

    # Now wishlist_log has newer events for 3 session ids.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    assert len(sessions.last_time_index) == 7
    sorted_lti = sessions.last_time_index.sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children_all_combined(self, entityset, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when the children only cover all sessions together.

    An extra session is added to ``sessions``, a wishlist row is added for it
    (``session_id`` 6), and wishlist row 4 is dropped so ``wishlist_log`` has
    no instance for session id 3. The last time index of ``sessions`` is then
    compared value by value with ``true_sessions_lti``.
    """
    sessions = entityset['sessions']

    # Add a row to sessions so not all session instances are in log.
    sessions.update_data(extra_session_df)

    # Add a row to wishlist_log so the extra session has a child instance.
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste',
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # it returns a new frame, so the in-place drop below does not touch the
    # ``wishlist_df`` fixture.
    df = pd.concat([wishlist_df, row])

    # Drop instance 4 so wishlist_log does not have a session id 3 instance.
    df.drop(4, inplace=True)

    entityset.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=df,
        index='id',
        make_index=True,
        time_index='datetime',
    )
    relationship = Relationship(entityset['sessions']['id'],
                                entityset['wishlist_log']['session_id'])
    entityset.add_relationship(relationship)
    entityset.add_last_time_indexes()

    # wishlist has newer events for 2 sessions.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    assert len(sessions.last_time_index) == 7
    sorted_lti = sessions.last_time_index.sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children(self, es, wishlist_df, true_sessions_lti):
    """Check the sessions last time index when all instances are in both children.

    ``wishlist_log`` is attached as a second child of ``sessions`` (converted
    to a dask dataframe when the entityset's first entity is dask-backed) and
    the resulting last time index is compared with ``true_sessions_lti``.
    """
    if isinstance(es.entities[0].df, dd.DataFrame):
        wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)

    vt = ft.variable_types.variable
    wishlist_types = {
        'id': vt.Index,
        'session_id': vt.Numeric,
        'datetime': vt.DatetimeTimeIndex,
        'product_id': vt.Categorical,
    }
    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
        variable_types=wishlist_types,
    )
    es.add_relationship(
        Relationship(es['sessions']['id'], es['wishlist_log']['session_id'])
    )
    es.add_last_time_indexes()
    sessions = es['sessions']

    # The wishlist df has more recent events for two session ids.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")

    assert len(sessions.last_time_index) == 6
    actual_lti = sessions.last_time_index
    if isinstance(actual_lti, dd.Series):
        # Materialize the dask series before sorting and iterating.
        actual_lti = actual_lti.compute()
    actual_lti = actual_lti.sort_index()

    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
def test_add_relationship_errors_on_dtype_mismatch(es):
    """Adding a relationship between ``régions.id`` and ``log2.session_id`` raises ValueError.

    ``log2`` is registered from a copy of the ``log`` dataframe with explicit
    variable types before the relationship is attempted.
    """
    vt = variable_types
    log2_types = {
        'id': vt.Categorical,
        'session_id': vt.Id,
        'product_id': vt.Id,
        'datetime': vt.Datetime,
        'value': vt.Numeric,
        'value_2': vt.Numeric,
        'latlong': vt.LatLong,
        'latlong2': vt.LatLong,
        'value_many_nans': vt.Numeric,
        'priority_level': vt.Ordinal,
        'purchased': vt.Boolean,
        'comments': vt.Text,
    }
    es.entity_from_dataframe(
        entity_id='log2',
        dataframe=es['log'].df.copy(),
        index='id',
        variable_types=log2_types,
        time_index='datetime',
        encoding='utf-8',
    )

    with pytest.raises(ValueError):
        es.add_relationship(
            Relationship(es[u'régions']['id'], es['log2']['session_id'])
        )
def test_multiple_children_right_missing(self, es, wishlist_df, true_sessions_lti):
    """Check the sessions last time index when a session is only in the left child.

    Wishlist row 4 is dropped so session id 3 only has instances in ``log``.
    The wishlist data is converted to dask or koalas to match the entityset's
    first entity before ``wishlist_log`` is attached to ``sessions``.
    """
    sessions = es['sessions']

    # Drop the wishlist instance related to id 3 so it's only in log.
    wishlist_df.drop(4, inplace=True)

    first_entity_df = es.entities[0].df
    if isinstance(first_entity_df, dd.DataFrame):
        wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)
    if ks and isinstance(es.entities[0].df, ks.DataFrame):
        wishlist_df = ks.from_pandas(wishlist_df)

    vt = ft.variable_types.variable
    wishlist_types = {
        'id': vt.Index,
        'session_id': vt.Numeric,
        'datetime': vt.DatetimeTimeIndex,
        'product_id': vt.Categorical,
    }
    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
        variable_types=wishlist_types,
    )
    es.add_relationship(
        Relationship(es['sessions']['id'], es['wishlist_log']['session_id'])
    )
    es.add_last_time_indexes()

    # Now only session id 1 has a newer event in wishlist_log.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")

    assert len(sessions.last_time_index) == 6
    actual_lti = to_pandas(sessions.last_time_index).sort_index()
    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
    """Build an instance of ``cls`` from its serialized ``arguments``.

    Args:
        arguments: Mapping with the keys ``base_features``,
            ``relationship_path``, ``primitive``, ``use_previous``, ``where``
            and ``name``.
        entityset: Passed to ``Relationship.from_dictionary`` for each entry
            of the relationship path.
        dependencies: Mapping from feature name to feature, used to look up
            the base features and the optional ``where`` feature.
        primitives_deserializer: Object whose ``deserialize_primitive`` turns
            ``arguments['primitive']`` into a primitive.

    Returns:
        The result of calling ``cls`` with the reconstructed arguments.
    """
    base_features = [dependencies[name] for name in arguments['base_features']]

    relationships = [Relationship.from_dictionary(r, entityset)
                     for r in arguments['relationship_path']]
    # The parent entity is taken from the first relationship in the path.
    parent_entity = relationships[0].parent_entity
    relationship_path = RelationshipPath([(False, r) for r in relationships])

    primitive = primitives_deserializer.deserialize_primitive(arguments['primitive'])

    # Falsy serialized values are passed through unchanged.
    use_previous = arguments['use_previous']
    if use_previous:
        use_previous = Timedelta.from_dictionary(use_previous)

    where = arguments['where']
    if where:
        where = dependencies[where]

    return cls(
        base_features=base_features,
        parent_entity=parent_entity,
        primitive=primitive,
        relationship_path=relationship_path,
        use_previous=use_previous,
        where=where,
        name=arguments['name'],
    )
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
    """Build an instance of ``cls`` from its serialized ``arguments``.

    Args:
        arguments: Mapping with the keys ``base_feature``, ``relationship``
            and ``name``.
        entityset: Passed to ``Relationship.from_dictionary``.
        dependencies: Mapping from feature name to feature, used to look up
            the base feature.
        primitives_deserializer: Unused here.

    Returns:
        The result of calling ``cls`` with the base feature, the
        relationship and the relationship's child entity.
    """
    base_feature = dependencies[arguments['base_feature']]
    relationship = Relationship.from_dictionary(arguments['relationship'],
                                                entityset)
    return cls(
        base_feature=base_feature,
        child_entity=relationship.child_entity,
        relationship=relationship,
        name=arguments['name'],
    )
def test_last_time_index(self, entityset):
    """Check last time indexes of ``values``, ``customers`` and ``regions``.

    ``log`` is first normalized into a ``values`` entity. After the initial
    check, a ``promotions`` entity is attached to ``stores`` and the
    ``regions`` last time index is checked again.
    """
    es = entityset
    es.normalize_entity('log', 'values', 'value',
                        make_time_index=True,
                        new_entity_time_index="value_time",
                        convert_links_to_integers=True)
    es.add_last_time_indexes()

    assert es["values"].last_time_index is not None

    expected_values = pd.Series([
        datetime(2011, 4, 10, 10, 41, 0),
        datetime(2011, 4, 10, 10, 40, 1),
        datetime(2011, 4, 9, 10, 30, 12),
        datetime(2011, 4, 9, 10, 30, 18),
        datetime(2011, 4, 9, 10, 30, 24),
        datetime(2011, 4, 9, 10, 31, 9),
        datetime(2011, 4, 9, 10, 31, 18),
        datetime(2011, 4, 9, 10, 31, 27),
        datetime(2011, 4, 10, 10, 41, 3),
        datetime(2011, 4, 10, 10, 41, 6),
        datetime(2011, 4, 10, 11, 10, 3),
    ])
    expected_customers = pd.Series([
        datetime(2011, 4, 9, 10, 40, 0),
        datetime(2011, 4, 10, 10, 41, 6),
        datetime(2011, 4, 10, 11, 10, 3),
    ])
    region_series = pd.Series(
        {'United States': datetime(2011, 4, 10, 11, 10, 3)})

    values_lti = es["values"].last_time_index.sort_index()
    customers_lti = es["customers"].last_time_index.sort_index()
    regions_lti = es["regions"].last_time_index.sort_index()

    assert (values_lti == expected_values).all()
    assert (customers_lti == expected_customers).all()
    assert (regions_lti == region_series).all()

    # Add a promotions entity as a child of stores.
    promotion_start = datetime(2011, 4, 10, 11, 12, 6)
    promotions_df = pd.DataFrame({
        "start_date": [promotion_start],
        "store_id": [4],
        "product_id": ['coke zero'],
    })
    es.entity_from_dataframe(
        entity_id="promotions",
        dataframe=promotions_df,
        index='id',
        make_index=True,
        time_index='start_date',
    )
    es.add_relationship(
        Relationship(es['stores']['id'], es['promotions']['store_id'])
    )
    es.add_last_time_indexes()

    # The promotion time is now expected as the last time for 'Mexico'.
    region_series['Mexico'] = datetime(2011, 4, 10, 11, 12, 6)
    regions_lti = es["regions"].last_time_index.sort_index()
    assert (regions_lti == region_series.sort_index()).all()
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
    """Build an instance of ``cls`` from its serialized ``arguments``.

    Args:
        arguments: Mapping with the keys ``base_feature`` and
            ``relationship_path``.
        entityset: Passed to ``Relationship.from_dictionary`` for each entry
            of the relationship path.
        dependencies: Mapping from feature name to feature, used to look up
            the base feature.
        primitives_deserializer: Unused here.

    Returns:
        The result of calling ``cls`` with the base feature, the child entity
        of the first relationship, and the list of relationships.
    """
    base_feature = dependencies[arguments['base_feature']]

    relationship_path = []
    for serialized in arguments['relationship_path']:
        relationship_path.append(
            Relationship.from_dictionary(serialized, entityset))

    # The child entity is taken from the first relationship in the path.
    first_relationship = relationship_path[0]
    return cls(base_feature,
               first_relationship.child_entity,
               relationship_path=relationship_path)
def main():
    """Print the JSON form of a sample ``Holding`` and of a small entity set.

    The entity set has a ``parent`` entity and two children (``child`` and
    ``child_0``), each linked to ``parent`` through ``user_id``. It is wrapped
    in ``CustomEntity`` before being printed.
    """
    buy_price = Price('buy', 97.89)
    holding = Holding('HDFCBANK', buy_price, 5, 112.1, 95.4)
    print(holding.to_json())

    parent_df = pd.DataFrame({
        'id': [10, 14, 24, 34, 54, 64, 84],
        'age': [10, 14, 24, 34, 54, 64, 84],
        'salary': [1, 1, 10000, 25000, 50000, 60000, 80000],
    })
    es = EntitySet('user').entity_from_dataframe(entity_id='parent',
                                                 dataframe=parent_df)

    child_df = pd.DataFrame({
        'id': [10, 14, 24, 34, 54, 64, 84],
        'user_id': [10, 14, 24, 34, 54, 64, 84],
        'sex': [0, 1, 0, 1, 1, 0, 1],
    })
    es.entity_from_dataframe(entity_id='child', dataframe=child_df)
    es.add_relationship(
        Relationship(es['parent']['id'], es['child']['user_id']))

    child_0_df = pd.DataFrame({
        'id': [10, 14, 24, 34, 54, 64, 84],
        'user_id': [10, 14, 24, 34, 54, 64, 84],
        'diet': [0, 1, 0, 1, 1, 0, 1],
    })
    es.entity_from_dataframe(entity_id='child_0', dataframe=child_0_df)
    es.add_relationship(
        Relationship(es['parent']['id'], es['child_0']['user_id']))

    e_s = CustomEntity(entity_set=es)
    print(e_s.to_json())
def test_multiple_children_all_combined(self, es, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when the children only cover all sessions together.

    An extra session is added to ``sessions``, a wishlist row is added for it
    (``session_id`` 6), and wishlist row 4 is dropped so ``wishlist_log`` has
    no instance for session id 3. The wishlist data is converted to dask when
    the entityset's first entity is dask-backed. The last time index of
    ``sessions`` is then compared value by value with ``true_sessions_lti``.
    """
    sessions = es['sessions']

    # Add a row to sessions so not all session instances are in log.
    sessions.update_data(extra_session_df)

    # Add a row to wishlist_log so the extra session has a child instance.
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste',
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # it returns a new frame, so the in-place drop below does not touch the
    # ``wishlist_df`` fixture.
    df = pd.concat([wishlist_df, row])

    # Drop instance 4 so wishlist_log does not have a session id 3 instance.
    df.drop(4, inplace=True)

    if isinstance(es.entities[0].df, dd.DataFrame):
        df = dd.from_pandas(df, npartitions=2)

    variable_types = {
        'id': ft.variable_types.variable.Index,
        'session_id': ft.variable_types.variable.Numeric,
        'datetime': ft.variable_types.variable.DatetimeTimeIndex,
        'product_id': ft.variable_types.variable.Categorical,
    }
    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=df,
        index='id',
        make_index=True,
        time_index='datetime',
        variable_types=variable_types,
    )
    relationship = Relationship(es['sessions']['id'],
                                es['wishlist_log']['session_id'])
    es.add_relationship(relationship)
    es.add_last_time_indexes()

    # wishlist has newer events for 2 sessions.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    assert len(sessions.last_time_index) == 7
    lti = sessions.last_time_index
    if isinstance(lti, dd.Series):
        # Materialize the dask series before sorting and iterating.
        lti = lti.compute()
    sorted_lti = lti.sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children_left_missing(self, es, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when a session is only in the right child.

    An extra session row is added to ``sessions`` and a matching row
    (``session_id`` 6) is added to the wishlist data. The wishlist data is
    converted to dask or koalas to match the entityset's first entity before
    ``wishlist_log`` is attached to ``sessions``.
    """
    sessions = es['sessions']

    # Add a row to sessions so not all session instances are in log.
    sessions.update_data(extra_session_df)

    # Add a row to the wishlist df so the new session instance is in wishlist_log.
    row_values = {
        'session_id': 6,
        'datetime': pd.Timestamp("2011-04-11 11:11:11"),
        'product_id': 'toothpaste',
    }
    row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # like append it returns a new frame and keeps the existing index labels.
    df = pd.concat([wishlist_df, row])

    if isinstance(es.entities[0].df, dd.DataFrame):
        df = dd.from_pandas(df, npartitions=2)
    if ks and isinstance(es.entities[0].df, ks.DataFrame):
        df = ks.from_pandas(df)

    variable_types = {
        'id': ft.variable_types.variable.Index,
        'session_id': ft.variable_types.variable.Numeric,
        'datetime': ft.variable_types.variable.DatetimeTimeIndex,
        'product_id': ft.variable_types.variable.Categorical,
    }
    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=df,
        index='id',
        make_index=True,
        time_index='datetime',
        variable_types=variable_types,
    )
    relationship = Relationship(es['sessions']['id'],
                                es['wishlist_log']['session_id'])
    es.add_relationship(relationship)
    es.add_last_time_indexes()

    # Now wishlist_log has newer events for 3 session ids.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
    true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

    assert len(sessions.last_time_index) == 7
    sorted_lti = to_pandas(sessions.last_time_index).sort_index()
    for v1, v2 in zip(sorted_lti, true_sessions_lti):
        assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
def test_multiple_children(self, es, wishlist_df, true_sessions_lti):
    """Check the sessions last time index when all instances are in both children.

    ``wishlist_log`` is attached as a second child of ``sessions`` and the
    resulting last time index is compared with ``true_sessions_lti``.
    """
    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
    )
    es.add_relationship(
        Relationship(es['sessions']['id'], es['wishlist_log']['session_id'])
    )
    es.add_last_time_indexes()
    sessions = es['sessions']

    # The wishlist df has more recent events for two session ids.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")

    assert len(sessions.last_time_index) == 6
    actual_lti = sessions.last_time_index.sort_index()
    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
def test_multiple_children_both_missing(self, es, extra_session_df,
                                        wishlist_df, true_sessions_lti):
    """Check the sessions last time index when a session is in neither child.

    An extra session row is added to ``sessions`` and ``wishlist_log`` is
    attached as a second child entity (as a dask dataframe when the
    entityset's first entity is dask-backed). The resulting last time index
    is compared value by value with ``true_sessions_lti``.
    """
    sessions = es['sessions']

    if isinstance(es.entities[0].df, dd.DataFrame):
        wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)

    vt = ft.variable_types.variable
    wishlist_types = {
        'id': vt.Index,
        'session_id': vt.Numeric,
        'datetime': vt.DatetimeTimeIndex,
        'product_id': vt.Categorical,
    }

    # Add a row to sessions to create a session with no events.
    sessions.update_data(extra_session_df)

    es.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
        variable_types=wishlist_types,
    )
    es.add_relationship(
        Relationship(es['sessions']['id'], es['wishlist_log']['session_id'])
    )
    es.add_last_time_indexes()
    sessions = es['sessions']

    # wishlist has 2 newer events, and the extra session is expected to be NaT.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
    true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
    true_sessions_lti[6] = pd.NaT

    assert len(sessions.last_time_index) == 7
    actual_lti = sessions.last_time_index
    if isinstance(actual_lti, dd.Series):
        # Materialize the dask series before sorting and iterating.
        actual_lti = actual_lti.compute()
    actual_lti = actual_lti.sort_index()

    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
def test_multiple_children_right_missing(self, entityset, wishlist_df,
                                         true_sessions_lti):
    """Check the sessions last time index when a session is only in the left child.

    Wishlist row 4 is dropped so session id 3 only has instances in ``log``
    before ``wishlist_log`` is attached to ``sessions``.
    """
    sessions = entityset['sessions']

    # Drop the wishlist instance related to id 3 so it's only in log.
    wishlist_df.drop(4, inplace=True)

    entityset.entity_from_dataframe(
        entity_id="wishlist_log",
        dataframe=wishlist_df,
        index='id',
        make_index=True,
        time_index='datetime',
    )
    entityset.add_relationship(
        Relationship(entityset['sessions']['id'],
                     entityset['wishlist_log']['session_id'])
    )
    entityset.add_last_time_indexes()

    # Now only session id 1 has a newer event in wishlist_log.
    true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")

    assert len(sessions.last_time_index) == 6
    actual_lti = sessions.last_time_index.sort_index()
    for actual, expected in zip(actual_lti, true_sessions_lti):
        both_missing = pd.isnull(actual) and pd.isnull(expected)
        assert both_missing or actual == expected
# NOTE(review): the statements down to ``test_head_of_entity`` are the tail of
# a test whose ``def`` header is outside this view. They are kept in their
# original order; the only code change is ``06`` -> ``6``, because integer
# literals with a leading zero are a SyntaxError on Python 3.
assert (values_lti == pd.Series(times['values'])).all()
assert (customers_lti == pd.Series(times['customers'])).all()
assert (regions_lti == region_series).all()

# add promotions entity
promotions_df = pd.DataFrame({
    "start_date": [datetime(2011, 4, 10, 11, 12, 6)],
    "store_id": [4],
    "product_id": ['coke zero']
})
es.entity_from_dataframe(entity_id="promotions",
                         dataframe=promotions_df,
                         index='id',
                         make_index=True,
                         time_index='start_date')
relationship = Relationship(es['stores']['id'],
                            es['promotions']['store_id'])
es.add_relationship(relationship)
es.add_last_time_indexes()

region_series['Mexico'] = datetime(2011, 4, 10, 11, 12, 6)
regions_lti = es["regions"].last_time_index.sort_index()
assert (regions_lti == region_series.sort_index()).all()


def test_head_of_entity(entityset):
    """Check that ``head`` returns a DataFrame at each access level.

    Covers ``entityset.head``, ``entity.head`` and the ``head`` of a single
    variable, plus the shape of a five-row head of the ``log`` entity.
    """
    entity = entityset['log']
    assert isinstance(entityset.head('log', 3), pd.DataFrame)
    assert isinstance(entity.head(3), pd.DataFrame)
    assert isinstance(entity['product_id'].head(3), pd.DataFrame)
    assert entity.head(n=5).shape == (5, 9)
def test_add_parent_not_index_varible(es):
    """A relationship with ``regions.language`` as the parent variable raises AttributeError."""
    with pytest.raises(AttributeError):
        bad_relationship = Relationship(es['regions']['language'],
                                        es['customers']['region_id'])
        es.add_relationship(bad_relationship)
def test_add_parent_not_index_variable(es):
    """A relationship with ``régions.language`` as the parent variable raises AttributeError.

    The error message must match the pattern held in ``error_text``.
    """
    error_text = "Parent variable.*is not the index of entity Entity.*"
    with pytest.raises(AttributeError, match=error_text):
        bad_relationship = Relationship(es[u'régions']['language'],
                                        es['customers'][u'région_id'])
        es.add_relationship(bad_relationship)