def test_handles_datetime_format():
    """Datetime columns parse per an explicit format string; without one, an
    ambiguous day-first date is misread by pandas."""
    fmt = "%d-%m-%Y"
    actual = pd.Timestamp('Jan 2, 2011')
    time_strs = [actual.strftime(fmt)] * 3
    df = pd.DataFrame({'id': [0, 1, 2],
                       'time_format': time_strs,
                       'time_no_format': time_strs})
    vtypes = {'id': variable_types.Categorical,
              'time_format': (variable_types.Datetime, {"format": fmt}),
              'time_no_format': variable_types.Datetime}
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             variable_types=vtypes, dataframe=df)
    col_format = es['test_entity'].df['time_format']
    col_no_format = es['test_entity'].df['time_no_format']
    # without the format string pandas parses the ambiguous date wrong
    assert (col_no_format != actual).all()
    # with the format string we correctly get Jan 2
    assert (col_format == actual).all()
def test_converts_variable_type_after_init():
    """convert_variable_type updates both the variable metadata and the
    underlying pandas dtype after the entity has been created."""
    df = pd.DataFrame({'id': [0, 1, 2],
                       'category': ['a', 'b', 'a'],
                       'ints': ['1', '2', '1']})
    df["category"] = df["category"].astype("category")
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             dataframe=df)
    entity = es['test_entity']
    data = es['test_entity'].df
    # string -> numeric
    entity.convert_variable_type('ints', variable_types.Numeric)
    assert isinstance(entity['ints'], variable_types.Numeric)
    assert data['ints'].dtype.name in variable_types.PandasTypes._pandas_numerics
    # numeric -> categorical
    entity.convert_variable_type('ints', variable_types.Categorical)
    assert isinstance(entity['ints'], variable_types.Categorical)
    # categorical -> ordinal
    entity.convert_variable_type('ints', variable_types.Ordinal)
    assert isinstance(entity['ints'], variable_types.Ordinal)
    # ordinal -> boolean, mapping explicit true/false values
    entity.convert_variable_type('ints', variable_types.Boolean,
                                 true_val=1, false_val=2)
    assert isinstance(entity['ints'], variable_types.Boolean)
    assert data['ints'].dtype.name == 'bool'
def test_check_variables_and_dataframe():
    """Variable types supplied at load time are stored on the entity."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical}
    es = EntitySet(id='test')
    es.entity_from_dataframe('test_entity', df, index='id',
                             variable_types=vtypes)
    stored = es.entity_dict['test_entity'].variable_types['category']
    assert stored == variable_types.Categorical
def test_bad_time_index_variable():
    """Naming a time index that is absent from the dataframe raises LookupError."""
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    error_text = "Time index not found in dataframe"
    with pytest.raises(LookupError, match=error_text):
        es = EntitySet(id='test')
        es.entity_from_dataframe(entity_id='test_entity', index="id",
                                 dataframe=df, time_index='time')
def test_none_index():
    """When no index is supplied, the first column is promoted to Index."""
    df = pd.DataFrame({'category': [1, 2, 3], 'category2': ['1', '2', '3']})
    vtypes = {'category': variable_types.Categorical,
              'category2': variable_types.Categorical}
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', dataframe=df,
                             variable_types=vtypes)
    assert es['test_entity'].index == 'category'
    assert isinstance(es['test_entity']['category'], variable_types.Index)
def test_handles_datetime_mismatch():
    """Arbitrary strings cannot be coerced to a Datetime time index."""
    df = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}
    with pytest.raises(ValueError):
        es = EntitySet(id='test')
        es.entity_from_dataframe('test_entity', df, 'id',
                                 time_index='time', variable_types=vtypes)
def test_unknown_index():
    """An index column missing from the dataframe is created as 0..n-1."""
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    vtypes = {'category': variable_types.Categorical}
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             variable_types=vtypes, dataframe=df)
    assert es['test_entity'].index == 'id'
    assert es['test_entity'].df['id'].tolist() == list(range(3))
def test_doesnt_remake_index():
    """make_index=True with an already-present index column raises RuntimeError."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    error_text = "Cannot make index: index variable already present"
    with pytest.raises(RuntimeError, match=error_text):
        es = EntitySet(id='test')
        es.entity_from_dataframe(entity_id='test_entity', index='id',
                                 make_index=True, dataframe=df)
def test_datetime64_conversion():
    """A tz-aware datetime64 column can be converted to DatetimeTimeIndex."""
    df = pd.DataFrame({'id': [0, 1, 2], 'ints': ['1', '2', '1']})
    df["time"] = pd.Timestamp.now()
    df["time"] = df["time"].astype("datetime64[ns, UTC]")
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             dataframe=df)
    vtype_time_index = variable_types.variable.DatetimeTimeIndex
    # should not raise despite the timezone-aware dtype
    es['test_entity'].convert_variable_type('time', vtype_time_index)
def test_make_index_variable_ordering():
    """A generated index column is placed first in the entity dataframe."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical}
    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id1',
                             make_index=True, variable_types=vtypes,
                             dataframe=df)
    assert es.entity_dict['test_entity'].df.columns[0] == 'id1'
def test_extra_variable_type():
    """A variable_types entry for a column not in the dataframe raises LookupError."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical,
              'category2': variable_types.Categorical}
    with pytest.raises(LookupError):
        es = EntitySet(id='test')
        es.entity_from_dataframe(entity_id='test_entity', index='id',
                                 variable_types=vtypes, dataframe=df)
def test_handles_datetime_mismatch():
    """Non-date strings used as a Datetime time index raise ValueError."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    types = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime,
    }
    with pytest.raises(ValueError):
        es = EntitySet(id='test')
        es.entity_from_dataframe('test_entity', frame, 'id',
                                 time_index='time', variable_types=types)
def test_bad_index_variables(self):
    """A time_index absent from the dataframe raises LookupError."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    types = {
        'id': variable_types.Categorical,
        'category': variable_types.Categorical,
    }
    with pytest.raises(LookupError):
        es = EntitySet(id='test')
        es.entity_from_dataframe(entity_id='test_entity', index='id',
                                 variable_types=types, dataframe=frame,
                                 time_index='time')
def test_single_table_ks_entityset():
    """DFS on a Koalas-backed entityset matches the pandas result."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    ks_df = ks.from_pandas(df)
    vtypes = {"id": ft.variable_types.Id,
              "values": ft.variable_types.Numeric,
              "dates": ft.variable_types.Datetime,
              "strings": ft.variable_types.NaturalLanguage}
    ks_es.entity_from_dataframe(entity_id="data", dataframe=ks_df,
                                index="id", variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, target_entity="data",
                      trans_primitives=primitives_list)
    # same feature matrix computed against plain pandas
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data", dataframe=df, index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es, target_entity="data",
                   trans_primitives=primitives_list)
    # align rows and columns before comparing
    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index][fm.columns]
    # NUM_WORDS(strings) is int32 in koalas for some reason
    pd.testing.assert_frame_equal(fm, ks_computed_fm, check_dtype=False)
def test_single_table_dask_entityset():
    """DFS on a Dask-backed entityset matches the pandas result."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    dask_df = dd.from_pandas(df, npartitions=2)
    vtypes = {"id": ft.variable_types.Id,
              "values": ft.variable_types.Numeric,
              "dates": ft.variable_types.Datetime,
              "strings": ft.variable_types.NaturalLanguage}
    dask_es.entity_from_dataframe(entity_id="data", dataframe=dask_df,
                                  index="id", variable_types=vtypes)
    dask_fm, _ = ft.dfs(entityset=dask_es, target_entity="data",
                        trans_primitives=primitives_list)
    # same feature matrix computed against plain pandas
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data", dataframe=df, index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es, target_entity="data",
                   trans_primitives=primitives_list)
    # Use the same columns and make sure both indexes are sorted the same
    dask_computed_fm = dask_fm.compute().set_index('id').loc[fm.index][fm.columns]
    pd.testing.assert_frame_equal(fm, dask_computed_fm)
def test_sets_time_when_adding_entity():
    """The entityset's time_type is set by the first time index and enforced
    on later entities."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })
    # create empty entityset — time_type starts unset
    es = EntitySet("fraud")
    assert getattr(es, "time_type", None) is None
    # first entity with a numeric time index fixes time_type
    es.entity_from_dataframe("transactions", transactions_df, index="id",
                             time_index="transaction_time")
    assert es.time_type == variable_types.NumericTimeIndex
    # normalizing keeps time_type unchanged
    es.normalize_entity("transactions", "cards", "card_id",
                        make_time_index=True)
    assert es.time_type == variable_types.NumericTimeIndex
    # a datetime time index conflicts with the numeric time_type
    with pytest.raises(TypeError):
        es.entity_from_dataframe("accounts", accounts_df, index="id",
                                 time_index="signup_date")
    # a non-time column cannot be a time index at all
    with pytest.raises(TypeError):
        es.entity_from_dataframe("accounts", accounts_df_string, index="id",
                                 time_index="signup_date")
def test_passing_strings_to_variable_types_entity_init():
    """Entity accepts string names for variable types and warns on unknown ones."""
    type_map = find_variable_types()
    # map each type's string form back to its registry key
    reversed_variable_types = {str(v): k for k, v in type_map.items()}
    reversed_variable_types['unknown variable'] = 'some unknown type string'
    es = EntitySet()
    dataframe = pd.DataFrame(columns=list(reversed_variable_types))
    warning_text = (
        'Variable type {} was unrecognized, Unknown variable type was used instead'
        .format('some unknown type string'))
    with pytest.warns(UserWarning, match=warning_text):
        entity = Entity(
            'reversed_variable_types',
            dataframe,
            es,
            variable_types=reversed_variable_types,
            index="<class 'featuretools.variable_types.variable.Index'>",
            time_index="<class 'featuretools.variable_types.variable.NumericTimeIndex'>",
        )
    # the unrecognized entry should have been replaced by Unknown
    reversed_variable_types["unknown variable"] = "unknown"
    for variable in entity.variables:
        assert variable.__class__.type_string == reversed_variable_types[variable.id]
def test_extra_variable_type():
    """An extra variable_types entry raises LookupError with a clear message."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    types = {
        'id': variable_types.Categorical,
        'category': variable_types.Categorical,
        'category2': variable_types.Categorical,
    }
    error_text = "Variable ID category2 not in DataFrame"
    with pytest.raises(LookupError, match=error_text):
        es = EntitySet(id='test')
        es.entity_from_dataframe(entity_id='test_entity', index='id',
                                 variable_types=types, dataframe=frame)
def test_all_variable_descriptions():
    """Every variable type round-trips through its data description."""
    es = EntitySet()
    dataframe = pd.DataFrame(columns=list(serialize.VARIABLE_TYPES))
    es.entity_from_dataframe(
        'variable_types',
        dataframe,
        index='index',
        time_index='datetime_time_index',
        variable_types=serialize.VARIABLE_TYPES,
    )
    entity = es['variable_types']
    for variable in entity.variables:
        description = variable.to_data_description()
        rebuilt = deserialize.description_to_variable(description,
                                                      entity=entity)
        assert variable.__eq__(rebuilt)
def test_add_dataframe_from_spark_df(pd_es):
    """A Spark dataframe added with the pandas log table's typing round-trips."""
    cleaned_df = pd_to_spark_clean(pd_es["log"])
    log_spark = ps.from_pandas(cleaned_df)
    spark_es = EntitySet(id="spark_es")
    spark_es = spark_es.add_dataframe(
        dataframe_name="log_spark",
        dataframe=log_spark,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]),
    )
    # column order may differ, so compare with check_like
    pd.testing.assert_frame_equal(
        cleaned_df, spark_es["log_spark"].to_pandas(), check_like=True
    )
def test_add_dataframe_with_make_index():
    """make_index on a Spark dataframe appends a 0..n-1 index column."""
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    spark_df = ps.from_pandas(df)
    spark_es = EntitySet(id="spark_es")
    ltypes = {"values": "Integer"}
    spark_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=spark_df,
        make_index=True,
        index="new_index",
        logical_types=ltypes,
    )
    expected_df = pd.DataFrame({"values": values,
                                "new_index": range(len(values))})
    pd.testing.assert_frame_equal(expected_df,
                                  spark_es["new_dataframe"].to_pandas())
def test_sort_time_id():
    """Rows are sorted by the time index when an entity is loaded."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        # deliberately reverse-chronological input
        "transaction_time": pd.date_range(start="10:00", periods=6,
                                          freq="10s")[::-1],
    })
    es = EntitySet("test",
                   entities={"t": (transactions_df, "id", "transaction_time")})
    times = es["t"].df.transaction_time.tolist()
    assert times == sorted(transactions_df.transaction_time.tolist())
def test_single_table_dask_entityset_with_instance_ids():
    """DFS with instance_ids on a Dask entityset matches pandas."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    instance_ids = [0, 1, 3]
    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    dask_df = dd.from_pandas(df, npartitions=2)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=dask_df,
        index="id",
        logical_types=ltypes)
    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        instance_ids=instance_ids)
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        logical_types=ltypes)
    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)
    # Make sure both indexes are sorted the same
    pd.testing.assert_frame_equal(
        fm, dask_fm.compute().set_index('id').loc[fm.index])
def test_already_sorted_parameter(self):
    """already_sorted=True preserves the original (unsorted) row order."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "transaction_time": [datetime(2014, 4, 6), datetime(2012, 4, 8),
                             datetime(2012, 4, 8), datetime(2013, 4, 8),
                             datetime(2015, 4, 8), datetime(2016, 4, 9)],
    })
    es = EntitySet(id='test')
    es.entity_from_dataframe('t', transactions_df, index='id',
                             time_index="transaction_time",
                             already_sorted=True)
    times = es["t"].df.transaction_time.tolist()
    assert times == transactions_df.transaction_time.tolist()
def test_single_table_ks_entityset_with_instance_ids():
    """DFS with instance_ids on a Koalas entityset matches pandas."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    instance_ids = [0, 1, 3]
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    ks_df = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
        "strings": ft.variable_types.NaturalLanguage,
    }
    ks_es.entity_from_dataframe(entity_id="data", dataframe=ks_df,
                                index="id", variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, target_entity="data",
                      trans_primitives=primitives_list,
                      instance_ids=instance_ids)
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data", dataframe=df, index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es, target_entity="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)
    # Make sure both indexes are sorted the same
    pd.testing.assert_frame_equal(
        fm, ks_fm.to_pandas().set_index('id').loc[fm.index],
        check_dtype=False)
def test_converts_datetime():
    """Date strings convert to pd.Timestamp when typed as Datetime.

    Without explicit vtypes the entityset would infer the time column as
    numeric, so they must be provided here.
    """
    times = pd.date_range('1/1/2011', periods=3, freq='H')
    time_strs = times.strftime('%Y-%m-%d')
    df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}
    es = EntitySet(id='test')
    es._import_from_dataframe(entity_id='test_entity', index='id',
                              time_index="time", variable_types=vtypes,
                              dataframe=df)
    pd_col = es['test_entity'].df['time']
    assert type(pd_col[0]) == pd.Timestamp
def test_entity_descriptions(es):
    """Entities round-trip through serialize/deserialize descriptions."""
    _es = EntitySet(es.id)
    for entity in es.metadata.entities:
        description = serialize.entity_to_description(entity)
        deserialize.description_to_entity(description, _es)
        _entity = _es[description['id']]
        # last_time_index is not part of the description, so copy it over
        _entity.last_time_index = entity.last_time_index
        assert entity.__eq__(_entity, deep=True)
def test_single_table_ks_entityset_ids_not_sorted():
    """Unsorted ids in a Koalas entityset still match the pandas DFS result."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day',
                       'num_characters', 'num_words']
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [2, 0, 1, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    ks_df = ks.from_pandas(df)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }
    ks_es.add_dataframe(dataframe_name="data", dataframe=ks_df,
                        index="id", logical_types=ltypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, target_dataframe_name="data",
                      trans_primitives=primitives_list)
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data", dataframe=df,
                        index="id", logical_types=ltypes)
    fm, _ = ft.dfs(entityset=pd_es, target_dataframe_name="data",
                   trans_primitives=primitives_list)
    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    # Koalas dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing
    pd.testing.assert_frame_equal(fm.astype(ks_computed_fm.dtypes),
                                  ks_computed_fm)
def test_converts_datetime(self):
    """Date strings convert to pd.Timestamp when typed as Datetime.

    Without explicit vtypes the entityset would infer the time column as
    numeric, so they must be provided here.
    """
    times = pd.date_range('1/1/2011', periods=3, freq='H')
    time_strs = times.strftime('%Y-%m-%d')
    df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}
    es = EntitySet(id='test')
    es._import_from_dataframe(entity_id='test_entity', index='id',
                              time_index="time", variable_types=vtypes,
                              dataframe=df)
    pd_col = es.get_column_data('test_entity', 'time')
    assert type(pd_col[0]) == pd.Timestamp
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the ecommerce test entityset.

    Shape::

          R         Regions
         / \\
        S   C       Stores, Customers
            |
        S   P       Sessions, Products
         \\ /
          L         Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    entities = dataframes.keys()
    es_id = 'ecommerce'
    if with_integer_time_index:
        es_id += "_int_time_index"
    variable_types = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)
    es = EntitySet(id=es_id)
    for entity in entities:
        time_index = time_indexes.get(entity, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        df = dataframes[entity]
        es.entity_from_dataframe(entity, df, index='id',
                                 variable_types=variable_types[entity],
                                 time_index=ti_name,
                                 secondary_time_index=secondary)
    es.normalize_entity('customers', 'cohorts', 'cohort',
                        additional_variables=['cohort_name'],
                        make_time_index=True,
                        new_entity_time_index='cohort_end')
    es.add_relationships([
        Relationship(es[u'régions']['id'], es['customers'][u'région_id']),
        Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id']),
    ])
    return es
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the ecommerce test entityset (Woodwork dataframe API).

    Shape::

          R         Regions
         / \\
        S   C       Stores, Customers
            |
        S   P       Sessions, Products
         \\ /
          L         Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    dataframe_names = dataframes.keys()
    es_id = 'ecommerce'
    if with_integer_time_index:
        es_id += "_int_time_index"
    logical_types = make_logical_types(
        with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)
    es = EntitySet(id=es_id)
    for df_name in dataframe_names:
        time_index = time_indexes.get(df_name, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        df = dataframes[df_name]
        es.add_dataframe(df, dataframe_name=df_name, index='id',
                         logical_types=logical_types[df_name],
                         semantic_tags=semantic_tags[df_name],
                         time_index=ti_name,
                         secondary_time_index=secondary)
    es.normalize_dataframe('customers', 'cohorts', 'cohort',
                           additional_columns=['cohort_name'],
                           make_time_index=True,
                           new_dataframe_time_index='cohort_end')
    es.add_relationships([
        (u'régions', 'id', 'customers', u'région_id'),
        (u'régions', 'id', 'stores', u'région_id'),
        ('customers', 'id', 'sessions', 'customer_id'),
        ('sessions', 'id', 'log', 'session_id'),
        ('products', 'id', 'log', 'product_id'),
    ])
    return es
def test_custom_variable_descriptions():
    """A user-defined variable type round-trips through its description."""
    class ItemList(Categorical):
        type_string = "item_list"
        _default_pandas_dtype = list

    es = EntitySet()
    variables = {'item_list': ItemList,
                 'time_index': TimeIndex,
                 'index': Index}
    dataframe = pd.DataFrame(columns=list(variables))
    es.entity_from_dataframe(
        'custom_variable', dataframe, index='index',
        time_index='time_index', variable_types=variables)
    entity = es['custom_variable']
    for variable in entity.variables:
        description = variable.to_data_description()
        rebuilt = deserialize.description_to_variable(description,
                                                      entity=entity)
        assert variable.__eq__(rebuilt)
def datetime_es():
    """Fixture: two-table entityset with a datetime time index."""
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": pd.to_datetime([
            '2011-2-28 04:00', '2012-2-28 05:00', '2012-2-29 06:00',
            '2012-3-1 08:00', '2014-4-1 10:00']),
        "fraud": [True, False, False, False, True],
    })
    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.entity_from_dataframe(
        entity_id="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time")
    datetime_es = datetime_es.entity_from_dataframe(
        entity_id="cards", dataframe=cards_df, index="id")
    relationship = Relationship(datetime_es["cards"]["id"],
                                datetime_es["transactions"]["card_id"])
    datetime_es = datetime_es.add_relationship(relationship)
    datetime_es.add_last_time_indexes()
    return datetime_es
def test_single_table_ks_entityset_dates_not_sorted():
    """An unsorted datetime time index in Koalas still matches pandas DFS."""
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime('2019-01-10'),
                  pd.to_datetime('2019-02-03'),
                  pd.to_datetime('2019-01-01'),
                  pd.to_datetime('2017-08-25')],
    })
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']
    ks_df = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
    }
    ks_es.entity_from_dataframe(entity_id="data", dataframe=ks_df,
                                index="id", time_index="dates",
                                variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, target_entity="data",
                      trans_primitives=primitives_list, max_depth=1)
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(entity_id="data", dataframe=df,
                                index="id", time_index="dates")
    fm, _ = ft.dfs(entityset=pd_es, target_entity="data",
                   trans_primitives=primitives_list, max_depth=1)
    pd.testing.assert_frame_equal(
        fm, ks_fm.to_pandas().set_index('id').loc[fm.index])
def test_sets_time_when_adding_entity():
    """time_type is set by the first entity and mismatches raise TypeError
    with specific messages."""
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })
    # create empty entityset — time_type starts unset
    es = EntitySet("fraud")
    assert getattr(es, "time_type", None) is None
    # first entity with a numeric time index fixes time_type
    es.entity_from_dataframe("transactions", transactions_df, index="id",
                             time_index="transaction_time")
    assert es.time_type == variable_types.NumericTimeIndex
    # normalizing keeps time_type unchanged
    es.normalize_entity("transactions", "cards", "card_id",
                        make_time_index=True)
    assert es.time_type == variable_types.NumericTimeIndex
    # a datetime time index conflicts with the numeric time_type
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=error_text):
        es.entity_from_dataframe("accounts", accounts_df, index="id",
                                 time_index="signup_date")
    # a non-time column cannot be a time index at all
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=error_text):
        es.entity_from_dataframe("accounts", accounts_df_string, index="id",
                                 time_index="signup_date")
def test_calculates_statistics_on_init(self):
    """Variable statistics (count, nunique, percent_unique, true/false
    counts) are computed when the entity is created."""
    df = pd.DataFrame({
        'id': [0, 1, 2],
        'time': [datetime(2011, 4, 9, 10, 31, 3 * i) for i in range(3)],
        'category': ['a', 'b', 'a'],
        'number': [4, 5, 6],
        'boolean': [True, False, True],
        'boolean_with_nan': [True, False, np.nan],
    })
    vtypes = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime,
        'category': variable_types.Categorical,
        'number': variable_types.Numeric,
        'boolean': variable_types.Boolean,
        'boolean_with_nan': variable_types.Boolean,
    }
    es = EntitySet(id='test')
    es.entity_from_dataframe('stats_test_entity', df, 'id',
                             variable_types=vtypes)
    e = es["stats_test_entity"]
    # every variable has a count
    for v in ['time', 'category', 'number']:
        assert e[v].count == 3
    # numerics and datetimes don't have nunique or percent_unique defined
    for v in ['time', 'number']:
        with pytest.raises(AttributeError):
            e[v].nunique
        with pytest.raises(AttributeError):
            e[v].percent_unique
    # 'id' column automatically parsed as id
    assert e['id'].count == 3
    # categoricals have nunique and percent_unique defined
    assert e['category'].nunique == 2
    assert e['category'].percent_unique == 2. / 3
    # booleans have count and number of true/false labels defined
    assert e['boolean'].count == 3
    assert e['boolean'].num_true == 2
    assert e['boolean'].num_false == 1
def datetime_es():
    """Fixture: two-table entityset with a datetime time index (dataframe API)."""
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": pd.to_datetime([
            "2011-2-28 04:00",
            "2012-2-28 05:00",
            "2012-2-29 06:00",
            "2012-3-1 08:00",
            "2014-4-1 10:00",
        ]),
        "fraud": [True, False, False, False, True],
    })
    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time",
    )
    datetime_es = datetime_es.add_dataframe(
        dataframe_name="cards", dataframe=cards_df, index="id"
    )
    datetime_es = datetime_es.add_relationship("cards", "id",
                                               "transactions", "card_id")
    datetime_es.add_last_time_indexes()
    return datetime_es
def test_serialization(entityset):
    """An entityset round-trips through to_pickle / read_pickle."""
    dirname = os.path.dirname(integration_data.__file__)
    path = os.path.join(dirname, 'test_entityset.p')
    # clear out any leftovers from a previous failed run
    if os.path.exists(path):
        shutil.rmtree(path)
    entityset.to_pickle(path)
    new_es = EntitySet.read_pickle(path)
    assert entityset.__eq__(new_es, deep=True)
    shutil.rmtree(path)
def test_create_entity_with_make_index():
    """make_index on a Dask entity prepends a 0..n-1 index column."""
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    dask_df = dd.from_pandas(df, npartitions=2)
    dask_es = EntitySet(id="dask_es")
    vtypes = {"values": ft.variable_types.Numeric}
    dask_es.entity_from_dataframe(entity_id="new_entity",
                                  dataframe=dask_df,
                                  make_index=True,
                                  index="new_index",
                                  variable_types=vtypes)
    expected_df = pd.DataFrame({"new_index": range(len(values)),
                                "values": values})
    pd.testing.assert_frame_equal(expected_df,
                                  dask_es['new_entity'].df.compute())
def test_checks_time_type_setting_secondary_time_index(self, entityset):
    """Secondary time indexes must match the entityset's time_type; any
    mismatch or non-time column raises TypeError."""
    # entityset fixture is timestamp time type
    assert entityset.time_type == variable_types.DatetimeTimeIndex
    # add secondary index that is timestamp type — allowed
    new_2nd_ti = {'upgrade_date': ['upgrade_date', 'favorite_quote'],
                  'cancel_date': ['cancel_date', 'cancel_reason']}
    entityset["customers"].set_secondary_time_index(new_2nd_ti)
    assert entityset.time_type == variable_types.DatetimeTimeIndex
    # add secondary index that is numeric type — rejected
    new_2nd_ti = {'age': ['age', 'loves_ice_cream']}
    with pytest.raises(TypeError):
        entityset["customers"].set_secondary_time_index(new_2nd_ti)
    # add secondary index that is non-time type — rejected
    new_2nd_ti = {'favorite_quote': ['favorite_quote', 'loves_ice_cream']}
    with pytest.raises(TypeError):
        entityset["customers"].set_secondary_time_index(new_2nd_ti)
    # add mismatched pair of secondary time indexes — rejected
    new_2nd_ti = {'upgrade_date': ['upgrade_date', 'favorite_quote'],
                  'age': ['age', 'loves_ice_cream']}
    with pytest.raises(TypeError):
        entityset["customers"].set_secondary_time_index(new_2nd_ti)
    # now create an entityset with numeric time type and repeat the checks
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud_decision_time": [11, 14, 15, 21, 22, 21],
        "transaction_city": ["City A"] * 6,
        "transaction_date": [datetime(1989, 2, i) for i in range(1, 7)],
        "fraud": [True, False, False, False, True, True],
    })
    entities = {
        "cards": (cards_df, "id"),
        "transactions": (transactions_df, "id", "transaction_time"),
    }
    relationships = [("cards", "id", "transactions", "card_id")]
    card_es = EntitySet("fraud", entities, relationships)
    assert card_es.time_type == variable_types.NumericTimeIndex
    # add secondary index that is numeric time type — allowed
    new_2nd_ti = {'fraud_decision_time': ['fraud_decision_time', 'fraud']}
    card_es['transactions'].set_secondary_time_index(new_2nd_ti)
    assert card_es.time_type == variable_types.NumericTimeIndex
    # add secondary index that is timestamp type — rejected
    new_2nd_ti = {'transaction_date': ['transaction_date', 'fraud']}
    with pytest.raises(TypeError):
        card_es['transactions'].set_secondary_time_index(new_2nd_ti)
    # add secondary index that is non-time type — rejected
    new_2nd_ti = {'transaction_city': ['transaction_city', 'fraud']}
    with pytest.raises(TypeError):
        card_es['transactions'].set_secondary_time_index(new_2nd_ti)
    # add mixed secondary time indexes — rejected
    new_2nd_ti = {'transaction_city': ['transaction_city', 'fraud'],
                  'fraud_decision_time': ['fraud_decision_time', 'fraud']}
    with pytest.raises(TypeError):
        card_es['transactions'].set_secondary_time_index(new_2nd_ti)
def test_add_dataframe_with_make_index():
    """make_index on a Dask dataframe appends a 0..n-1 index column."""
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    dask_df = dd.from_pandas(df, npartitions=2)
    dask_es = EntitySet(id="dask_es")
    logical_types = {"values": Integer}
    dask_es.add_dataframe(dataframe_name="new_dataframe",
                          dataframe=dask_df,
                          make_index=True,
                          index="new_index",
                          logical_types=logical_types)
    expected_df = pd.DataFrame({"values": values,
                                "new_index": range(len(values))})
    pd.testing.assert_frame_equal(expected_df,
                                  dask_es['new_dataframe'].compute())
def test_single_table_ks_entityset_dates_not_sorted(): ks_es = EntitySet(id="ks_es") df = pd.DataFrame({ "id": [0, 1, 2, 3], "values": [1, 12, -34, 27], "dates": [ pd.to_datetime('2019-01-10'), pd.to_datetime('2019-02-03'), pd.to_datetime('2019-01-01'), pd.to_datetime('2017-08-25') ] }) primitives_list = ['absolute', 'is_weekend', 'year', 'day'] values_dd = ks.from_pandas(df) ltypes = { "values": Integer, "dates": Datetime, } ks_es.add_dataframe(dataframe_name="data", dataframe=values_dd, index="id", time_index="dates", logical_types=ltypes) ks_fm, _ = ft.dfs(entityset=ks_es, target_dataframe_name="data", trans_primitives=primitives_list, max_depth=1) pd_es = ft.EntitySet(id="pd_es") pd_es.add_dataframe(dataframe_name="data", dataframe=df, index="id", time_index="dates", logical_types=ltypes) fm, _ = ft.dfs(entityset=pd_es, target_dataframe_name="data", trans_primitives=primitives_list, max_depth=1) ks_fm = ks_fm.to_pandas().set_index('id').loc[fm.index] pd.testing.assert_frame_equal(fm.astype(ks_fm.dtypes), ks_fm)
def test_make_time_index_keeps_original_sorting(): trips = { 'trip_id': [999 - i for i in range(1000)], 'flight_time': [datetime(1997, 4, 1) for i in range(1000)], 'flight_id': [1 for i in range(350)] + [2 for i in range(650)] } order = [i for i in range(1000)] df = pd.DataFrame.from_dict(trips) es = EntitySet('flights') es.entity_from_dataframe("trips", dataframe=df, index="trip_id", time_index='flight_time') assert (es['trips'].df['trip_id'] == order).all() es.normalize_entity(base_entity_id="trips", new_entity_id="flights", index="flight_id", make_time_index=True) assert (es['trips'].df['trip_id'] == order).all()
def test_already_sorted_parameter(): transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "transaction_time": [datetime(2014, 4, 6), datetime( 2012, 4, 8), datetime( 2012, 4, 8), datetime( 2013, 4, 8), datetime( 2015, 4, 8), datetime(2016, 4, 9)]}) es = EntitySet(id='test') es.entity_from_dataframe('t', transactions_df, index='id', time_index="transaction_time", already_sorted=True) times = es["t"].df.transaction_time.tolist() assert times == transactions_df.transaction_time.tolist()
def test_sets_time_when_adding_entity(): transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "card_id": [1, 2, 1, 3, 4, 5], "transaction_time": [10, 12, 13, 20, 21, 20], "fraud": [True, False, False, False, True, True]}) accounts_df = pd.DataFrame({"id": [3, 4, 5], "signup_date": [datetime(2002, 5, 1), datetime(2006, 3, 20), datetime(2011, 11, 11)]}) accounts_df_string = pd.DataFrame({"id": [3, 4, 5], "signup_date": ["element", "exporting", "editable"]}) # create empty entityset entityset = EntitySet("fraud") # assert it's not set assert getattr(entityset, "time_type", None) is None # add entity entityset.entity_from_dataframe("transactions", transactions_df, index="id", time_index="transaction_time") # assert time_type is set assert entityset.time_type == variable_types.NumericTimeIndex # add another entity entityset.normalize_entity("transactions", "cards", "card_id", make_time_index=True) # assert time_type unchanged assert entityset.time_type == variable_types.NumericTimeIndex # add wrong time type entity error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes" with pytest.raises(TypeError, match=error_text): entityset.entity_from_dataframe("accounts", accounts_df, index="id", time_index="signup_date") # add non time type as time index error_text = "Attempted to convert all string column signup_date to numeric" with pytest.raises(TypeError, match=error_text): entityset.entity_from_dataframe("accounts", accounts_df_string, index="id", time_index="signup_date")
def test_calculates_statistics_on_init(self): df = pd.DataFrame({'id': [0, 1, 2], 'time': [datetime(2011, 4, 9, 10, 31, 3 * i) for i in range(3)], 'category': ['a', 'b', 'a'], 'number': [4, 5, 6], 'boolean': [True, False, True], 'boolean_with_nan': [True, False, np.nan]}) vtypes = {'id': variable_types.Categorical, 'time': variable_types.Datetime, 'category': variable_types.Categorical, 'number': variable_types.Numeric, 'boolean': variable_types.Boolean, 'boolean_with_nan': variable_types.Boolean} entityset = EntitySet(id='test') entityset.entity_from_dataframe('stats_test_entity', df, 'id', variable_types=vtypes) e = entityset["stats_test_entity"] # numerics don't have nunique or percent_unique defined for v in ['time', 'category', 'number']: assert e[v].count == 3 for v in ['time', 'number']: with pytest.raises(AttributeError): e[v].nunique with pytest.raises(AttributeError): e[v].percent_unique # 'id' column automatically parsed as id assert e['id'].count == 3 # categoricals have nunique and percent_unique defined assert e['category'].nunique == 2 assert e['category'].percent_unique == 2. / 3 # booleans have count and number of true/false labels defined assert e['boolean'].count == 3 # assert e['boolean'].num_true == 3 assert e['boolean'].num_true == 2 assert e['boolean'].num_false == 1
def test_converts_variable_types_on_init(): df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a'], 'category_int': [1, 2, 3], 'ints': ['1', '2', '3'], 'floats': ['1', '2', '3.0']}) df["category_int"] = df["category_int"].astype("category") vtypes = {'id': variable_types.Categorical, 'ints': variable_types.Numeric, 'floats': variable_types.Numeric} entityset = EntitySet(id='test') entityset.entity_from_dataframe(entity_id='test_entity', index='id', variable_types=vtypes, dataframe=df) entity_df = entityset['test_entity'].df assert entity_df['ints'].dtype.name in variable_types.PandasTypes._pandas_numerics assert entity_df['floats'].dtype.name in variable_types.PandasTypes._pandas_numerics # this is infer from pandas dtype e = entityset["test_entity"] assert isinstance(e['category_int'], variable_types.Categorical)
def test_sets_time_when_adding_entity(self): transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "card_id": [1, 2, 1, 3, 4, 5], "transaction_time": [10, 12, 13, 20, 21, 20], "fraud": [True, False, False, False, True, True]}) accounts_df = pd.DataFrame({"id": [3, 4, 5], "signup_date": [datetime(2002, 5, 1), datetime(2006, 3, 20), datetime(2011, 11, 11)]}) accounts_df_string = pd.DataFrame({"id": [3, 4, 5], "signup_date": ["element", "exporting", "editable"]}) # create empty entityset entityset = EntitySet("fraud") # assert it's not set assert getattr(entityset, "time_type", None) is None # add entity entityset.entity_from_dataframe("transactions", transactions_df, index="id", time_index="transaction_time") # assert time_type is set assert entityset.time_type == variable_types.NumericTimeIndex # add another entity entityset.normalize_entity("transactions", "cards", "card_id", make_time_index=True) # assert time_type unchanged assert entityset.time_type == variable_types.NumericTimeIndex # add wrong time type entity with pytest.raises(TypeError): entityset.entity_from_dataframe("accounts", accounts_df, index="id", time_index="signup_date") # add non time type as time index with pytest.raises(TypeError): entityset.entity_from_dataframe("accounts", accounts_df_string, index="id", time_index="signup_date")