예제 #1
0
def test_handles_datetime_format():
    """An explicit ``format`` on a Datetime variable decides how date strings are parsed.

    The same day-first strings are loaded into two columns: one whose variable
    type carries the format string and one whose variable type does not.
    """
    datetime_format = "%d-%m-%Y"
    actual = pd.Timestamp('Jan 2, 2011')
    # Rendered day-first, this date is ambiguous (2 January vs. 1 February).
    time_strs = [actual.strftime(datetime_format) for _ in range(3)]

    columns = {
        'id': [0, 1, 2],
        'time_format': time_strs,
        'time_no_format': time_strs,
    }
    vtypes = {
        'id': variable_types.Categorical,
        'time_format': (variable_types.Datetime, {"format": datetime_format}),
        'time_no_format': variable_types.Datetime,
    }

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id',
                             variable_types=vtypes,
                             dataframe=pd.DataFrame(columns))

    parsed_with_format = es['test_entity'].df['time_format']
    parsed_without_format = es['test_entity'].df['time_no_format']

    # Without the format string no row is expected to equal the real date ...
    assert (parsed_without_format != actual).all()
    # ... while the explicit format recovers 2 January for every row.
    assert (parsed_with_format == actual).all()
예제 #2
0
def test_converts_variable_type_after_init():
    """Convert the 'ints' column through several variable types after entity creation."""
    raw = pd.DataFrame({'id': [0, 1, 2],
                        'category': ['a', 'b', 'a'],
                        'ints': ['1', '2', '1']})
    raw["category"] = raw["category"].astype("category")

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             dataframe=raw)
    entity = es['test_entity']
    # The entity's frame is fetched once, before any conversion; the dtype
    # checks below read from this same reference.
    frame = es['test_entity'].df

    entity.convert_variable_type('ints', variable_types.Numeric)
    assert isinstance(entity['ints'], variable_types.Numeric)
    assert frame['ints'].dtype.name in variable_types.PandasTypes._pandas_numerics

    # Categorical and Ordinal are only verified at the variable level.
    for target_type in (variable_types.Categorical, variable_types.Ordinal):
        entity.convert_variable_type('ints', target_type)
        assert isinstance(entity['ints'], target_type)

    entity.convert_variable_type('ints', variable_types.Boolean,
                                 true_val=1, false_val=2)
    assert isinstance(entity['ints'], variable_types.Boolean)
    assert frame['ints'].dtype.name == 'bool'
예제 #3
0
def test_check_variables_and_dataframe():
    """Variable types whose keys match the dataframe columns are kept on the entity."""
    vtypes = {name: variable_types.Categorical for name in ('id', 'category')}
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})

    es = EntitySet(id='test')
    es.entity_from_dataframe('test_entity', frame, index='id',
                             variable_types=vtypes)

    stored_types = es.entity_dict['test_entity'].variable_types
    assert stored_types['category'] == variable_types.Categorical
예제 #4
0
def test_bad_time_index_variable():
    """Expect ``LookupError`` when ``time_index`` names a column missing from the dataframe."""
    df = pd.DataFrame({'category': ['a', 'b', 'a']})
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    entityset = EntitySet(id='test')

    error_text = "Time index not found in dataframe"
    with pytest.raises(LookupError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index="id",
                                        dataframe=df,
                                        time_index='time')
예제 #5
0
def test_none_index():
    """With no ``index`` argument the entity reports 'category' as its index."""
    frame = pd.DataFrame({'category': [1, 2, 3], 'category2': ['1', '2', '3']})
    vtypes = dict.fromkeys(['category', 'category2'], variable_types.Categorical)

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             dataframe=frame,
                             variable_types=vtypes)

    # The requested Categorical type is expected to be replaced by Index.
    assert es['test_entity'].index == 'category'
    assert isinstance(es['test_entity']['category'], variable_types.Index)
예제 #6
0
def test_handles_datetime_mismatch():
    """Expect ``ValueError`` when a Datetime time index holds strings that are not dates."""
    # can't convert arbitrary strings
    df = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    entityset = EntitySet(id='test')

    with pytest.raises(ValueError):
        entityset.entity_from_dataframe('test_entity', df, 'id',
                                        time_index='time', variable_types=vtypes)
예제 #7
0
def test_unknown_index():
    """An ``index`` name absent from the dataframe still yields an 'id' column of 0, 1, 2."""
    frame = pd.DataFrame({'category': ['a', 'b', 'a']})

    es = EntitySet(id='test')
    es.entity_from_dataframe(
        entity_id='test_entity',
        index='id',
        variable_types={'category': variable_types.Categorical},
        dataframe=frame)

    assert es['test_entity'].index == 'id'
    assert es['test_entity'].df['id'].tolist() == [0, 1, 2]
예제 #8
0
def test_doesnt_remake_index():
    """Expect ``RuntimeError`` when ``make_index=True`` names a column that already exists."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    entityset = EntitySet(id='test')

    error_text = "Cannot make index: index variable already present"
    with pytest.raises(RuntimeError, match=error_text):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        make_index=True,
                                        dataframe=df)
예제 #9
0
def test_datetime64_conversion():
    """Converting a ``datetime64[ns, UTC]`` column to ``DatetimeTimeIndex`` must not raise."""
    frame = pd.DataFrame({'id': [0, 1, 2],
                          'ints': ['1', '2', '1']})
    frame["time"] = pd.Timestamp.now()
    frame["time"] = frame["time"].astype("datetime64[ns, UTC]")

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity', index='id',
                             dataframe=frame)

    # No assertion follows: the test passes as long as the conversion succeeds.
    es['test_entity'].convert_variable_type(
        'time', variable_types.variable.DatetimeTimeIndex)
예제 #10
0
def test_make_index_variable_ordering():
    """A generated index column ('id1') comes first in the entity's dataframe."""
    frame = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    vtypes = dict.fromkeys(['id', 'category'], variable_types.Categorical)

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='test_entity',
                             index='id1',
                             make_index=True,
                             variable_types=vtypes,
                             dataframe=frame)

    first_column = es.entity_dict['test_entity'].df.columns[0]
    assert first_column == 'id1'
예제 #11
0
def test_extra_variable_type():
    """Expect ``LookupError`` when ``variable_types`` names a column ('category2') the dataframe lacks."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # One more variable type than there are columns.
    vtypes = {'id': variable_types.Categorical,
              'category': variable_types.Categorical,
              'category2': variable_types.Categorical}
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    entityset = EntitySet(id='test')

    with pytest.raises(LookupError):
        entityset.entity_from_dataframe(entity_id='test_entity',
                                        index='id',
                                        variable_types=vtypes, dataframe=df)
def test_handles_datetime_mismatch():
    """Expect ``ValueError`` when a Datetime time index holds strings that are not dates."""
    # can't convert arbitrary strings
    df = pd.DataFrame({'id': [0, 1, 2], 'time': ['a', 'b', 'tomorrow']})
    vtypes = {
        'id': variable_types.Categorical,
        'time': variable_types.Datetime
    }
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    entityset = EntitySet(id='test')

    with pytest.raises(ValueError):
        entityset.entity_from_dataframe('test_entity',
                                        df,
                                        'id',
                                        time_index='time',
                                        variable_types=vtypes)
예제 #13
0
    def test_bad_index_variables(self):
        """Expect ``LookupError`` when ``time_index`` names a column ('time') the dataframe lacks."""
        df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
        vtypes = {
            'id': variable_types.Categorical,
            'category': variable_types.Categorical
        }
        # Built outside the ``raises`` block so that only the call under test
        # can satisfy the expectation.
        entityset = EntitySet(id='test')

        with pytest.raises(LookupError):
            entityset.entity_from_dataframe(entity_id='test_entity',
                                            index='id',
                                            variable_types=vtypes,
                                            dataframe=df,
                                            time_index='time')
예제 #14
0
def test_single_table_ks_entityset():
    """DFS on a koalas-backed single-table EntitySet matches the pandas feature matrix."""
    primitives_list = [
        'absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words'
    ]
    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(text) for text in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })

    # Koalas side: every variable type is spelled out explicitly.
    ks_es = EntitySet(id="ks_es")
    ks_es.entity_from_dataframe(
        entity_id="data",
        dataframe=ks.from_pandas(df),
        index="id",
        variable_types={
            "id": ft.variable_types.Id,
            "values": ft.variable_types.Numeric,
            "dates": ft.variable_types.Datetime,
            "strings": ft.variable_types.NaturalLanguage,
        })
    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list)

    # pandas side: only the natural-language column is typed explicitly.
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data",
        dataframe=df,
        index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list)

    # Align rows and columns with the pandas matrix before comparing.
    ks_as_pandas = ks_fm.to_pandas().set_index('id')
    ks_computed_fm = ks_as_pandas.loc[fm.index][fm.columns]
    # dtypes are not compared: NUM_WORDS(strings) is int32 in koalas
    pd.testing.assert_frame_equal(fm, ks_computed_fm, check_dtype=False)
예제 #15
0
def test_single_table_dask_entityset():
    """DFS on a Dask-backed single-table EntitySet matches the pandas feature matrix."""
    primitives_list = [
        'absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words'
    ]
    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(text) for text in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })

    # Dask side: two partitions, every variable type spelled out explicitly.
    dask_es = EntitySet(id="dask_es")
    dask_es.entity_from_dataframe(
        entity_id="data",
        dataframe=dd.from_pandas(df, npartitions=2),
        index="id",
        variable_types={
            "id": ft.variable_types.Id,
            "values": ft.variable_types.Numeric,
            "dates": ft.variable_types.Datetime,
            "strings": ft.variable_types.NaturalLanguage,
        })
    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_entity="data",
                        trans_primitives=primitives_list)

    # pandas side: only the natural-language column is typed explicitly.
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data",
        dataframe=df,
        index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list)

    # Materialise the Dask result, then align rows and columns with the
    # pandas matrix so both frames are compared in the same order.
    materialised = dask_fm.compute().set_index('id')
    dask_computed_fm = materialised.loc[fm.index][fm.columns]
    pd.testing.assert_frame_equal(fm, dask_computed_fm)
예제 #16
0
def test_sets_time_when_adding_entity():
    """``time_type`` is set by the first time-indexed entity and enforced afterwards."""
    entityset = EntitySet("fraud")
    # Nothing has been added yet, so no time type is recorded.
    assert getattr(entityset, "time_type", None) is None

    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    entityset.entity_from_dataframe("transactions",
                                    transactions_df,
                                    index="id",
                                    time_index="transaction_time")
    # The integer time index is reported as a numeric time type.
    assert entityset.time_type == variable_types.NumericTimeIndex

    # Normalizing out a second entity must leave the time type untouched.
    entityset.normalize_entity("transactions",
                               "cards",
                               "card_id",
                               make_time_index=True)
    assert entityset.time_type == variable_types.NumericTimeIndex

    signup_dates = [datetime(2002, 5, 1),
                    datetime(2006, 3, 20),
                    datetime(2011, 11, 11)]
    accounts_df = pd.DataFrame({"id": [3, 4, 5], "signup_date": signup_dates})
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # First a datetime time index (wrong time type), then plain strings
    # (not a time type at all): both must be rejected.
    for rejected_df in (accounts_df, accounts_df_string):
        with pytest.raises(TypeError):
            entityset.entity_from_dataframe("accounts",
                                            rejected_df,
                                            index="id",
                                            time_index="signup_date")
예제 #17
0
def test_passing_strings_to_variable_types_entity_init():
    """``Entity`` accepts variable types given as strings and warns about an unrecognized one."""
    unknown_type = 'some unknown type string'
    known_types = find_variable_types()

    # One column per known variable class (named by the class's ``str``),
    # mapped to the key that class has in ``find_variable_types()``.
    types_by_column = {str(v): k for k, v in known_types.items()}
    types_by_column['unknown variable'] = unknown_type

    es = EntitySet()
    dataframe = pd.DataFrame(columns=list(types_by_column))
    expected_warning = (
        'Variable type {} was unrecognized, Unknown variable type was used instead'
        .format(unknown_type))
    with pytest.warns(UserWarning, match=expected_warning):
        entity = Entity(
            'reversed_variable_types',
            dataframe,
            es,
            variable_types=types_by_column,
            index="<class 'featuretools.variable_types.variable.Index'>",
            time_index="<class 'featuretools.variable_types.variable.NumericTimeIndex'>",
        )

    # The unrecognized entry is expected to come back with type string "unknown".
    types_by_column["unknown variable"] = "unknown"
    for variable in entity.variables:
        expected_type_string = types_by_column[variable.id]
        assert variable.__class__.type_string == expected_type_string
예제 #18
0
def test_extra_variable_type():
    """Expect ``LookupError`` when ``variable_types`` names a column ('category2') the dataframe lacks."""
    df = pd.DataFrame({'id': [0, 1, 2], 'category': ['a', 'b', 'a']})
    # One more variable type than there are columns.
    vtypes = {
        'id': variable_types.Categorical,
        'category': variable_types.Categorical,
        'category2': variable_types.Categorical
    }
    # Built outside the ``raises`` block so that only the call under test can
    # satisfy the expectation.
    es = EntitySet(id='test')

    error_text = "Variable ID category2 not in DataFrame"
    with pytest.raises(LookupError, match=error_text):
        es.entity_from_dataframe(entity_id='test_entity',
                                 index='id',
                                 variable_types=vtypes,
                                 dataframe=df)
예제 #19
0
def test_all_variable_descriptions():
    """Every variable in ``serialize.VARIABLE_TYPES`` round-trips through its data description."""
    es = EntitySet()
    # Empty frame with one column per entry in ``serialize.VARIABLE_TYPES``.
    dataframe = pd.DataFrame(columns=list(serialize.VARIABLE_TYPES))
    es.entity_from_dataframe(
        'variable_types',
        dataframe,
        index='index',
        time_index='datetime_time_index',
        variable_types=serialize.VARIABLE_TYPES,
    )
    entity = es['variable_types']
    for variable in entity.variables:
        description = variable.to_data_description()
        _variable = deserialize.description_to_variable(description,
                                                        entity=entity)
        # ``==`` rather than a direct ``__eq__`` call: a ``NotImplemented``
        # result is truthy and would let the assertion pass vacuously.
        assert variable == _variable
예제 #20
0
def test_add_dataframe_from_spark_df(pd_es):
    """The cleaned 'log' dataframe survives being added to an EntitySet via ``ps.from_pandas``.

    Args:
        pd_es: EntitySet containing a "log" dataframe
            (presumably a pandas-backed pytest fixture; confirm in conftest).
    """
    cleaned_df = pd_to_spark_clean(pd_es["log"])
    log_spark = ps.from_pandas(cleaned_df)

    # ``add_dataframe``'s return value is what gets queried below.
    spark_es = EntitySet(id="spark_es").add_dataframe(
        dataframe_name="log_spark",
        dataframe=log_spark,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]),
    )

    round_tripped = spark_es["log_spark"].to_pandas()
    # check_like=True: row/column order is ignored in the comparison.
    pd.testing.assert_frame_equal(cleaned_df, round_tripped, check_like=True)
예제 #21
0
def test_add_dataframe_with_make_index():
    """``make_index=True`` appends a 0..n-1 'new_index' column to the stored dataframe."""
    values = [1, 12, -23, 27]
    spark_df = ps.from_pandas(pd.DataFrame({"values": values}))

    spark_es = EntitySet(id="spark_es")
    spark_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=spark_df,
        make_index=True,
        index="new_index",
        logical_types={"values": "Integer"},
    )

    stored = spark_es["new_dataframe"].to_pandas()
    expected_df = pd.DataFrame({"values": values, "new_index": range(len(values))})
    pd.testing.assert_frame_equal(expected_df, stored)
예제 #22
0
def test_sort_time_id():
    """Rows supplied in descending time order are stored sorted ascending by time index."""
    # Six timestamps ten seconds apart, reversed so the input is descending.
    descending_times = pd.date_range(start="10:00", periods=6, freq="10s")[::-1]
    transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                    "transaction_time": descending_times})

    es = EntitySet("test", entities={"t": (transactions_df, "id", "transaction_time")})

    stored_times = es["t"].df["transaction_time"].tolist()
    expected_times = sorted(transactions_df["transaction_time"].tolist())
    assert stored_times == expected_times
예제 #23
0
def test_single_table_dask_entityset_with_instance_ids():
    """DFS restricted to ``instance_ids`` gives the same matrix for Dask and pandas EntitySets."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']
    instance_ids = [0, 1, 3]

    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(text) for text in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    # The same logical types are applied to both EntitySets.
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }

    dask_es = EntitySet(id="dask_es")
    dask_es.add_dataframe(dataframe_name="data",
                          dataframe=dd.from_pandas(df, npartitions=2),
                          index="id",
                          logical_types=ltypes)
    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        instance_ids=instance_ids)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        logical_types=ltypes)
    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)

    # Re-index the computed Dask matrix by the pandas row order before comparing.
    dask_computed_fm = dask_fm.compute().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm, dask_computed_fm)
예제 #24
0
    def test_already_sorted_parameter(self):
        """With ``already_sorted=True`` the stored rows keep the caller's (unsorted) time order."""
        # Deliberately not in chronological order.
        unsorted_times = [datetime(2014, 4, 6),
                          datetime(2012, 4, 8),
                          datetime(2012, 4, 8),
                          datetime(2013, 4, 8),
                          datetime(2015, 4, 8),
                          datetime(2016, 4, 9)]
        transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                        "transaction_time": unsorted_times})

        es = EntitySet(id='test')
        es.entity_from_dataframe('t',
                                 transactions_df,
                                 index='id',
                                 time_index="transaction_time",
                                 already_sorted=True)

        stored_times = es["t"].df["transaction_time"].tolist()
        assert stored_times == transactions_df["transaction_time"].tolist()
예제 #25
0
def test_single_table_ks_entityset_with_instance_ids():
    """DFS restricted to ``instance_ids`` gives the same matrix for a ``ks``-backed and a pandas EntitySet."""
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']
    instance_ids = [0, 1, 3]

    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(text) for text in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })

    # ``ks`` side: every variable type is spelled out explicitly.
    ks_es = EntitySet(id="ks_es")
    ks_es.entity_from_dataframe(
        entity_id="data",
        dataframe=ks.from_pandas(df),
        index="id",
        variable_types={
            "id": ft.variable_types.Id,
            "values": ft.variable_types.Numeric,
            "dates": ft.variable_types.Datetime,
            "strings": ft.variable_types.NaturalLanguage,
        })
    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_entity="data",
                      trans_primitives=primitives_list,
                      instance_ids=instance_ids)

    # pandas side: only the natural-language column is typed explicitly.
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(
        entity_id="data",
        dataframe=df,
        index="id",
        variable_types={"strings": ft.variable_types.NaturalLanguage})
    fm, _ = ft.dfs(entityset=pd_es,
                   target_entity="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)

    # Re-index by the pandas row order; dtypes are not compared.
    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm, ks_computed_fm, check_dtype=False)
예제 #26
0
def test_converts_datetime():
    """Date strings in a Datetime time-index column are stored as ``pd.Timestamp`` values."""
    # string converts to datetime correctly
    # This test fails without defining vtypes.  Entityset
    # infers time column should be numeric type
    times = pd.date_range('1/1/2011', periods=3, freq='H')
    time_strs = times.strftime('%Y-%m-%d')
    df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
    vtypes = {'id': variable_types.Categorical,
              'time': variable_types.Datetime}

    entityset = EntitySet(id='test')
    entityset._import_from_dataframe(entity_id='test_entity', index='id',
                                     time_index="time", variable_types=vtypes,
                                     dataframe=df)
    pd_col = entityset['test_entity'].df['time']
    # NOTE(review): a check that entityset['test_entity']['time'] is exactly
    # variable_types.Datetime was disabled here; the reason is not visible in
    # this file.
    assert isinstance(pd_col[0], pd.Timestamp)
def test_entity_descriptions(es):
    """Each entity in ``es.metadata`` round-trips through its serialized description.

    Args:
        es: source EntitySet (presumably a pytest fixture; confirm in conftest).
    """
    rebuilt_es = EntitySet(es.id)
    for original in es.metadata.entities:
        description = serialize.entity_to_description(original)
        deserialize.description_to_entity(description, rebuilt_es)
        rebuilt = rebuilt_es[description['id']]
        # last_time_index is copied across by hand before the deep comparison.
        rebuilt.last_time_index = original.last_time_index
        # Called explicitly because ``==`` cannot pass ``deep=True``.
        assert original.__eq__(rebuilt, deep=True)
예제 #28
0
def test_single_table_ks_entityset_ids_not_sorted():
    """Koalas and pandas feature matrices agree even when the input ids are not sorted."""
    primitives_list = [
        'absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words'
    ]
    date_strings = ['2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25']
    df = pd.DataFrame({
        # ids deliberately out of order
        "id": [2, 0, 1, 3],
        "values": [1, 12, -34, 27],
        "dates": [pd.to_datetime(text) for text in date_strings],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    # The same logical types are applied to both EntitySets.
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }

    ks_es = EntitySet(id="ks_es")
    ks_es.add_dataframe(dataframe_name="data",
                        dataframe=ks.from_pandas(df),
                        index="id",
                        logical_types=ltypes)
    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_dataframe_name="data",
                      trans_primitives=primitives_list)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        logical_types=ltypes)
    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list)

    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    # Koalas dtypes differ for categoricals, so the pandas matrix is cast to
    # the Koalas dtypes before the frames are compared.
    pandas_fm_recast = fm.astype(ks_computed_fm.dtypes)
    pd.testing.assert_frame_equal(pandas_fm_recast, ks_computed_fm)
예제 #29
0
    def test_converts_datetime(self):
        """Date strings in a Datetime time-index column are stored as ``pd.Timestamp`` values."""
        # string converts to datetime correctly
        # This test fails without defining vtypes.  Entityset
        # infers time column should be numeric type
        times = pd.date_range('1/1/2011', periods=3, freq='H')
        time_strs = times.strftime('%Y-%m-%d')
        df = pd.DataFrame({'id': [0, 1, 2], 'time': time_strs})
        vtypes = {'id': variable_types.Categorical,
                  'time': variable_types.Datetime}

        entityset = EntitySet(id='test')
        entityset._import_from_dataframe(entity_id='test_entity', index='id',
                                         time_index="time", variable_types=vtypes,
                                         dataframe=df)
        pd_col = entityset.get_column_data('test_entity', 'time')
        # NOTE(review): a check that es['test_entity']['time'] is exactly
        # variable_types.Datetime was disabled here; the reason is not visible
        # in this file.
        assert isinstance(pd_col[0], pd.Timestamp)
예제 #30
0
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the ecommerce EntitySet, whose entities are related like this:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log

    Args:
        with_integer_time_index (bool): forwarded to the dataframe,
            variable-type and time-index factories; when true the EntitySet
            id gets the "_int_time_index" suffix.

    Returns:
        The populated EntitySet, including the 'cohorts' entity normalized out
        of 'customers' and the five relationships added at the end.
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    es_id = 'ecommerce' + ("_int_time_index" if with_integer_time_index else "")

    vtypes_by_entity = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for entity_id in dataframes:
        # Entities with no time-index entry are loaded without one.
        time_index = time_indexes.get(entity_id)
        has_time_index = time_index is not None
        es.entity_from_dataframe(
            entity_id,
            dataframes[entity_id],
            index='id',
            variable_types=vtypes_by_entity[entity_id],
            time_index=time_index['name'] if has_time_index else None,
            secondary_time_index=time_index['secondary'] if has_time_index else None)

    es.normalize_entity('customers',
                        'cohorts',
                        'cohort',
                        additional_variables=['cohort_name'],
                        make_time_index=True,
                        new_entity_time_index='cohort_end')

    # Each relationship is built from (first, second) variable pairs.
    regions_id = es[u'régions']['id']
    relationships = [
        Relationship(regions_id, es['customers'][u'région_id']),
        Relationship(es[u'régions']['id'], es['stores'][u'région_id']),
        Relationship(es['customers']['id'], es['sessions']['customer_id']),
        Relationship(es['sessions']['id'], es['log']['session_id']),
        Relationship(es['products']['id'], es['log']['product_id']),
    ]
    es.add_relationships(relationships)

    return es
예제 #31
0
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the ecommerce EntitySet, whose dataframes are related like this:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log

    Args:
        with_integer_time_index (bool): forwarded to the dataframe,
            logical-type and time-index factories; when true the EntitySet
            id gets the "_int_time_index" suffix.

    Returns:
        The populated EntitySet, including the 'cohorts' dataframe normalized
        out of 'customers' and the five relationships added at the end.
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    es_id = 'ecommerce' + ("_int_time_index" if with_integer_time_index else "")

    logical_types = make_logical_types(
        with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for name in dataframes:
        # Dataframes with no time-index entry are added without one.
        time_index = time_indexes.get(name)
        has_time_index = time_index is not None
        es.add_dataframe(
            dataframes[name],
            dataframe_name=name,
            index='id',
            logical_types=logical_types[name],
            semantic_tags=semantic_tags[name],
            time_index=time_index['name'] if has_time_index else None,
            secondary_time_index=time_index['secondary'] if has_time_index else None)

    es.normalize_dataframe('customers',
                           'cohorts',
                           'cohort',
                           additional_columns=['cohort_name'],
                           make_time_index=True,
                           new_dataframe_time_index='cohort_end')

    # Four-element tuples: (dataframe, column, dataframe, column).
    relationships = [
        (u'régions', 'id', 'customers', u'région_id'),
        (u'régions', 'id', 'stores', u'région_id'),
        ('customers', 'id', 'sessions', 'customer_id'),
        ('sessions', 'id', 'log', 'session_id'),
        ('products', 'id', 'log', 'product_id'),
    ]
    es.add_relationships(relationships)

    return es
예제 #32
0
def test_custom_variable_descriptions():
    """A locally defined ``Categorical`` subclass round-trips through its data description."""

    class ItemList(Categorical):
        type_string = "item_list"
        _default_pandas_dtype = list

    es = EntitySet()
    variables = {'item_list': ItemList, 'time_index': TimeIndex, 'index': Index}
    # Empty frame with one column per variable under test.
    dataframe = pd.DataFrame(columns=list(variables))
    es.entity_from_dataframe(
        'custom_variable', dataframe, index='index',
        time_index='time_index', variable_types=variables)
    entity = es['custom_variable']
    for variable in entity.variables:
        description = variable.to_data_description()
        _variable = deserialize.description_to_variable(description, entity=entity)
        # ``==`` rather than a direct ``__eq__`` call: a ``NotImplemented``
        # result is truthy and would let the assertion pass vacuously.
        assert variable == _variable
예제 #33
0
def datetime_es():
    """Build the 'fraud_data' EntitySet: a 'cards' entity related to datetime-indexed 'transactions'.

    Returns:
        The EntitySet after both entities and the cards -> transactions
        relationship have been added and ``add_last_time_indexes()`` has run.
    """
    transaction_times = pd.to_datetime([
        '2011-2-28 04:00', '2012-2-28 05:00', '2012-2-29 06:00',
        '2012-3-1 08:00', '2014-4-1 10:00'
    ])
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True],
    })
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})

    # Each call's return value is rebound and used for the next step.
    es = EntitySet(id="fraud_data")
    es = es.entity_from_dataframe(entity_id="transactions",
                                  dataframe=transactions_df,
                                  index="id",
                                  time_index="transaction_time")
    es = es.entity_from_dataframe(entity_id="cards",
                                  dataframe=cards_df,
                                  index="id")

    cards_to_transactions = Relationship(es["cards"]["id"],
                                         es["transactions"]["card_id"])
    es = es.add_relationship(cards_to_transactions)
    es.add_last_time_indexes()
    return es
예제 #34
0
def test_single_table_ks_entityset_dates_not_sorted():
    """Compare dfs output of a ``ks``-backed and a pandas-backed EntitySet.

    Both EntitySets are built from the same frame, whose ``dates`` column is
    not in chronological order; the resulting feature matrices must match.
    """
    unsorted_dates = [
        pd.to_datetime(day)
        for day in ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    ]
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": unsorted_dates,
    })
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']
    # options shared by both dfs runs
    dfs_options = {
        "target_entity": "data",
        "trans_primitives": primitives_list,
        "max_depth": 1,
    }

    # ks-backed entityset with explicit variable types
    ks_es = EntitySet(id="ks_es")
    ks_frame = ks.from_pandas(df)
    vtypes = {
        "id": ft.variable_types.Id,
        "values": ft.variable_types.Numeric,
        "dates": ft.variable_types.Datetime,
    }
    ks_es.entity_from_dataframe(entity_id="data",
                                dataframe=ks_frame,
                                index="id",
                                time_index="dates",
                                variable_types=vtypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, **dfs_options)

    # pandas-backed reference entityset
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.entity_from_dataframe(entity_id="data",
                                dataframe=df,
                                index="id",
                                time_index="dates")
    fm, _ = ft.dfs(entityset=pd_es, **dfs_options)

    # align the ks result on the pandas feature matrix before comparing
    ks_as_pandas = ks_fm.to_pandas().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm, ks_as_pandas)
예제 #35
0
def test_sets_time_when_adding_entity():
    """Check how ``time_type`` is set and enforced as entities are added.

    A new EntitySet has no ``time_type``. Adding an entity with a numeric
    time index sets it to ``NumericTimeIndex`` and ``normalize_entity`` keeps
    it. Adding an entity whose time index is a datetime column, or an
    all-string column, raises ``TypeError`` with the expected message.
    """
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    signup_dates = [datetime(2002, 5, 1),
                    datetime(2006, 3, 20),
                    datetime(2011, 11, 11)]
    accounts_df = pd.DataFrame({"id": [3, 4, 5], "signup_date": signup_dates})
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })

    # a freshly created entityset has no time type yet
    fraud_es = EntitySet("fraud")
    assert getattr(fraud_es, "time_type", None) is None

    # the first time-indexed entity sets the time type
    fraud_es.entity_from_dataframe(entity_id="transactions",
                                   dataframe=transactions_df,
                                   index="id",
                                   time_index="transaction_time")
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # normalizing out another entity leaves the time type unchanged
    fraud_es.normalize_entity(base_entity_id="transactions",
                              new_entity_id="cards",
                              index="card_id",
                              make_time_index=True)
    assert fraud_es.time_type == variable_types.NumericTimeIndex

    # a datetime time index conflicts with the numeric time type
    mismatch_message = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=mismatch_message):
        fraud_es.entity_from_dataframe(entity_id="accounts",
                                       dataframe=accounts_df,
                                       index="id",
                                       time_index="signup_date")

    # an all-string column cannot serve as the time index
    conversion_message = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=conversion_message):
        fraud_es.entity_from_dataframe(entity_id="accounts",
                                       dataframe=accounts_df_string,
                                       index="id",
                                       time_index="signup_date")
예제 #36
0
    def test_calculates_statistics_on_init(self):
        """Check the statistics exposed by variables once an entity is created.

        Asserts ``count`` on several variables, that ``nunique`` and
        ``percent_unique`` raise ``AttributeError`` on the time and numeric
        variables, and the categorical and boolean statistics.
        """
        timestamps = [datetime(2011, 4, 9, 10, 31, seconds)
                      for seconds in (0, 3, 6)]
        columns = {
            'id': [0, 1, 2],
            'time': timestamps,
            'category': ['a', 'b', 'a'],
            'number': [4, 5, 6],
            'boolean': [True, False, True],
            'boolean_with_nan': [True, False, np.nan],
        }
        df = pd.DataFrame(columns)
        vtypes = {
            'id': variable_types.Categorical,
            'time': variable_types.Datetime,
            'category': variable_types.Categorical,
            'number': variable_types.Numeric,
            'boolean': variable_types.Boolean,
            'boolean_with_nan': variable_types.Boolean,
        }

        entityset = EntitySet(id='test')
        entityset.entity_from_dataframe(
            'stats_test_entity', df, 'id', variable_types=vtypes)
        entity = entityset["stats_test_entity"]

        for name in ('time', 'category', 'number'):
            assert entity[name].count == 3

        # time and numeric variables don't have nunique or percent_unique
        for name in ('time', 'number'):
            for statistic in ('nunique', 'percent_unique'):
                with pytest.raises(AttributeError):
                    getattr(entity[name], statistic)

        # 'id' column automatically parsed as id
        assert entity['id'].count == 3

        # categoricals have nunique and percent_unique defined
        category = entity['category']
        assert category.nunique == 2
        assert category.percent_unique == 2.0 / 3

        # booleans have count and number of true/false labels defined
        boolean = entity['boolean']
        assert boolean.count == 3
        assert boolean.num_true == 2
        assert boolean.num_false == 1
예제 #37
0
def datetime_es():
    """Build the ``fraud_data`` EntitySet used by datetime-based tests.

    Adds a ``transactions`` dataframe (time index ``transaction_time``) and a
    ``cards`` dataframe, relates ``cards.id`` to ``transactions.card_id``,
    adds last time indexes and returns the EntitySet.
    """
    transaction_times = pd.to_datetime([
        "2011-2-28 04:00",
        "2012-2-28 05:00",
        "2012-2-29 06:00",
        "2012-3-1 08:00",
        "2014-4-1 10:00",
    ])
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time": transaction_times,
        "fraud": [True, False, False, False, True],
    })

    es = EntitySet(id="fraud_data")
    es = es.add_dataframe(dataframe_name="transactions",
                          dataframe=transactions_df,
                          index="id",
                          time_index="transaction_time")
    es = es.add_dataframe(dataframe_name="cards",
                          dataframe=cards_df,
                          index="id")

    es = es.add_relationship("cards", "id", "transactions", "card_id")
    es.add_last_time_indexes()
    return es
예제 #38
0
def test_serialization(entityset):
    """Pickle ``entityset`` to disk, read it back and check deep equality.

    The pickle is written to ``test_entityset.p`` next to the
    ``integration_data`` module. The path is removed again even when the
    round-trip or the equality assertion fails, so a failing run does not
    leave artifacts behind.

    Args:
        entityset: the EntitySet to round-trip.
    """
    dirname = os.path.dirname(integration_data.__file__)
    path = os.path.join(dirname, 'test_entityset.p')
    # Clear leftovers from an earlier run before writing.
    if os.path.exists(path):
        shutil.rmtree(path)
    try:
        entityset.to_pickle(path)
        new_es = EntitySet.read_pickle(path)
        # __eq__ is called explicitly because the ``deep`` flag cannot be
        # passed through the == operator.
        assert entityset.__eq__(new_es, deep=True)
    finally:
        # Always clean up; guard on existence so a failure inside to_pickle
        # is not masked by an error from rmtree.
        if os.path.exists(path):
            shutil.rmtree(path)
예제 #39
0
def test_create_entity_with_make_index():
    """Create an entity from a Dask dataframe with ``make_index=True``.

    The computed entity dataframe must equal a pandas frame holding a
    ``new_index`` column of 0..n-1 followed by the original ``values``.
    """
    values = [1, 12, -23, 27]
    source_df = pd.DataFrame({"values": values})

    dask_es = EntitySet(id="dask_es")
    dask_es.entity_from_dataframe(
        entity_id="new_entity",
        dataframe=dd.from_pandas(source_df, npartitions=2),
        make_index=True,
        index="new_index",
        variable_types={"values": ft.variable_types.Numeric},
    )

    expected_df = pd.DataFrame({
        "new_index": range(len(values)),
        "values": values,
    })
    actual_df = dask_es['new_entity'].df.compute()
    pd.testing.assert_frame_equal(expected_df, actual_df)
예제 #40
0
    def test_checks_time_type_setting_secondary_time_index(self, entityset):
        """Secondary time indexes must agree with the EntitySet time type.

        First part: on the datetime-typed ``entityset`` argument, datetime
        secondary indexes are accepted while numeric, non-time and mixed
        ones raise ``TypeError``. Second part: the same checks mirrored on
        an EntitySet built here with a numeric time index.
        """
        datetime_type = variable_types.DatetimeTimeIndex
        numeric_type = variable_types.NumericTimeIndex

        # entityset is timestamp time type
        assert entityset.time_type == datetime_type
        # secondary indexes that are timestamp type are accepted
        accepted = {
            'upgrade_date': ['upgrade_date', 'favorite_quote'],
            'cancel_date': ['cancel_date', 'cancel_reason'],
        }
        entityset["customers"].set_secondary_time_index(accepted)
        assert entityset.time_type == datetime_type

        rejected = [
            # numeric type
            {'age': ['age', 'loves_ice_cream']},
            # non-time type
            {'favorite_quote': ['favorite_quote', 'loves_ice_cream']},
            # mismatched pair of secondary time indexes
            {'upgrade_date': ['upgrade_date', 'favorite_quote'],
             'age': ['age', 'loves_ice_cream']},
        ]
        for secondary_index in rejected:
            with pytest.raises(TypeError):
                entityset["customers"].set_secondary_time_index(secondary_index)

        # create entityset with numeric time type
        cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
        transactions_df = pd.DataFrame({
            "id": [1, 2, 3, 4, 5, 6],
            "card_id": [1, 2, 1, 3, 4, 5],
            "transaction_time": [10, 12, 13, 20, 21, 20],
            "fraud_decision_time": [11, 14, 15, 21, 22, 21],
            "transaction_city": ["City A"] * 6,
            "transaction_date": [datetime(1989, 2, day) for day in range(1, 7)],
            "fraud": [True, False, False, False, True, True],
        })
        entities = {
            "cards": (cards_df, "id"),
            "transactions": (transactions_df, "id", "transaction_time"),
        }
        relationships = [("cards", "id", "transactions", "card_id")]
        card_es = EntitySet("fraud", entities, relationships)
        assert card_es.time_type == numeric_type

        # secondary index that is numeric time type is accepted
        accepted = {'fraud_decision_time': ['fraud_decision_time', 'fraud']}
        card_es['transactions'].set_secondary_time_index(accepted)
        assert card_es.time_type == numeric_type

        rejected = [
            # timestamp type
            {'transaction_date': ['transaction_date', 'fraud']},
            # non-time type
            {'transaction_city': ['transaction_city', 'fraud']},
            # mixed secondary time indexes
            {'transaction_city': ['transaction_city', 'fraud'],
             'fraud_decision_time': ['fraud_decision_time', 'fraud']},
        ]
        for secondary_index in rejected:
            with pytest.raises(TypeError):
                card_es['transactions'].set_secondary_time_index(secondary_index)
예제 #41
0
def test_add_dataframe_with_make_index():
    """Add a Dask dataframe to an EntitySet with ``make_index=True``.

    The computed dataframe must equal a pandas frame holding the original
    ``values`` followed by a ``new_index`` column of 0..n-1.
    """
    values = [1, 12, -23, 27]
    source_df = pd.DataFrame({"values": values})

    dask_es = EntitySet(id="dask_es")
    dask_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=dd.from_pandas(source_df, npartitions=2),
        make_index=True,
        index="new_index",
        logical_types={"values": Integer},
    )

    expected_df = pd.DataFrame({
        "values": values,
        "new_index": range(len(values)),
    })
    actual_df = dask_es['new_dataframe'].compute()
    pd.testing.assert_frame_equal(expected_df, actual_df)
예제 #42
0
def test_single_table_ks_entityset_dates_not_sorted():
    """Compare dfs output of a ``ks``-backed and a pandas-backed EntitySet.

    Both EntitySets are built from the same frame, whose ``dates`` column is
    not in chronological order. The pandas feature matrix is cast to the
    dtypes of the ``ks`` result before the frames are compared.
    """
    unsorted_dates = [
        pd.to_datetime(day)
        for day in ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    ]
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": unsorted_dates,
    })
    primitives_list = ['absolute', 'is_weekend', 'year', 'day']
    ltypes = {
        "values": Integer,
        "dates": Datetime,
    }
    # options shared by both dfs runs
    dfs_options = {
        "target_dataframe_name": "data",
        "trans_primitives": primitives_list,
        "max_depth": 1,
    }

    # ks-backed entityset
    ks_es = EntitySet(id="ks_es")
    ks_frame = ks.from_pandas(df)
    ks_es.add_dataframe(dataframe_name="data",
                        dataframe=ks_frame,
                        index="id",
                        time_index="dates",
                        logical_types=ltypes)
    ks_fm, _ = ft.dfs(entityset=ks_es, **dfs_options)

    # pandas-backed reference entityset
    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        time_index="dates",
                        logical_types=ltypes)
    fm, _ = ft.dfs(entityset=pd_es, **dfs_options)

    # align the ks result on the pandas feature matrix before comparing
    ks_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    fm_as_ks_dtypes = fm.astype(ks_fm.dtypes)
    pd.testing.assert_frame_equal(fm_as_ks_dtypes, ks_fm)
예제 #43
0
def test_make_time_index_keeps_original_sorting():
    """``normalize_entity`` with ``make_time_index`` must not reorder ``trips``.

    The ``trip_id`` column of the ``trips`` entity is compared against
    0..999 right after the entity is created and again after a ``flights``
    entity has been normalized out of it.
    """
    row_count = 1000
    trips = {
        'trip_id': list(range(row_count - 1, -1, -1)),
        'flight_time': [datetime(1997, 4, 1)] * row_count,
        'flight_id': [1] * 350 + [2] * 650,
    }
    order = list(range(row_count))
    df = pd.DataFrame.from_dict(trips)

    es = EntitySet('flights')
    es.entity_from_dataframe(entity_id="trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    trip_ids_before = es['trips'].df['trip_id']
    assert (trip_ids_before == order).all()

    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    trip_ids_after = es['trips'].df['trip_id']
    assert (trip_ids_after == order).all()
예제 #44
0
def test_already_sorted_parameter():
    """With ``already_sorted=True`` the time index keeps the input row order.

    The input times are deliberately not chronological; the entity's
    ``transaction_time`` column must come back in the same order.
    """
    unsorted_times = [
        datetime(2014, 4, 6),
        datetime(2012, 4, 8),
        datetime(2012, 4, 8),
        datetime(2013, 4, 8),
        datetime(2015, 4, 8),
        datetime(2016, 4, 9),
    ]
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "transaction_time": unsorted_times,
    })

    es = EntitySet(id='test')
    es.entity_from_dataframe(entity_id='t',
                             dataframe=transactions_df,
                             index='id',
                             time_index="transaction_time",
                             already_sorted=True)

    stored_times = es["t"].df.transaction_time.tolist()
    input_times = transactions_df.transaction_time.tolist()
    assert stored_times == input_times
예제 #45
0
def test_sets_time_when_adding_entity():
    """Check how ``time_type`` is set and enforced as entities are added.

    A new EntitySet has no ``time_type``. Adding an entity with a numeric
    time index sets it to ``NumericTimeIndex`` and ``normalize_entity`` keeps
    it. Adding an entity whose time index is a datetime column, or an
    all-string column, raises ``TypeError`` with the expected message.
    """
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True],
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": [datetime(2002, 5, 1),
                        datetime(2006, 3, 20),
                        datetime(2011, 11, 11)],
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"],
    })
    expected_time_type = variable_types.NumericTimeIndex

    # an empty entityset has no time type
    fraud_es = EntitySet("fraud")
    assert getattr(fraud_es, "time_type", None) is None

    # adding a time-indexed entity sets it
    fraud_es.entity_from_dataframe(entity_id="transactions",
                                   dataframe=transactions_df,
                                   index="id",
                                   time_index="transaction_time")
    assert fraud_es.time_type == expected_time_type

    # adding another entity through normalization leaves it unchanged
    fraud_es.normalize_entity(base_entity_id="transactions",
                              new_entity_id="cards",
                              index="card_id",
                              make_time_index=True)
    assert fraud_es.time_type == expected_time_type

    rejected_accounts = [
        # wrong time type entity
        (accounts_df,
         "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"),
        # non time type as time index
        (accounts_df_string,
         "Attempted to convert all string column signup_date to numeric"),
    ]
    for accounts, error_text in rejected_accounts:
        with pytest.raises(TypeError, match=error_text):
            fraud_es.entity_from_dataframe(entity_id="accounts",
                                           dataframe=accounts,
                                           index="id",
                                           time_index="signup_date")
예제 #46
0
    def test_calculates_statistics_on_init(self):
        """Check the statistics exposed by variables once an entity is created.

        Asserts ``count`` on several variables, that ``nunique`` and
        ``percent_unique`` raise ``AttributeError`` on the time and numeric
        variables, and the categorical and boolean statistics.
        """
        timestamps = [datetime(2011, 4, 9, 10, 31, 3 * step)
                      for step in range(3)]
        df = pd.DataFrame({
            'id': [0, 1, 2],
            'time': timestamps,
            'category': ['a', 'b', 'a'],
            'number': [4, 5, 6],
            'boolean': [True, False, True],
            'boolean_with_nan': [True, False, np.nan],
        })
        vtypes = {
            'id': variable_types.Categorical,
            'time': variable_types.Datetime,
            'category': variable_types.Categorical,
            'number': variable_types.Numeric,
            'boolean': variable_types.Boolean,
            'boolean_with_nan': variable_types.Boolean,
        }

        entityset = EntitySet(id='test')
        entityset.entity_from_dataframe(
            'stats_test_entity', df, 'id', variable_types=vtypes)
        stats_entity = entityset["stats_test_entity"]

        for column in ('time', 'category', 'number'):
            assert stats_entity[column].count == 3

        # time and numeric variables don't have nunique or percent_unique
        for column in ('time', 'number'):
            with pytest.raises(AttributeError):
                stats_entity[column].nunique
            with pytest.raises(AttributeError):
                stats_entity[column].percent_unique

        # 'id' column automatically parsed as id
        assert stats_entity['id'].count == 3

        # categoricals have nunique and percent_unique defined
        category_variable = stats_entity['category']
        assert category_variable.nunique == 2
        assert category_variable.percent_unique == 2.0 / 3

        # booleans have count and number of true/false labels defined
        boolean_variable = stats_entity['boolean']
        assert boolean_variable.count == 3
        assert boolean_variable.num_true == 2
        assert boolean_variable.num_false == 1
예제 #47
0
def test_converts_variable_types_on_init():
    """Columns declared ``Numeric`` end up with a numeric pandas dtype.

    ``ints`` and ``floats`` start out as string columns; after the entity is
    created their dtype names must be in ``PandasTypes._pandas_numerics``.
    The ``category_int`` column, given no explicit variable type, must come
    out as ``Categorical``.
    """
    df = pd.DataFrame({
        'id': [0, 1, 2],
        'category': ['a', 'b', 'a'],
        'category_int': [1, 2, 3],
        'ints': ['1', '2', '3'],
        'floats': ['1', '2', '3.0'],
    })
    df["category_int"] = df["category_int"].astype("category")

    vtypes = {
        'id': variable_types.Categorical,
        'ints': variable_types.Numeric,
        'floats': variable_types.Numeric,
    }
    entityset = EntitySet(id='test')
    entityset.entity_from_dataframe(entity_id='test_entity',
                                    index='id',
                                    variable_types=vtypes,
                                    dataframe=df)

    entity_df = entityset['test_entity'].df
    for column in ('ints', 'floats'):
        dtype_name = entity_df[column].dtype.name
        assert dtype_name in variable_types.PandasTypes._pandas_numerics

    # this one is inferred from the pandas dtype
    category_int = entityset["test_entity"]['category_int']
    assert isinstance(category_int, variable_types.Categorical)
예제 #48
0
 def test_sets_time_when_adding_entity(self):
     """Check how ``time_type`` is set and enforced as entities are added.

     A new EntitySet has no ``time_type``. Adding an entity with a numeric
     time index sets it to ``NumericTimeIndex`` and ``normalize_entity``
     keeps it. Adding an entity whose time index is a datetime column, or an
     all-string column, raises ``TypeError``.
     """
     transactions_df = pd.DataFrame({
         "id": [1, 2, 3, 4, 5, 6],
         "card_id": [1, 2, 1, 3, 4, 5],
         "transaction_time": [10, 12, 13, 20, 21, 20],
         "fraud": [True, False, False, False, True, True],
     })
     accounts_df = pd.DataFrame({
         "id": [3, 4, 5],
         "signup_date": [datetime(2002, 5, 1),
                         datetime(2006, 3, 20),
                         datetime(2011, 11, 11)],
     })
     accounts_df_string = pd.DataFrame({
         "id": [3, 4, 5],
         "signup_date": ["element", "exporting", "editable"],
     })

     # an empty entityset has no time type
     fraud_es = EntitySet("fraud")
     assert getattr(fraud_es, "time_type", None) is None

     # adding a time-indexed entity sets it
     fraud_es.entity_from_dataframe(entity_id="transactions",
                                    dataframe=transactions_df,
                                    index="id",
                                    time_index="transaction_time")
     assert fraud_es.time_type == variable_types.NumericTimeIndex

     # adding another entity through normalization leaves it unchanged
     fraud_es.normalize_entity(base_entity_id="transactions",
                               new_entity_id="cards",
                               index="card_id",
                               make_time_index=True)
     assert fraud_es.time_type == variable_types.NumericTimeIndex

     # first a wrong time type entity, then a non time type as time index
     for rejected_accounts in (accounts_df, accounts_df_string):
         with pytest.raises(TypeError):
             fraud_es.entity_from_dataframe(entity_id="accounts",
                                            dataframe=rejected_accounts,
                                            index="id",
                                            time_index="signup_date")