Exemplo n.º 1
0
def test_normalize_ww_init():
    """Check that Woodwork names are kept when a dataframe is added and normalized."""
    source_name = 'test_name'
    derived_name = 'new_df'

    frame = pd.DataFrame({
        'id': [1, 2, 3, 4],
        'col': ['a', 'b', 'c', 'd'],
        'df2_id': [1, 1, 2, 2],
        'df2_col': [True, False, True, True],
    })
    frame.ww.init(index='id', name=source_name)

    entityset = EntitySet()
    entityset.add_dataframe(dataframe=frame)

    # The name given at Woodwork init is visible on the accessor and the schema.
    assert entityset[source_name].ww.name == source_name
    assert entityset[source_name].ww.schema.name == source_name

    entityset.normalize_dataframe(
        source_name,
        derived_name,
        'df2_id',
        additional_columns=['df2_col'],
    )

    # After normalization both the original and the new dataframe carry their names.
    for table_name in (source_name, derived_name):
        assert entityset[table_name].ww.name == table_name
        assert entityset[table_name].ww.schema.name == table_name
Exemplo n.º 2
0
def test_normalize_ww_init():
    """Woodwork dataframe names are preserved by add_dataframe and normalize_dataframe."""
    base_name, child_name = "test_name", "new_df"

    raw = pd.DataFrame({
        "id": [1, 2, 3, 4],
        "col": ["a", "b", "c", "d"],
        "df2_id": [1, 1, 2, 2],
        "df2_col": [True, False, True, True],
    })
    raw.ww.init(index="id", name=base_name)

    entityset = EntitySet()
    entityset.add_dataframe(dataframe=raw)

    def assert_named(expected):
        # Both the accessor and its schema should report the same name.
        assert entityset[expected].ww.name == expected
        assert entityset[expected].ww.schema.name == expected

    assert_named(base_name)

    entityset.normalize_dataframe(
        base_name,
        child_name,
        "df2_id",
        additional_columns=["df2_col"],
    )

    assert_named(base_name)
    assert_named(child_name)
Exemplo n.º 3
0
def test_extra_woodwork_params(es):
    """Typing arguments given alongside a Woodwork-initialized dataframe only trigger a warning."""

    def assert_typing_untouched(frame):
        # The typing information present before the call must be unchanged after it.
        assert frame.ww.index == "id"
        assert frame.ww.time_index is None
        assert isinstance(frame.ww.logical_types["id"], Integer)

    sessions_df = es["sessions"].ww.copy()
    assert_typing_untouched(sessions_df)

    expected_warning = (
        "A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: "
        "index, time_index, logical_types, make_index, semantic_tags, already_sorted"
    )
    # Every one of these conflicts with the typing already set on the dataframe.
    ignored_arguments = {
        "index": "filepath",
        "time_index": "customer_id",
        "logical_types": {"id": Categorical},
        "make_index": True,
        "already_sorted": True,
        "semantic_tags": {"id": "new_tag"},
    }

    new_es = EntitySet()
    with pytest.warns(UserWarning, match=expected_warning):
        new_es.add_dataframe(
            dataframe_name="sessions",
            dataframe=sessions_df,
            **ignored_arguments,
        )

    assert_typing_untouched(sessions_df)
    assert "new_tag" not in sessions_df.ww.semantic_tags
Exemplo n.º 4
0
def test_replace_dataframe():
    """Replacing a dataframe with a slice of its rows keeps the original schema."""
    table_name = 'table'
    column_data = {
        'id': range(4),
        'full_name': [
            'Mr. John Doe',
            'Doe, Mrs. Jane',
            'James Brown',
            'Ms. Paige Turner',
        ],
        'email': [
            '*****@*****.**',
            np.nan,
            '*****@*****.**',
            '*****@*****.**',
        ],
        'phone_number': [
            '5555555555',
            '555-555-5555',
            '1-(555)-555-5555',
            '555-555-5555',
        ],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    }
    frame = pd.DataFrame(column_data)
    frame.ww.init(name=table_name, index='id')

    entityset = EntitySet('es')
    entityset.add_dataframe(frame)
    schema_before = entityset[table_name].ww.schema

    # Swap in only the last two rows of the original data.
    trailing_rows = frame.iloc[2:]
    entityset.replace_dataframe(table_name, trailing_rows)

    assert len(entityset[table_name]) == 2
    assert entityset[table_name].ww.schema == schema_before
Exemplo n.º 5
0
def test_extra_woodwork_params(es):
    """Extra typing parameters are ignored, with a warning, for a Woodwork dataframe."""
    sessions_df = es['sessions'].ww.copy()

    def check_original_typing():
        # Index, time index and the logical type of 'id' as they are before the call.
        assert sessions_df.ww.index == 'id'
        assert sessions_df.ww.time_index is None
        assert isinstance(sessions_df.ww.logical_types['id'], Integer)

    check_original_typing()

    warning_msg = (
        'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: '
        'index, time_index, logical_types, make_index, semantic_tags, already_sorted'
    )
    conflicting_kwargs = dict(
        index='filepath',
        time_index='customer_id',
        logical_types={'id': Categorical},
        make_index=True,
        already_sorted=True,
        semantic_tags={'id': 'new_tag'},
    )

    new_es = EntitySet()
    with pytest.warns(UserWarning, match=warning_msg):
        new_es.add_dataframe(
            dataframe_name='sessions',
            dataframe=sessions_df,
            **conflicting_kwargs,
        )

    # Nothing from the conflicting arguments was applied to the dataframe.
    check_original_typing()
    assert 'new_tag' not in sessions_df.ww.semantic_tags
Exemplo n.º 6
0
def test_replace_dataframe_data_transformation(latlong_df):
    """Check LatLong values in the EntitySet after adding and after replacing the dataframe."""
    table_name = 'latlongs'

    initial_df = latlong_df.copy()
    latlong_types = {column: 'LatLong' for column in initial_df.columns}
    initial_df.ww.init(
        name=table_name,
        index='string_tuple',
        logical_types=latlong_types,
    )

    entityset = EntitySet()
    entityset.add_dataframe(dataframe=initial_df)

    # First row after the initial add: a tuple, or a list for a ks.DataFrame.
    as_pandas = to_pandas(entityset[table_name])
    if ks and isinstance(entityset[table_name], ks.DataFrame):
        first_expected = [1, 2]
    else:
        first_expected = (1, 2)
    for column in latlong_df.columns:
        assert as_pandas[column].iloc[0] == first_expected

    entityset.replace_dataframe(table_name, latlong_df)

    # Last row after replacing with the original (not yet initialized) dataframe.
    as_pandas = to_pandas(entityset[table_name])
    if ks and isinstance(entityset[table_name], ks.DataFrame):
        last_expected = [3, 4]
    else:
        last_expected = (3, 4)
    for column in latlong_df.columns:
        assert as_pandas[column].iloc[-1] == last_expected
Exemplo n.º 7
0
def test_replace_dataframe_different_dataframe_types():
    """Replacing a ``dd.from_pandas`` dataframe with the plain pandas frame raises TypeError."""
    session_times = [
        pd.to_datetime(day)
        for day in ('2019-01-10', '2019-02-03', '2019-01-01', '2017-08-25')
    ]
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": session_times,
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)

    typing_kwargs = {
        "index": "id",
        "time_index": "time",
        "logical_types": {
            "id": Integer,
            "user": Integer,
            "time": Datetime,
            "strings": NaturalLanguage,
        },
        "semantic_tags": {'user': '******'},
    }

    dask_es = EntitySet(id="dask_es")
    dask_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_dask,
        **typing_kwargs,
    )

    # The replacement is the pandas frame, not the one built with dd.from_pandas.
    with pytest.raises(TypeError, match='Incorrect DataFrame type used'):
        dask_es.replace_dataframe('sessions', sessions)
Exemplo n.º 8
0
def test_normalize_dataframe():
    """Normalizing on 'age' yields two dataframes and tags 'age' as a foreign key."""
    people = pd.DataFrame({
        'id': range(4),
        'full_name': [
            'Mr. John Doe',
            'Doe, Mrs. Jane',
            'James Brown',
            'Ms. Paige Turner',
        ],
        'email': [
            '*****@*****.**',
            np.nan,
            '*****@*****.**',
            '*****@*****.**',
        ],
        'phone_number': [
            '5555555555',
            '555-555-5555',
            '1-(555)-555-5555',
            '555-555-5555',
        ],
        'age': pd.Series([33, None, 33, 57], dtype='Int64'),
        'signup_date': [pd.to_datetime('2020-09-01')] * 4,
        'is_registered': pd.Series([True, False, True, None], dtype='boolean'),
    })
    people.ww.init(name='first_table', index='id', time_index='signup_date')

    entityset = EntitySet('es')
    entityset.add_dataframe(people)
    entityset.normalize_dataframe(
        'first_table',
        'second_table',
        'age',
        additional_columns=['phone_number', 'full_name'],
        make_time_index=True,
    )

    assert len(entityset.dataframe_dict) == 2
    age_tags = entityset['first_table'].ww.semantic_tags['age']
    assert 'foreign_key' in age_tags
Exemplo n.º 9
0
def test_dataframe_without_name(es):
    """Adding a dataframe that has no Woodwork schema and no name raises ValueError.

    ``pytest.raises(match=...)`` interprets its argument as a regular
    expression, and the expected message contains literal periods. The message
    is therefore passed through ``re.escape`` so the periods are matched
    literally instead of as "any character".
    """
    import re

    new_es = EntitySet()

    new_df = es['sessions'].copy()

    # The copied frame is expected to carry no Woodwork schema (and so no name).
    assert new_df.ww.schema is None

    error = 'Cannot add dataframe to EntitySet without a name. Please provide a value for the dataframe_name parameter.'
    with pytest.raises(ValueError, match=re.escape(error)):
        new_es.add_dataframe(new_df)
Exemplo n.º 10
0
def test_woodwork_dataframe_without_name_errors(es):
    """A Woodwork dataframe whose schema name is None cannot be added to an EntitySet."""
    unnamed_df = es['sessions'].ww.copy()
    # Clear the name directly on the private schema object.
    unnamed_df.ww._schema.name = None
    assert unnamed_df.ww.name is None

    target_es = EntitySet()
    expected_message = 'Cannot add a Woodwork DataFrame to EntitySet without a name'
    with pytest.raises(ValueError, match=expected_message):
        target_es.add_dataframe(unnamed_df)
Exemplo n.º 11
0
def test_woodwork_dataframe_same_name_parameter(es):
    """A ``dataframe_name`` equal to the Woodwork schema name is accepted."""
    shared_name = "df_name"

    named_df = es["sessions"].ww.copy()
    # Set the name directly on the private schema object.
    named_df.ww._schema.name = shared_name
    assert named_df.ww.name == shared_name

    target_es = EntitySet()
    target_es.add_dataframe(named_df, dataframe_name=shared_name)

    assert target_es[shared_name].ww.name == shared_name
Exemplo n.º 12
0
def test_woodwork_dataframe_same_name_parameter(es):
    """Passing the schema's own name as ``dataframe_name`` adds the dataframe under that name."""
    name = 'df_name'

    sessions_copy = es['sessions'].ww.copy()
    sessions_copy.ww._schema.name = name  # rename through the private schema
    assert sessions_copy.ww.name == name

    fresh_es = EntitySet()
    fresh_es.add_dataframe(sessions_copy, dataframe_name=name)

    stored = fresh_es[name]
    assert stored.ww.name == name
Exemplo n.º 13
0
def test_dataframe_with_name_parameter(es):
    """A dataframe without a schema is named from the ``dataframe_name`` argument."""
    id_only_df = es['sessions'][['id']]
    # The column selection is expected to carry no Woodwork schema.
    assert id_only_df.ww.schema is None

    add_kwargs = {
        'dataframe_name': 'df_name',
        'index': 'id',
        'logical_types': {'id': 'Integer'},
    }
    target_es = EntitySet()
    target_es.add_dataframe(id_only_df, **add_kwargs)

    assert target_es['df_name'].ww.name == 'df_name'
Exemplo n.º 14
0
def test_add_secondary_time_index(dates_df):
    """The secondary time index given to add_dataframe is recorded in Woodwork metadata."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='dates_backwards',
    )

    requested = {'repeating_dates': ['random_order', 'special']}
    entityset = EntitySet('es')
    entityset.add_dataframe(dates_df, secondary_time_index=requested)

    # The stored column list also ends with the secondary time column itself.
    stored = dates_df.ww.metadata['secondary_time_index']
    assert stored == {
        'repeating_dates': ['random_order', 'special', 'repeating_dates'],
    }
Exemplo n.º 15
0
def test_dataframe_with_name_parameter(es):
    """``dataframe_name`` becomes the Woodwork name of a dataframe added without a schema."""
    expected_name = "df_name"

    plain_df = es["sessions"][["id"]]
    assert plain_df.ww.schema is None

    fresh_es = EntitySet()
    fresh_es.add_dataframe(
        plain_df,
        dataframe_name=expected_name,
        index="id",
        logical_types={"id": "Integer"},
    )

    added = fresh_es[expected_name]
    assert added.ww.name == expected_name
Exemplo n.º 16
0
def test_add_secondary_time_index(dates_df):
    """add_dataframe stores the secondary time index in the dataframe's Woodwork metadata."""
    init_kwargs = {
        "name": "dates_table",
        "index": "backwards_order",
        "time_index": "dates_backwards",
    }
    dates_df.ww.init(**init_kwargs)

    entityset = EntitySet("es")
    entityset.add_dataframe(
        dates_df,
        secondary_time_index={"repeating_dates": ["random_order", "special"]},
    )

    # The secondary time column is included in its own column list.
    expected_metadata = {
        "repeating_dates": ["random_order", "special", "repeating_dates"],
    }
    assert dates_df.ww.metadata["secondary_time_index"] == expected_metadata
Exemplo n.º 17
0
def test_woodwork_dataframe_ignore_conflicting_name_parameter_warning(es):
    """A ``dataframe_name`` that conflicts with the schema name is ignored with a warning."""
    schema_name = 'df_name'

    named_df = es['sessions'].ww.copy()
    named_df.ww._schema.name = schema_name  # rename through the private schema
    assert named_df.ww.name == schema_name

    expected_warning = 'A Woodwork-initialized DataFrame was provided, so the following parameters were ignored: dataframe_name'
    target_es = EntitySet()
    with pytest.warns(UserWarning, match=expected_warning):
        target_es.add_dataframe(named_df, dataframe_name='conflicting_name')

    # The dataframe is stored under the schema name, not the conflicting one.
    assert target_es[schema_name].ww.name == schema_name
Exemplo n.º 18
0
def test_time_type_check_order(dates_df):
    """A failing time type check leaves no secondary time index in the metadata."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='random_order',
    )

    secondary = {'repeating_dates': ['random_order', 'special']}
    expected_message = 'dates_table time index is Datetime type which differs from other entityset time indexes'

    entityset = EntitySet('es')
    with pytest.raises(TypeError, match=expected_message):
        entityset.add_dataframe(dates_df, secondary_time_index=secondary)

    # The rejected secondary time index must not have been written to metadata.
    assert 'secondary_time_index' not in dates_df.ww.metadata
Exemplo n.º 19
0
def test_init_with_mismatched_time_types(dates_df):
    """Adding a numeric-time-index dataframe to a Datetime EntitySet raises TypeError."""
    dates_df.ww.init(
        name='dates_table',
        index='backwards_order',
        time_index='repeating_dates',
    )
    entityset = EntitySet('es')
    entityset.add_dataframe(
        dates_df,
        secondary_time_index={'special_dates': ['special']},
    )
    assert entityset.time_type == Datetime

    # Second dataframe whose time index holds plain integers.
    numeric_times = pd.DataFrame({'id': [1, 2, 3], 'times': [9, 8, 7]})
    numeric_times.ww.init(name='numerics_table', index='id', time_index='times')

    expected_message = 'numerics_table time index is numeric type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=expected_message):
        entityset.add_dataframe(numeric_times)
Exemplo n.º 20
0
def test_init_with_mismatched_time_types(dates_df):
    """An EntitySet with Datetime time type rejects a dataframe with a numeric time index."""
    datetime_init = {
        "name": "dates_table",
        "index": "backwards_order",
        "time_index": "repeating_dates",
    }
    dates_df.ww.init(**datetime_init)

    entityset = EntitySet("es")
    entityset.add_dataframe(
        dates_df,
        secondary_time_index={"special_dates": ["special"]},
    )
    assert entityset.time_type == Datetime

    # Build a dataframe whose time index column contains integers.
    integer_times = pd.DataFrame({"id": [1, 2, 3], "times": [9, 8, 7]})
    integer_times.ww.init(name="numerics_table", index="id", time_index="times")

    mismatch_message = "numerics_table time index is numeric type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=mismatch_message):
        entityset.add_dataframe(integer_times)
Exemplo n.º 21
0
def test_time_type_check_order(dates_df):
    """add_dataframe raises on mismatched time types before storing the secondary time index."""
    init_kwargs = {
        "name": "dates_table",
        "index": "backwards_order",
        "time_index": "random_order",
    }
    dates_df.ww.init(**init_kwargs)

    entityset = EntitySet("es")
    mismatch_message = "dates_table time index is Datetime type which differs from other entityset time indexes"
    requested_secondary = {"repeating_dates": ["random_order", "special"]}

    with pytest.raises(TypeError, match=mismatch_message):
        entityset.add_dataframe(dates_df, secondary_time_index=requested_secondary)

    # Nothing about the secondary time index was recorded on the dataframe.
    assert "secondary_time_index" not in dates_df.ww.metadata
Exemplo n.º 22
0
def test_add_dataframe_to_es(df):
    """Adding with explicit typing and adding the Woodwork copy produce equal results."""
    # First EntitySet: typing information is supplied through add_dataframe.
    explicit_es = EntitySet('es')
    assert explicit_es.dataframe_dict == {}
    explicit_es.add_dataframe(
        df,
        dataframe_name='table',
        index='id',
        semantic_tags={'category': 'new_tag'},
    )
    assert len(explicit_es.dataframe_dict) == 1

    # Second EntitySet: the Woodwork copy of the same dataframe is added as-is.
    woodwork_copy = df.ww.copy()
    copied_es = EntitySet('es')
    assert copied_es.dataframe_dict == {}
    copied_es.add_dataframe(woodwork_copy)
    assert len(copied_es.dataframe_dict) == 1

    assert explicit_es['table'].ww == copied_es['table'].ww
Exemplo n.º 23
0
def test_add_dataframe_to_es(df):
    """A dataframe added with typing arguments equals its Woodwork copy added to another EntitySet."""
    table_name = "table"

    first_es = EntitySet("es")
    assert first_es.dataframe_dict == {}
    first_kwargs = {
        "dataframe_name": table_name,
        "index": "id",
        "semantic_tags": {"category": "new_tag"},
    }
    first_es.add_dataframe(df, **first_kwargs)
    assert len(first_es.dataframe_dict) == 1

    # Copy taken after the first add, then added without any extra arguments.
    initialized_copy = df.ww.copy()

    second_es = EntitySet("es")
    assert second_es.dataframe_dict == {}
    second_es.add_dataframe(initialized_copy)
    assert len(second_es.dataframe_dict) == 1

    assert first_es[table_name].ww == second_es[table_name].ww
Exemplo n.º 24
0
def test_int_double_time_type(dates_df):
    """An Integer time index and a Double secondary time index can be used together."""
    table_name = 'dates_table'
    numeric_types = {
        'random_order': 'Integer',
        'special': 'Double',
    }
    dates_df.ww.init(
        name=table_name,
        index='backwards_order',
        time_index='random_order',
        logical_types=numeric_types,
    )

    entityset = EntitySet('es')
    # random_order and special are both numeric, yet their logical types differ.
    entityset.add_dataframe(
        dates_df,
        secondary_time_index={'special': ['dates_backwards']},
    )

    logical_types = entityset[table_name].ww.logical_types
    assert isinstance(logical_types['random_order'], Integer)
    assert isinstance(entityset[table_name].ww.logical_types['special'], Double)

    assert entityset[table_name].ww.time_index == 'random_order'
    secondary = entityset[table_name].ww.metadata['secondary_time_index']
    assert 'special' in secondary
Exemplo n.º 25
0
def test_normalize_dataframe():
    """normalize_dataframe on "age" creates a second dataframe and marks "age" as a foreign key."""
    full_names = [
        "Mr. John Doe",
        "Doe, Mrs. Jane",
        "James Brown",
        "Ms. Paige Turner",
    ]
    emails = [
        "*****@*****.**",
        np.nan,
        "*****@*****.**",
        "*****@*****.**",
    ]
    phone_numbers = [
        "5555555555",
        "555-555-5555",
        "1-(555)-555-5555",
        "555-555-5555",
    ]
    customers = pd.DataFrame({
        "id": range(4),
        "full_name": full_names,
        "email": emails,
        "phone_number": phone_numbers,
        "age": pd.Series([33, None, 33, 57], dtype="Int64"),
        "signup_date": [pd.to_datetime("2020-09-01")] * 4,
        "is_registered": pd.Series([True, False, True, None], dtype="boolean"),
    })
    customers.ww.init(name="first_table", index="id", time_index="signup_date")

    entityset = EntitySet("es")
    entityset.add_dataframe(customers)
    entityset.normalize_dataframe(
        "first_table",
        "second_table",
        "age",
        additional_columns=["phone_number", "full_name"],
        make_time_index=True,
    )

    dataframe_count = len(entityset.dataframe_dict)
    assert dataframe_count == 2
    assert "foreign_key" in entityset["first_table"].ww.semantic_tags["age"]
Exemplo n.º 26
0
def test_int_double_time_type(dates_df):
    """Integer and Double columns are accepted together as time index and secondary time index."""
    init_kwargs = {
        "name": "dates_table",
        "index": "backwards_order",
        "time_index": "random_order",
        "logical_types": {"random_order": "Integer", "special": "Double"},
    }
    dates_df.ww.init(**init_kwargs)

    entityset = EntitySet("es")

    # Both columns are numeric even though their logical types are different.
    secondary = {"special": ["dates_backwards"]}
    entityset.add_dataframe(dates_df, secondary_time_index=secondary)

    expected_types = (("random_order", Integer), ("special", Double))
    for column, logical_type in expected_types:
        assert isinstance(
            entityset["dates_table"].ww.logical_types[column],
            logical_type,
        )

    assert entityset["dates_table"].ww.time_index == "random_order"
    assert "special" in entityset["dates_table"].ww.metadata["secondary_time_index"]
Exemplo n.º 27
0
def test_replace_dataframe():
    """replace_dataframe with a row subset keeps the schema and updates the row count."""
    full_names = [
        "Mr. John Doe",
        "Doe, Mrs. Jane",
        "James Brown",
        "Ms. Paige Turner",
    ]
    emails = [
        "*****@*****.**",
        np.nan,
        "*****@*****.**",
        "*****@*****.**",
    ]
    phone_numbers = [
        "5555555555",
        "555-555-5555",
        "1-(555)-555-5555",
        "555-555-5555",
    ]
    original = pd.DataFrame({
        "id": range(4),
        "full_name": full_names,
        "email": emails,
        "phone_number": phone_numbers,
        "age": pd.Series([33, None, 33, 57], dtype="Int64"),
        "signup_date": [pd.to_datetime("2020-09-01")] * 4,
        "is_registered": pd.Series([True, False, True, None], dtype="boolean"),
    })
    original.ww.init(name="table", index="id")

    entityset = EntitySet("es")
    entityset.add_dataframe(original)
    expected_schema = entityset["table"].ww.schema

    # The replacement holds only the rows from position 2 onwards.
    replacement = original.iloc[2:]
    entityset.replace_dataframe("table", replacement)

    assert len(entityset["table"]) == 2
    assert entityset["table"].ww.schema == expected_schema
Exemplo n.º 28
0
def test_add_time_index_through_woodwork_different_type(dates_df):
    """Changing the time index type through Woodwork makes the uniformity check fail."""
    table_name = 'dates_table'
    dates_df.ww.init(
        name=table_name,
        index='backwards_order',
        time_index='dates_backwards',
    )

    entityset = EntitySet('es')
    entityset.add_dataframe(
        dates_df,
        secondary_time_index={'repeating_dates': ['random_order', 'special']},
    )

    stored_secondary = dates_df.ww.metadata['secondary_time_index']
    assert stored_secondary == {
        'repeating_dates': ['random_order', 'special', 'repeating_dates'],
    }
    assert entityset.time_type == Datetime

    # With the original time index the check passes and returns None.
    assert entityset._check_uniform_time_index(entityset[table_name]) is None

    # Switch the time index directly through the Woodwork accessor.
    dates_df.ww.set_time_index('random_order')
    assert dates_df.ww.time_index == 'random_order'

    expected_message = 'dates_table time index is numeric type which differs from other entityset time indexes'
    with pytest.raises(TypeError, match=expected_message):
        entityset._check_uniform_time_index(entityset[table_name])
Exemplo n.º 29
0
def test_add_time_index_through_woodwork_different_type(dates_df):
    """_check_uniform_time_index raises once the time index is reset to a numeric column."""
    init_kwargs = {
        "name": "dates_table",
        "index": "backwards_order",
        "time_index": "dates_backwards",
    }
    dates_df.ww.init(**init_kwargs)

    entityset = EntitySet("es")
    requested_secondary = {"repeating_dates": ["random_order", "special"]}
    entityset.add_dataframe(dates_df, secondary_time_index=requested_secondary)

    expected_secondary = {
        "repeating_dates": ["random_order", "special", "repeating_dates"],
    }
    assert dates_df.ww.metadata["secondary_time_index"] == expected_secondary
    assert entityset.time_type == Datetime

    # Before the time index is changed, the check returns None.
    check_result = entityset._check_uniform_time_index(entityset["dates_table"])
    assert check_result is None

    # Point the time index at another column via the Woodwork accessor.
    dates_df.ww.set_time_index("random_order")
    assert dates_df.ww.time_index == "random_order"

    mismatch_message = "dates_table time index is numeric type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=mismatch_message):
        entityset._check_uniform_time_index(entityset["dates_table"])