Example #1
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
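A minimal round-trip sketch (not part of the function above), mirroring the pattern used in the serialization tests further below; the import path for the serialize/deserialize modules and the demo loader are assumptions:

import featuretools as ft
from featuretools.entityset import serialize, deserialize  # assumed import path

es = ft.demo.load_mock_customer(return_entityset=True)      # any populated EntitySet
description = serialize.entityset_to_description(es)

# A description built in memory carries no 'path', so the empty_dataframe branch
# above is taken: schema and relationships are restored, but the dataframes come
# back empty until data is re-read from disk.
restored = deserialize.description_to_entityset(description)
assert restored.id == es.id
assert len(restored.dataframes) == len(es.dataframes)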
Example #2
def datetime_es():
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "card_id": [1, 1, 5, 1, 5],
            "transaction_time": pd.to_datetime(
                [
                    "2011-2-28 04:00",
                    "2012-2-28 05:00",
                    "2012-2-29 06:00",
                    "2012-3-1 08:00",
                    "2014-4-1 10:00",
                ]
            ),
            "fraud": [True, False, False, False, True],
        }
    )

    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time",
    )

    datetime_es = datetime_es.add_dataframe(
        dataframe_name="cards", dataframe=cards_df, index="id"
    )

    datetime_es = datetime_es.add_relationship("cards", "id", "transactions", "card_id")
    datetime_es.add_last_time_indexes()
    return datetime_es
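A hedged usage sketch (not part of the fixture above), treating datetime_es() as a plain constructor; the primitive choices and cutoff value are illustrative assumptions:

import pandas as pd
import featuretools as ft

es = datetime_es()
fm, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="cards",               # aggregate transactions per card
    agg_primitives=["count", "percent_true"],    # assumed primitives; percent_true uses the boolean 'fraud'
    cutoff_time=pd.Timestamp("2012-03-31"),      # only transactions at or before this time are used
)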
Example #3
def make_ecommerce_entityset(with_integer_time_index=False):
    """ Makes a entityset with the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    dataframe_names = dataframes.keys()
    es_id = 'ecommerce'
    if with_integer_time_index:
        es_id += "_int_time_index"

    logical_types = make_logical_types(
        with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for df_name in dataframe_names:
        time_index = time_indexes.get(df_name, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index['name']
            secondary = time_index['secondary']
        df = dataframes[df_name]
        es.add_dataframe(df,
                         dataframe_name=df_name,
                         index='id',
                         logical_types=logical_types[df_name],
                         semantic_tags=semantic_tags[df_name],
                         time_index=ti_name,
                         secondary_time_index=secondary)

    es.normalize_dataframe('customers',
                           'cohorts',
                           'cohort',
                           additional_columns=['cohort_name'],
                           make_time_index=True,
                           new_dataframe_time_index='cohort_end')

    es.add_relationships([(u'régions', 'id', 'customers', u'région_id'),
                          (u'régions', 'id', 'stores', u'région_id'),
                          ('customers', 'id', 'sessions', 'customer_id'),
                          ('sessions', 'id', 'log', 'session_id'),
                          ('products', 'id', 'log', 'product_id')])

    return es
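A short hedged check (not part of the helper above) of what normalize_dataframe did: the cohort columns move out of customers and into the new cohorts dataframe.

es = make_ecommerce_entityset()
cohorts = es["cohorts"]                          # created by normalize_dataframe above
assert "cohort_name" in cohorts.columns          # additional_columns moved into the new dataframe
assert "cohort_name" not in es["customers"].columns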
Example #4
def test_single_table_dask_entityset_with_instance_ids():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]
    instance_ids = [0, 1, 3]

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime("2019-01-10"),
            pd.to_datetime("2019-02-03"),
            pd.to_datetime("2019-01-01"),
            pd.to_datetime("2017-08-25"),
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })

    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage}
    dask_es.add_dataframe(dataframe_name="data",
                          dataframe=values_dd,
                          index="id",
                          logical_types=ltypes)

    dask_fm, _ = ft.dfs(
        entityset=dask_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        logical_types=ltypes)

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    # Make sure both indexes are sorted the same
    dask_fm = dask_fm.compute().astype({"id": "int64"})
    pd.testing.assert_frame_equal(fm,
                                  dask_fm.set_index("id").loc[fm.index],
                                  check_dtype=False)
Example #5
def test_single_table_spark_entityset_with_instance_ids():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]
    instance_ids = [0, 1, 3]

    spark_es = EntitySet(id="spark_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime("2019-01-10"),
            pd.to_datetime("2019-02-03"),
            pd.to_datetime("2019-01-01"),
            pd.to_datetime("2017-08-25"),
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })

    values_dd = ps.from_pandas(df)
    ltypes = {"values": Integer, "dates": Datetime, "strings": NaturalLanguage}
    spark_es.add_dataframe(dataframe_name="data",
                           dataframe=values_dd,
                           index="id",
                           logical_types=ltypes)

    spark_fm, _ = ft.dfs(
        entityset=spark_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        logical_types=ltypes)

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        instance_ids=instance_ids,
    )

    spark_fm = spark_fm.to_pandas().astype({"id": "int64"})
    spark_computed_fm = spark_fm.set_index("id").loc[fm.index]
    # Spark dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing
    pd.testing.assert_frame_equal(fm.astype(spark_computed_fm.dtypes),
                                  spark_computed_fm)
Example #6
def test_single_table_dask_entityset_cutoff_time_df():
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({"id": [0, 1, 2],
                       "values": [1, 12, -34],
                       "dates": [pd.to_datetime('2019-01-10'),
                                 pd.to_datetime('2019-02-03'),
                                 pd.to_datetime('2019-01-01')],
                       "strings": ["I am a string",
                                   "23",
                                   "abcdef ghijk"]})
    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {
        "values": IntegerNullable,
        "dates": Datetime,
        "strings": NaturalLanguage
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    ids = [0, 1, 2, 0]
    times = [pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-05 04:00"),
             pd.Timestamp("2019-01-15 04:00")]
    labels = [True, False, True, False]
    cutoff_times = pd.DataFrame({"id": ids, "time": times, "labels": labels}, columns=["id", "time", "labels"])

    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        cutoff_time=cutoff_times)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   cutoff_time=cutoff_times)
    # Because row ordering with Dask is not guaranteed, we need to sort on two columns to make sure that values
    # for instance id 0 are compared correctly. Also, make sure the boolean column has the same dtype.
    fm = fm.sort_values(['id', 'labels'])
    dask_fm = dask_fm.compute().set_index('id').sort_values(['id', 'labels'])
    dask_fm['IS_WEEKEND(dates)'] = dask_fm['IS_WEEKEND(dates)'].astype(fm['IS_WEEKEND(dates)'].dtype)
    pd.testing.assert_frame_equal(fm, dask_fm)
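A hedged note on the cutoff_time frame used above: only the "id" and "time" columns are required, an instance id may repeat with different cutoffs, and extra columns such as "labels" are passed through into the feature matrix unchanged, which is why both feature matrices can be sorted on it.

cutoff_times = pd.DataFrame({
    "id": [0, 1, 2, 0],                                                   # instance 0 is evaluated at two times
    "time": pd.to_datetime(["2019-01-05 04:00"] * 3 + ["2019-01-15 04:00"]),
    "labels": [True, False, True, False],                                 # pass-through column, not used for cutoffs
})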
Example #7
def test_single_table_dask_entityset_dates_not_sorted():
    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime("2019-01-10"),
            pd.to_datetime("2019-02-03"),
            pd.to_datetime("2019-01-01"),
            pd.to_datetime("2017-08-25"),
        ],
    })

    primitives_list = ["absolute", "is_weekend", "year", "day"]
    values_dd = dd.from_pandas(df, npartitions=1)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    dask_fm, _ = ft.dfs(
        entityset=dask_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        max_depth=1,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        max_depth=1,
    )

    dask_fm = dask_fm.compute().astype({"id": "int64"})
    pd.testing.assert_frame_equal(fm,
                                  dask_fm.set_index("id").loc[fm.index],
                                  check_dtype=False)
Example #8
def test_all_ww_logical_types():
    logical_types = list_logical_types()['type_string'].to_list()
    dataframe = pd.DataFrame(columns=logical_types)
    es = EntitySet()
    ltype_dict = {ltype: ltype for ltype in logical_types}
    ltype_dict['ordinal'] = Ordinal(order=[])
    es.add_dataframe(dataframe=dataframe,
                     dataframe_name='all_types',
                     index='integer',
                     logical_types=ltype_dict)
    description = serialize.entityset_to_description(es)
    _es = deserialize.description_to_entityset(description)
    assert es.__eq__(_es, deep=True)
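A hedged sketch of where the type_string values above come from: list_logical_types() is Woodwork's catalogue of registered logical types; the exact set of columns it returns may vary by Woodwork version.

from woodwork import list_logical_types

types_df = list_logical_types()
print(types_df["type_string"].head())    # the snake_case names accepted in logical_types dicts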
Example #9
def test_operations_invalidate_metadata(es):
    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert new_es._data_description is None
    assert new_es.metadata is not None  # generated after access
    assert new_es._data_description is not None
    if not isinstance(es['customers'], pd.DataFrame):
        customers_ltypes = es["customers"].ww.logical_types
        customers_ltypes['signup_date'] = Datetime
    else:
        customers_ltypes = None
    new_es.add_dataframe(es["customers"],
                         "customers",
                         index=es["customers"].index,
                         logical_types=customers_ltypes)
    if not isinstance(es['sessions'], pd.DataFrame):
        sessions_ltypes = es["sessions"].ww.logical_types
    else:
        sessions_ltypes = None
    new_es.add_dataframe(es["sessions"],
                         "sessions",
                         index=es["sessions"].index,
                         logical_types=sessions_ltypes)

    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.add_relationship("customers", "id", "sessions",
                                     "customer_id")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.normalize_dataframe("customers", "cohort", "cohort")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es.add_last_time_indexes()
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    # automatically adding interesting values not supported in Dask or Koalas
    if new_es.dataframe_type == Library.PANDAS.value:
        new_es.add_interesting_values()
        assert new_es._data_description is None
        assert new_es.metadata is not None
        assert new_es._data_description is not None
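A hedged, self-contained restatement of the invariant exercised above: .metadata lazily builds and caches a data description, and each structural mutation is expected to clear that cache so the next access rebuilds it.

from featuretools import EntitySet

es = EntitySet(id="cache_demo")
assert es._data_description is None     # nothing cached yet
_ = es.metadata                         # first access builds the cached description
assert es._data_description is not None
# add_dataframe, add_relationship, normalize_dataframe, add_last_time_indexes and
# add_interesting_values are each expected to reset _data_description to None,
# exactly as the test above asserts after every call.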
Example #10
def test_add_dataframe_with_make_index():
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    spark_df = ps.from_pandas(df)
    spark_es = EntitySet(id="spark_es")
    ltypes = {"values": "Integer"}
    spark_es.add_dataframe(
        dataframe_name="new_dataframe",
        dataframe=spark_df,
        make_index=True,
        index="new_index",
        logical_types=ltypes,
    )

    expected_df = pd.DataFrame({"values": values, "new_index": range(len(values))})
    pd.testing.assert_frame_equal(expected_df, spark_es["new_dataframe"].to_pandas())
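A hedged pandas equivalent of what make_index=True does above: when no unique id column exists, a new integer range column is created and used as the index.

df = pd.DataFrame({"values": [1, 12, -23, 27]})
df["new_index"] = range(len(df))         # what make_index effectively adds
# expected_df above is built the same way before being compared with the EntitySet copy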
Example #11
def test_single_table_dask_entityset_with_instance_ids():
    primitives_list = ['absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words']
    instance_ids = [0, 1, 3]

    dask_es = EntitySet(id="dask_es")
    df = pd.DataFrame({"id": [0, 1, 2, 3],
                       "values": [1, 12, -34, 27],
                       "dates": [pd.to_datetime('2019-01-10'),
                                 pd.to_datetime('2019-02-03'),
                                 pd.to_datetime('2019-01-01'),
                                 pd.to_datetime('2017-08-25')],
                       "strings": ["I am a string",
                                   "23",
                                   "abcdef ghijk",
                                   ""]})

    values_dd = dd.from_pandas(df, npartitions=2)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage
    }
    dask_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        logical_types=ltypes)

    dask_fm, _ = ft.dfs(entityset=dask_es,
                        target_dataframe_name="data",
                        trans_primitives=primitives_list,
                        instance_ids=instance_ids)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   instance_ids=instance_ids)

    # Make sure both indexes are sorted the same
    pd.testing.assert_frame_equal(fm, dask_fm.compute().set_index('id').loc[fm.index])
Example #12
def test_single_table_ks_entityset_ids_not_sorted():
    primitives_list = [
        'absolute', 'is_weekend', 'year', 'day', 'num_characters', 'num_words'
    ]

    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [2, 0, 1, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    values_dd = ks.from_pandas(df)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
        "strings": NaturalLanguage,
    }
    ks_es.add_dataframe(dataframe_name="data",
                        dataframe=values_dd,
                        index="id",
                        logical_types=ltypes)

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_dataframe_name="data",
                      trans_primitives=primitives_list)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list)

    ks_computed_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    # Koalas dtypes are different for categorical - set the pandas fm to have the same dtypes before comparing
    pd.testing.assert_frame_equal(fm.astype(ks_computed_fm.dtypes),
                                  ks_computed_fm)
Example #13
def test_add_dataframe_with_make_index():
    values = [1, 12, -23, 27]
    df = pd.DataFrame({"values": values})
    dask_df = dd.from_pandas(df, npartitions=2)
    dask_es = EntitySet(id="dask_es")
    logical_types = {"values": Integer}
    dask_es.add_dataframe(dataframe_name="new_dataframe",
                          dataframe=dask_df,
                          make_index=True,
                          index="new_index",
                          logical_types=logical_types)

    expected_df = pd.DataFrame({
        "values": values,
        "new_index": range(len(values))
    })
    pd.testing.assert_frame_equal(expected_df,
                                  dask_es['new_dataframe'].compute())
Example #14
def test_single_table_ks_entityset_dates_not_sorted():
    ks_es = EntitySet(id="ks_es")
    df = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "values": [1, 12, -34, 27],
        "dates": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ]
    })

    primitives_list = ['absolute', 'is_weekend', 'year', 'day']
    values_dd = ks.from_pandas(df)
    ltypes = {
        "values": Integer,
        "dates": Datetime,
    }
    ks_es.add_dataframe(dataframe_name="data",
                        dataframe=values_dd,
                        index="id",
                        time_index="dates",
                        logical_types=ltypes)

    ks_fm, _ = ft.dfs(entityset=ks_es,
                      target_dataframe_name="data",
                      trans_primitives=primitives_list,
                      max_depth=1)

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(dataframe_name="data",
                        dataframe=df,
                        index="id",
                        time_index="dates",
                        logical_types=ltypes)

    fm, _ = ft.dfs(entityset=pd_es,
                   target_dataframe_name="data",
                   trans_primitives=primitives_list,
                   max_depth=1)

    ks_fm = ks_fm.to_pandas().set_index('id').loc[fm.index]
    pd.testing.assert_frame_equal(fm.astype(ks_fm.dtypes), ks_fm)
Example #15
def description_to_entityset(description, **kwargs):
    """Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    """
    check_schema_version(description, "entityset")

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get("path")
    entityset = EntitySet(description["id"])

    for df in description["dataframes"].values():
        if path is not None:
            data_path = os.path.join(path, "data", df["name"])
            format = description.get("format")
            if format is not None:
                kwargs["format"] = format
                if format == "parquet" and df["loading_info"][
                        "table_type"] == "pandas":
                    kwargs["filename"] = df["name"] + ".parquet"
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description["relationships"]:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
Example #16
def test_add_dataframe(pd_es):
    dask_es = EntitySet(id="dask_es")
    log_dask = dd.from_pandas(pd_es["log"], npartitions=2)
    dask_es = dask_es.add_dataframe(
        dataframe_name="log_dask",
        dataframe=log_dask,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]))
    pd.testing.assert_frame_equal(pd_es["log"],
                                  dask_es["log_dask"].compute(),
                                  check_like=True)
Example #17
def test_with_custom_ww_logical_type():
    class CustomLogicalType(LogicalType):
        pass

    ww_type_system.add_type(CustomLogicalType)
    columns = ['integer', 'natural_language', 'custom_logical_type']
    dataframe = pd.DataFrame(columns=columns)
    es = EntitySet()
    ltype_dict = {
        'integer': 'integer',
        'natural_language': 'natural_language',
        'custom_logical_type': CustomLogicalType,
    }
    es.add_dataframe(dataframe=dataframe,
                     dataframe_name='custom_type',
                     index='integer',
                     logical_types=ltype_dict)
    description = serialize.entityset_to_description(es)
    _es = deserialize.description_to_entityset(description)
    assert isinstance(
        _es['custom_type'].ww.logical_types['custom_logical_type'],
        CustomLogicalType)
    assert es.__eq__(_es, deep=True)
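A hedged note on the registration step this test depends on: the custom type has to be added to Woodwork's global type system before it can be used in a logical_types dict or survive the description round trip; the import path and the cleanup call are assumptions about the Woodwork API.

from woodwork import type_system as ww_type_system   # assumed source of the test's ww_type_system
from woodwork.logical_types import LogicalType

class CustomLogicalType(LogicalType):
    pass

ww_type_system.add_type(CustomLogicalType)   # register before building the EntitySet
# ... build, serialize, and deserialize as in the test above ...
ww_type_system.reset_defaults()              # assumed cleanup that restores the default type registry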
Example #18
def test_add_dataframe_from_ks_df(pd_es):
    cleaned_df = pd_to_ks_clean(pd_es["log"])
    log_ks = ks.from_pandas(cleaned_df)

    ks_es = EntitySet(id="ks_es")
    ks_es = ks_es.add_dataframe(dataframe_name="log_ks",
                                dataframe=log_ks,
                                index="id",
                                time_index="datetime",
                                logical_types=pd_es["log"].ww.logical_types,
                                semantic_tags=get_df_tags(pd_es["log"]))
    pd.testing.assert_frame_equal(cleaned_df,
                                  ks_es["log_ks"].to_pandas(),
                                  check_like=True)
Example #19
def test_add_dataframe_from_spark_df(pd_es):
    cleaned_df = pd_to_spark_clean(pd_es["log"])
    log_spark = ps.from_pandas(cleaned_df)

    spark_es = EntitySet(id="spark_es")
    spark_es = spark_es.add_dataframe(
        dataframe_name="log_spark",
        dataframe=log_spark,
        index="id",
        time_index="datetime",
        logical_types=pd_es["log"].ww.logical_types,
        semantic_tags=get_df_tags(pd_es["log"]),
    )
    pd.testing.assert_frame_equal(
        cleaned_df, spark_es["log_spark"].to_pandas(), check_like=True
    )
Example #20
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    spark_es = EntitySet(id="spark_es")

    sessions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "user": [1, 2, 1, 3],
            "time": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk", ""],
        }
    )
    sessions_spark = ps.from_pandas(sessions)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "strings": NaturalLanguage,
        "time": Datetime,
    }

    transactions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3, 4, 5],
            "session_id": [0, 0, 1, 2, 2, 3],
            "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
            "time": [
                pd.to_datetime("2019-01-10 03:53"),
                pd.to_datetime("2019-01-10 04:12"),
                pd.to_datetime("2019-02-03 10:34"),
                pd.to_datetime("2019-01-01 12:35"),
                pd.to_datetime("2019-01-01 12:49"),
                pd.to_datetime("2017-08-25 04:53"),
            ],
        }
    )
    transactions_spark = ps.from_pandas(transactions)
    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "amount": Double,
        "time": Datetime,
    }

    pd_es.add_dataframe(
        dataframe_name="sessions", dataframe=sessions, index="id", time_index="time"
    )
    spark_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_spark,
        index="id",
        time_index="time",
        logical_types=sessions_logical_types,
    )

    pd_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions,
        index="id",
        time_index="time",
    )
    spark_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_spark,
        index="id",
        time_index="time",
        logical_types=transactions_logical_types,
    )

    pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id")
    spark_es = spark_es.add_relationship("sessions", "id", "transactions", "session_id")

    assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"]
    assert "foreign_key" in spark_es["transactions"].ww.semantic_tags["session_id"]

    assert pd_es["sessions"].ww.metadata.get("last_time_index") is None
    assert spark_es["sessions"].ww.metadata.get("last_time_index") is None

    pd_es.add_last_time_indexes()
    spark_es.add_last_time_indexes()

    pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index")
    spark_lti_name = spark_es["sessions"].ww.metadata.get("last_time_index")
    assert pd_lti_name == spark_lti_name
    pd.testing.assert_series_equal(
        pd_es["sessions"][pd_lti_name].sort_index(),
        spark_es["sessions"][spark_lti_name].to_pandas().sort_index(),
        check_names=False,
    )
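A hedged illustration, continuing from the test above, of what the new column holds: for each session, the latest time observed among its transactions (or its own time, if later). Session 0's two transactions put its last time index at 2019-01-10 04:12; session 3's single transaction puts it at 2017-08-25 04:53.

lti_name = pd_es["sessions"].ww.metadata["last_time_index"]
print(pd_es["sessions"][["id", lti_name]])   # one last-time value per session row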
Example #21
def make_ecommerce_entityset(with_integer_time_index=False):
    """Makes a entityset with the following shape:

      R         Regions
     / \\       .
    S   C       Stores, Customers
        |       .
        S   P   Sessions, Products
         \\ /   .
          L     Log
    """
    dataframes = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index
    )
    dataframe_names = dataframes.keys()
    es_id = "ecommerce"
    if with_integer_time_index:
        es_id += "_int_time_index"

    logical_types = make_logical_types(with_integer_time_index=with_integer_time_index)
    semantic_tags = make_semantic_tags()
    time_indexes = make_time_indexes(with_integer_time_index=with_integer_time_index)

    es = EntitySet(id=es_id)

    for df_name in dataframe_names:
        time_index = time_indexes.get(df_name, None)
        ti_name = None
        secondary = None
        if time_index is not None:
            ti_name = time_index["name"]
            secondary = time_index["secondary"]
        df = dataframes[df_name]
        es.add_dataframe(
            df,
            dataframe_name=df_name,
            index="id",
            logical_types=logical_types[df_name],
            semantic_tags=semantic_tags[df_name],
            time_index=ti_name,
            secondary_time_index=secondary,
        )

    es.normalize_dataframe(
        "customers",
        "cohorts",
        "cohort",
        additional_columns=["cohort_name"],
        make_time_index=True,
        new_dataframe_time_index="cohort_end",
    )

    es.add_relationships(
        [
            ("régions", "id", "customers", "région_id"),
            ("régions", "id", "stores", "région_id"),
            ("customers", "id", "sessions", "customer_id"),
            ("sessions", "id", "log", "session_id"),
            ("products", "id", "log", "product_id"),
        ]
    )

    return es
Example #22
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage
    }

    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [
            pd.to_datetime('2019-01-10 03:53'),
            pd.to_datetime('2019-01-10 04:12'),
            pd.to_datetime('2019-02-03 10:34'),
            pd.to_datetime('2019-01-01 12:35'),
            pd.to_datetime('2019-01-01 12:49'),
            pd.to_datetime('2017-08-25 04:53')
        ]
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)

    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "time": Datetime,
        "amount": Double
    }

    pd_es.add_dataframe(dataframe_name="sessions",
                        dataframe=sessions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="sessions",
                          dataframe=sessions_dask,
                          index="id",
                          time_index="time",
                          logical_types=sessions_logical_types)

    pd_es.add_dataframe(dataframe_name="transactions",
                        dataframe=transactions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="transactions",
                          dataframe=transactions_dask,
                          index="id",
                          time_index="time",
                          logical_types=transactions_logical_types)

    pd_es = pd_es.add_relationship("sessions", "id", "transactions",
                                   "session_id")
    dask_es = dask_es.add_relationship("sessions", "id", "transactions",
                                       "session_id")

    assert 'foreign_key' in pd_es['transactions'].ww.semantic_tags['session_id']
    assert 'foreign_key' in dask_es['transactions'].ww.semantic_tags['session_id']

    assert pd_es['sessions'].ww.metadata.get('last_time_index') is None
    assert dask_es['sessions'].ww.metadata.get('last_time_index') is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index')
    dask_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index')
    assert pd_lti_name == dask_lti_name
    pd.testing.assert_series_equal(
        pd_es['sessions'][pd_lti_name].sort_index(),
        dask_es['sessions'][dask_lti_name].compute().sort_index(),
        check_names=False)
Example #23
def test_single_table_spark_entityset_cutoff_time_df():
    primitives_list = [
        "absolute",
        "is_weekend",
        "year",
        "day",
        "num_characters",
        "num_words",
    ]

    spark_es = EntitySet(id="spark_es")
    df = pd.DataFrame({
        "id": [0, 1, 2],
        "values": [1, 12, -34],
        "dates": [
            pd.to_datetime("2019-01-10"),
            pd.to_datetime("2019-02-03"),
            pd.to_datetime("2019-01-01"),
        ],
        "strings": ["I am a string", "23", "abcdef ghijk"],
    })
    values_dd = ps.from_pandas(df)
    ltypes = {
        "values": IntegerNullable,
        "dates": Datetime,
        "strings": NaturalLanguage
    }
    spark_es.add_dataframe(
        dataframe_name="data",
        dataframe=values_dd,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    ids = [0, 1, 2, 0]
    times = [
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-05 04:00"),
        pd.Timestamp("2019-01-15 04:00"),
    ]
    labels = [True, False, True, False]
    cutoff_times = pd.DataFrame(
        {"id": ids, "time": times, "labels": labels},
        columns=["id", "time", "labels"],
    )

    spark_fm, _ = ft.dfs(
        entityset=spark_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        cutoff_time=cutoff_times,
    )

    pd_es = ft.EntitySet(id="pd_es")
    pd_es.add_dataframe(
        dataframe_name="data",
        dataframe=df,
        index="id",
        time_index="dates",
        logical_types=ltypes,
    )

    fm, _ = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="data",
        trans_primitives=primitives_list,
        cutoff_time=cutoff_times,
    )
    # Because row ordering with Spark is not guaranteed, we need to sort on two columns to make sure that values
    # for instance id 0 are compared correctly. Also, make sure the index column has the same dtype.
    fm = fm.sort_values(["id", "labels"])
    spark_fm = spark_fm.to_pandas().astype({"id": "int64"})
    spark_fm = spark_fm.set_index("id").sort_values(["id", "labels"])

    for column in fm.columns:
        if fm[column].dtype.name == "category":
            fm[column] = fm[column].astype("Int64").astype("string")

    pd.testing.assert_frame_equal(fm.astype(spark_fm.dtypes),
                                  spark_fm,
                                  check_dtype=False)