Example #1
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet
    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    last_time_index = []
    for entity in description['entities'].values():
        entity['loading_info']['params'].update(kwargs)
        # If path is None, an empty dataframe will be created for entity.
        description_to_entity(entity, entityset, path=path)
        if entity['properties']['last_time_index']:
            last_time_index.append(entity['id'])

    for relationship in description['relationships']:
        relationship = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship)

    if last_time_index:
        entityset.add_last_time_indexes(updated_entities=last_time_index)

    return entityset
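
A minimal round-trip sketch of how this deserializer pairs with the serializer named in the docstring. The import path for entityset_to_description is assumed from the docstring's .serialize reference, and es stands for any existing EntitySet:

from featuretools.entityset.serialize import entityset_to_description

# Build an in-memory description, then rebuild the EntitySet from it. Because
# the description was never written to disk, 'path' is None and each entity is
# recreated with an empty dataframe (see the comment in the function above).
description = entityset_to_description(es)
es_copy = description_to_entityset(description)
assert es_copy.id == es.id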
Example #2
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({"id": [0, 1, 2, 3],
                             "user": [1, 2, 1, 3],
                             "time": [pd.to_datetime('2019-01-10'),
                                      pd.to_datetime('2019-02-03'),
                                      pd.to_datetime('2019-01-01'),
                                      pd.to_datetime('2017-08-25')],
                             "strings": ["I am a string",
                                         "23",
                                         "abcdef ghijk",
                                         ""]})
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_vtypes = {
        "id": ft.variable_types.Id,
        "user": ft.variable_types.Id,
        "time": ft.variable_types.DatetimeTimeIndex,
        "strings": ft.variable_types.NaturalLanguage
    }

    transactions = pd.DataFrame({"id": [0, 1, 2, 3, 4, 5],
                                 "session_id": [0, 0, 1, 2, 2, 3],
                                 "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
                                 "time": [pd.to_datetime('2019-01-10 03:53'),
                                          pd.to_datetime('2019-01-10 04:12'),
                                          pd.to_datetime('2019-02-03 10:34'),
                                          pd.to_datetime('2019-01-01 12:35'),
                                          pd.to_datetime('2019-01-01 12:49'),
                                          pd.to_datetime('2017-08-25 04:53')]})
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_vtypes = {
        "id": ft.variable_types.Id,
        "session_id": ft.variable_types.Id,
        "amount": ft.variable_types.Numeric,
        "time": ft.variable_types.DatetimeTimeIndex,
    }

    pd_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions, index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="sessions", dataframe=sessions_dask, index="id", time_index="time", variable_types=sessions_vtypes)

    pd_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions, index="id", time_index="time")
    dask_es.entity_from_dataframe(entity_id="transactions", dataframe=transactions_dask, index="id", time_index="time", variable_types=transactions_vtypes)

    new_rel = Relationship(pd_es["sessions"]["id"],
                           pd_es["transactions"]["session_id"])
    dask_rel = Relationship(dask_es["sessions"]["id"],
                            dask_es["transactions"]["session_id"])

    pd_es = pd_es.add_relationship(new_rel)
    dask_es = dask_es.add_relationship(dask_rel)

    assert pd_es['sessions'].last_time_index is None
    assert dask_es['sessions'].last_time_index is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd.testing.assert_series_equal(pd_es['sessions'].last_time_index.sort_index(), dask_es['sessions'].last_time_index.compute().sort_index(), check_names=False)
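
A note on the final assertion: Dask does not guarantee row order after a distributed computation, so both series are sorted by index before comparison, and check_names=False is passed because the two backends may label the resulting series differently.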
Example #3
def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
Example #4
def datetime_es():
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "card_id": [1, 1, 5, 1, 5],
            "transaction_time": pd.to_datetime(
                [
                    "2011-2-28 04:00",
                    "2012-2-28 05:00",
                    "2012-2-29 06:00",
                    "2012-3-1 08:00",
                    "2014-4-1 10:00",
                ]
            ),
            "fraud": [True, False, False, False, True],
        }
    )

    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time",
    )

    datetime_es = datetime_es.add_dataframe(
        dataframe_name="cards", dataframe=cards_df, index="id"
    )

    datetime_es = datetime_es.add_relationship("cards", "id", "transactions", "card_id")
    datetime_es.add_last_time_indexes()
    return datetime_es
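
Last time indexes pay off downstream, when a training window restricts which rows feed each aggregation. A hedged usage sketch against this fixture (the cutoff time and window size are illustrative, not taken from the original tests):

import pandas as pd
import featuretools as ft

es = datetime_es()
# With a training window, only instances whose last time index falls within
# the window before the cutoff contribute to the features for each card.
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_dataframe_name="cards",
                                      cutoff_time=pd.Timestamp("2012-03-31"),
                                      training_window="60 days")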
Example #5
def datetime_es():
    cards_df = pd.DataFrame({"id": [1, 2, 3, 4, 5]})
    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "card_id": [1, 1, 5, 1, 5],
        "transaction_time":
        pd.to_datetime([
            '2011-2-28 04:00', '2012-2-28 05:00', '2012-2-29 06:00',
            '2012-3-1 08:00', '2014-4-1 10:00'
        ]),
        "fraud": [True, False, False, False, True]
    })

    datetime_es = EntitySet(id="fraud_data")
    datetime_es = datetime_es.entity_from_dataframe(
        entity_id="transactions",
        dataframe=transactions_df,
        index="id",
        time_index="transaction_time")

    datetime_es = datetime_es.entity_from_dataframe(entity_id="cards",
                                                    dataframe=cards_df,
                                                    index="id")
    relationship = Relationship(datetime_es["cards"]["id"],
                                datetime_es["transactions"]["card_id"])
    datetime_es = datetime_es.add_relationship(relationship)
    datetime_es.add_last_time_indexes()
    return datetime_es
Example #6
def description_to_entityset(description, **kwargs):
    """Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Additional keyword arguments to pass to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    """
    check_schema_version(description, "entityset")

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get("path")
    entityset = EntitySet(description["id"])

    for df in description["dataframes"].values():
        if path is not None:
            data_path = os.path.join(path, "data", df["name"])
            file_format = description.get("format")
            if file_format is not None:
                kwargs["format"] = file_format
                if file_format == "parquet" and df["loading_info"]["table_type"] == "pandas":
                    kwargs["filename"] = df["name"] + ".parquet"
            dataframe = read_woodwork_table(data_path,
                                            validate=False,
                                            **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description["relationships"]:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset
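
In practice this deserializer is usually reached through the public API rather than called directly; a minimal sketch, assuming featuretools' top-level read_entityset wrapper and a placeholder path:

import featuretools as ft

# read_entityset loads the saved data description from disk and delegates to
# description_to_entityset; extra kwargs are forwarded to the table reader.
es = ft.read_entityset("path/to/saved_entityset")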
Example #7
def test_operations_invalidate_metadata(es):
    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert new_es._data_description is None
    assert new_es.metadata is not None  # generated after access
    assert new_es._data_description is not None
    if not isinstance(es['customers'].df, pd.DataFrame):
        customers_vtypes = es["customers"].variable_types
        customers_vtypes['signup_date'] = variable_types.Datetime
    else:
        customers_vtypes = None
    new_es.entity_from_dataframe("customers",
                                 es["customers"].df,
                                 index=es["customers"].index,
                                 variable_types=customers_vtypes)
    if not isinstance(es['sessions'].df, pd.DataFrame):
        sessions_vtypes = es["sessions"].variable_types
    else:
        sessions_vtypes = None
    new_es.entity_from_dataframe("sessions",
                                 es["sessions"].df,
                                 index=es["sessions"].index,
                                 variable_types=sessions_vtypes)
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    r = Relationship(new_es["customers"]["id"],
                     new_es["sessions"]["customer_id"])
    new_es = new_es.add_relationship(r)
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.normalize_entity("customers", "cohort", "cohort")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es.add_last_time_indexes()
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    # automatically adding interesting values not supported in Dask or Koalas
    if any(isinstance(entity.df, pd.DataFrame) for entity in new_es.entities):
        new_es.add_interesting_values()
        assert new_es._data_description is None
        assert new_es.metadata is not None
        assert new_es._data_description is not None
Example #8
def test_operations_invalidate_metadata(es):
    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert new_es._data_description is None
    assert new_es.metadata is not None  # generated after access
    assert new_es._data_description is not None
    if not isinstance(es['customers'], pd.DataFrame):
        customers_ltypes = es["customers"].ww.logical_types
        customers_ltypes['signup_date'] = Datetime
    else:
        customers_ltypes = None
    new_es.add_dataframe(es["customers"],
                         "customers",
                         index=es["customers"].index,
                         logical_types=customers_ltypes)
    if not isinstance(es['sessions'], pd.DataFrame):
        sessions_ltypes = es["sessions"].ww.logical_types
    else:
        sessions_ltypes = None
    new_es.add_dataframe(es["sessions"],
                         "sessions",
                         index=es["sessions"].index,
                         logical_types=sessions_ltypes)

    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.add_relationship("customers", "id", "sessions",
                                     "customer_id")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es = new_es.normalize_dataframe("customers", "cohort", "cohort")
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    new_es.add_last_time_indexes()
    assert new_es._data_description is None
    assert new_es.metadata is not None
    assert new_es._data_description is not None

    # automatically adding interesting values not supported in Dask or Koalas
    if new_es.dataframe_type == Library.PANDAS.value:
        new_es.add_interesting_values()
        assert new_es._data_description is None
        assert new_es.metadata is not None
        assert new_es._data_description is not None
Example #9
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage
    }

    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [
            pd.to_datetime('2019-01-10 03:53'),
            pd.to_datetime('2019-01-10 04:12'),
            pd.to_datetime('2019-02-03 10:34'),
            pd.to_datetime('2019-01-01 12:35'),
            pd.to_datetime('2019-01-01 12:49'),
            pd.to_datetime('2017-08-25 04:53')
        ]
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)

    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "time": Datetime,
        "amount": Double
    }

    pd_es.add_dataframe(dataframe_name="sessions",
                        dataframe=sessions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="sessions",
                          dataframe=sessions_dask,
                          index="id",
                          time_index="time",
                          logical_types=sessions_logical_types)

    pd_es.add_dataframe(dataframe_name="transactions",
                        dataframe=transactions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="transactions",
                          dataframe=transactions_dask,
                          index="id",
                          time_index="time",
                          logical_types=transactions_logical_types)

    pd_es = pd_es.add_relationship("sessions", "id", "transactions",
                                   "session_id")
    dask_es = dask_es.add_relationship("sessions", "id", "transactions",
                                       "session_id")

    assert 'foreign_key' in pd_es['transactions'].ww.semantic_tags['session_id']
    assert 'foreign_key' in dask_es['transactions'].ww.semantic_tags['session_id']

    assert pd_es['sessions'].ww.metadata.get('last_time_index') is None
    assert dask_es['sessions'].ww.metadata.get('last_time_index') is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index')
    dask_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index')
    assert pd_lti_name == dask_lti_name
    pd.testing.assert_series_equal(
        pd_es['sessions'][pd_lti_name].sort_index(),
        dask_es['sessions'][dask_lti_name].compute().sort_index(),
        check_names=False)
Example #10
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    spark_es = EntitySet(id="spark_es")

    sessions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3],
            "user": [1, 2, 1, 3],
            "time": [
                pd.to_datetime("2019-01-10"),
                pd.to_datetime("2019-02-03"),
                pd.to_datetime("2019-01-01"),
                pd.to_datetime("2017-08-25"),
            ],
            "strings": ["I am a string", "23", "abcdef ghijk", ""],
        }
    )
    sessions_spark = ps.from_pandas(sessions)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "strings": NaturalLanguage,
        "time": Datetime,
    }

    transactions = pd.DataFrame(
        {
            "id": [0, 1, 2, 3, 4, 5],
            "session_id": [0, 0, 1, 2, 2, 3],
            "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
            "time": [
                pd.to_datetime("2019-01-10 03:53"),
                pd.to_datetime("2019-01-10 04:12"),
                pd.to_datetime("2019-02-03 10:34"),
                pd.to_datetime("2019-01-01 12:35"),
                pd.to_datetime("2019-01-01 12:49"),
                pd.to_datetime("2017-08-25 04:53"),
            ],
        }
    )
    transactions_spark = ps.from_pandas(transactions)
    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "amount": Double,
        "time": Datetime,
    }

    pd_es.add_dataframe(
        dataframe_name="sessions", dataframe=sessions, index="id", time_index="time"
    )
    spark_es.add_dataframe(
        dataframe_name="sessions",
        dataframe=sessions_spark,
        index="id",
        time_index="time",
        logical_types=sessions_logical_types,
    )

    pd_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions,
        index="id",
        time_index="time",
    )
    spark_es.add_dataframe(
        dataframe_name="transactions",
        dataframe=transactions_spark,
        index="id",
        time_index="time",
        logical_types=transactions_logical_types,
    )

    pd_es = pd_es.add_relationship("sessions", "id", "transactions", "session_id")
    spark_es = spark_es.add_relationship("sessions", "id", "transactions", "session_id")

    assert "foreign_key" in pd_es["transactions"].ww.semantic_tags["session_id"]
    assert "foreign_key" in spark_es["transactions"].ww.semantic_tags["session_id"]

    assert pd_es["sessions"].ww.metadata.get("last_time_index") is None
    assert spark_es["sessions"].ww.metadata.get("last_time_index") is None

    pd_es.add_last_time_indexes()
    spark_es.add_last_time_indexes()

    pd_lti_name = pd_es["sessions"].ww.metadata.get("last_time_index")
    spark_lti_name = spark_es["sessions"].ww.metadata.get("last_time_index")
    assert pd_lti_name == spark_lti_name
    pd.testing.assert_series_equal(
        pd_es["sessions"][pd_lti_name].sort_index(),
        spark_es["sessions"][spark_lti_name].to_pandas().sort_index(),
        check_names=False,
    )
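
For reference, a session's last time index is the latest timestamp at which the session appears anywhere in the EntitySet: its own time index value or the time of any child transaction, whichever is later. A minimal pandas sketch of that expectation, reusing the frames and names from Example #10 (index alignment is assumed from the small example data):

# Max of each session's own timestamp and its latest child transaction time.
expected = (
    transactions.groupby("session_id")["time"].max()
    .combine(sessions.set_index("id")["time"], max)
)
pd.testing.assert_series_equal(
    pd_es["sessions"][pd_lti_name].sort_index(),
    expected.sort_index(),
    check_names=False,
)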