예제 #1
0
def make_ecommerce_entityset(with_integer_time_index=False):
    """Build the e-commerce EntitySet used as a test fixture.

    The resulting entityset has the following shape:

          R         Regions
         / \\       .
        S   C       Stores, Customers
            |       .
            S   P   Sessions, Products
             \\ /   .
              L     Log

    A ``cohorts`` entity is additionally normalized out of ``customers``
    before the relationships are added.

    Args:
        with_integer_time_index (bool): Forwarded unchanged to the
            dataframe, variable-type and time-index factories. When true,
            the entityset id gets the ``_int_time_index`` suffix.

    Returns:
        EntitySet: The entityset with every dataframe loaded as an entity
        and the relationships registered.
    """
    frames = make_ecommerce_dataframes(
        with_integer_time_index=with_integer_time_index)
    vtypes_by_entity = make_variable_types(
        with_integer_time_index=with_integer_time_index)
    time_index_specs = make_time_indexes(
        with_integer_time_index=with_integer_time_index)

    id_suffix = "_int_time_index" if with_integer_time_index else ""
    es = EntitySet(id='ecommerce' + id_suffix)

    for entity_id, frame in frames.items():
        # Entities without an entry in the time-index table get no time index.
        spec = time_index_specs.get(entity_id)
        if spec is None:
            primary, secondary = None, None
        else:
            primary, secondary = spec['name'], spec['secondary']
        es.entity_from_dataframe(
            entity_id,
            frame,
            index='id',
            variable_types=vtypes_by_entity[entity_id],
            time_index=primary,
            secondary_time_index=secondary,
        )

    es.normalize_entity(
        'customers',
        'cohorts',
        'cohort',
        additional_variables=['cohort_name'],
        make_time_index=True,
        new_entity_time_index='cohort_end',
    )

    # (parent entity, parent variable, child entity, child variable)
    links = [
        (u'régions', 'id', 'customers', u'région_id'),
        (u'régions', 'id', 'stores', u'région_id'),
        ('customers', 'id', 'sessions', 'customer_id'),
        ('sessions', 'id', 'log', 'session_id'),
        ('products', 'id', 'log', 'product_id'),
    ]
    es.add_relationships([
        Relationship(es[parent][parent_var], es[child][child_var])
        for parent, parent_var, child, child_var in links
    ])

    return es
예제 #2
0
def test_sets_time_when_adding_entity():
    """Check that an EntitySet's ``time_type`` is fixed by its first time index.

    Steps exercised, in order:

    1. A freshly created entityset has no ``time_type`` set.
    2. Adding ``transactions`` with an integer time index sets ``time_type``
       to ``NumericTimeIndex``.
    3. Normalizing out ``cards`` with ``make_time_index=True`` leaves
       ``time_type`` unchanged.
    4. Adding an entity whose time index is a datetime column raises
       ``TypeError`` with the expected message.
    5. Adding an entity whose time index is a string column raises
       ``TypeError`` with the expected message.
    """
    # Imported locally so the test does not depend on a module-level import.
    import re

    transactions_df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5, 6],
        "card_id": [1, 2, 1, 3, 4, 5],
        "transaction_time": [10, 12, 13, 20, 21, 20],
        "fraud": [True, False, False, False, True, True]
    })
    accounts_df = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date":
        [datetime(2002, 5, 1),
         datetime(2006, 3, 20),
         datetime(2011, 11, 11)]
    })
    accounts_df_string = pd.DataFrame({
        "id": [3, 4, 5],
        "signup_date": ["element", "exporting", "editable"]
    })
    # create empty entityset
    es = EntitySet("fraud")
    # assert it's not set
    assert getattr(es, "time_type", None) is None
    # add entity
    es.entity_from_dataframe("transactions",
                             transactions_df,
                             index="id",
                             time_index="transaction_time")
    # assert time_type is set
    assert es.time_type == variable_types.NumericTimeIndex
    # add another entity
    es.normalize_entity("transactions",
                        "cards",
                        "card_id",
                        make_time_index=True)
    # assert time_type unchanged
    assert es.time_type == variable_types.NumericTimeIndex
    # add wrong time type entity
    # ``match`` is interpreted as a regular expression, so the literal
    # message is escaped; otherwise each "." would match any character.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        es.entity_from_dataframe("accounts",
                                 accounts_df,
                                 index="id",
                                 time_index="signup_date")
    # add non time type as time index
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        es.entity_from_dataframe("accounts",
                                 accounts_df_string,
                                 index="id",
                                 time_index="signup_date")
예제 #3
0
 def test_sets_time_when_adding_entity(self):
     """Check that the first time index determines the entityset time type.

     A new entityset has no ``time_type`` set; adding an entity with an
     integer time index sets it to ``NumericTimeIndex``; normalizing out a
     second entity leaves it unchanged; adding entities whose time index
     is a datetime column or a string column raises ``TypeError``.
     """
     # create empty entityset
     fraud_es = EntitySet("fraud")
     # assert it's not set
     assert getattr(fraud_es, "time_type", None) is None

     # add entity
     transactions = pd.DataFrame({
         "id": [1, 2, 3, 4, 5, 6],
         "card_id": [1, 2, 1, 3, 4, 5],
         "transaction_time": [10, 12, 13, 20, 21, 20],
         "fraud": [True, False, False, False, True, True],
     })
     fraud_es.entity_from_dataframe(
         "transactions",
         transactions,
         index="id",
         time_index="transaction_time",
     )
     # assert time_type is set
     assert fraud_es.time_type == variable_types.NumericTimeIndex

     # add another entity
     fraud_es.normalize_entity(
         "transactions",
         "cards",
         "card_id",
         make_time_index=True,
     )
     # assert time_type unchanged
     assert fraud_es.time_type == variable_types.NumericTimeIndex

     signup_dates = [
         datetime(2002, 5, 1),
         datetime(2006, 3, 20),
         datetime(2011, 11, 11),
     ]
     rejected_frames = (
         # wrong time type entity
         pd.DataFrame({"id": [3, 4, 5], "signup_date": signup_dates}),
         # non time type as time index
         pd.DataFrame({
             "id": [3, 4, 5],
             "signup_date": ["element", "exporting", "editable"],
         }),
     )
     # Both frames must be rejected, in the order listed above.
     for accounts in rejected_frames:
         with pytest.raises(TypeError):
             fraud_es.entity_from_dataframe(
                 "accounts",
                 accounts,
                 index="id",
                 time_index="signup_date",
             )
예제 #4
0
def test_sets_time_when_adding_entity():
    """Check that an EntitySet's ``time_type`` is fixed by its first time index.

    A new entityset has no ``time_type`` set. Adding ``transactions`` with
    an integer time index sets it to ``NumericTimeIndex``, and normalizing
    out ``cards`` leaves it unchanged. Adding an ``accounts`` entity whose
    time index is a datetime column, or a string column, must raise
    ``TypeError`` with the expected message.
    """
    # Imported locally so the test does not depend on a module-level import.
    import re

    transactions_df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                                    "card_id": [1, 2, 1, 3, 4, 5],
                                    "transaction_time": [10, 12, 13, 20, 21, 20],
                                    "fraud": [True, False, False, False, True, True]})
    accounts_df = pd.DataFrame({"id": [3, 4, 5],
                                "signup_date": [datetime(2002, 5, 1),
                                                datetime(2006, 3, 20),
                                                datetime(2011, 11, 11)]})
    accounts_df_string = pd.DataFrame({"id": [3, 4, 5],
                                       "signup_date": ["element",
                                                       "exporting",
                                                       "editable"]})
    # create empty entityset
    entityset = EntitySet("fraud")
    # assert it's not set
    assert getattr(entityset, "time_type", None) is None
    # add entity
    entityset.entity_from_dataframe("transactions",
                                    transactions_df,
                                    index="id",
                                    time_index="transaction_time")
    # assert time_type is set
    assert entityset.time_type == variable_types.NumericTimeIndex
    # add another entity
    entityset.normalize_entity("transactions",
                               "cards",
                               "card_id",
                               make_time_index=True)
    # assert time_type unchanged
    assert entityset.time_type == variable_types.NumericTimeIndex
    # add wrong time type entity
    # ``match`` is interpreted as a regular expression, so the literal
    # message is escaped; otherwise each "." would match any character.
    error_text = "accounts time index is <class 'featuretools.variable_types.variable.DatetimeTimeIndex'> type which differs from other entityset time indexes"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        entityset.entity_from_dataframe("accounts",
                                        accounts_df,
                                        index="id",
                                        time_index="signup_date")
    # add non time type as time index
    error_text = "Attempted to convert all string column signup_date to numeric"
    with pytest.raises(TypeError, match=re.escape(error_text)):
        entityset.entity_from_dataframe("accounts",
                                        accounts_df_string,
                                        index="id",
                                        time_index="signup_date")
예제 #5
0
def test_make_time_index_keeps_original_sorting():
    """Check that ``normalize_entity`` does not reorder the base entity.

    The ``trips`` dataframe is built with ``trip_id`` descending from 999
    to 0 and an identical ``flight_time`` on every row. The test asserts
    that the entity's ``trip_id`` column equals ``0..999`` right after the
    entity is created, and again after ``flights`` is normalized out with
    ``make_time_index=True``.
    """
    n_trips = 1000
    expected_order = list(range(n_trips))
    trips_df = pd.DataFrame.from_dict({
        'trip_id': expected_order[::-1],
        'flight_time': [datetime(1997, 4, 1)] * n_trips,
        'flight_id': [1] * 350 + [2] * 650,
    })

    es = EntitySet('flights')
    es.entity_from_dataframe(
        "trips",
        dataframe=trips_df,
        index="trip_id",
        time_index='flight_time',
    )
    assert (es['trips'].df['trip_id'] == expected_order).all()

    es.normalize_entity(
        base_entity_id="trips",
        new_entity_id="flights",
        index="flight_id",
        make_time_index=True,
    )
    assert (es['trips'].df['trip_id'] == expected_order).all()
예제 #6
0
def test_make_time_index_keeps_original_sorting():
    """Check that the ``trips`` row order survives ``normalize_entity``.

    Every trip shares the same ``flight_time`` and the input ``trip_id``
    values run from 999 down to 0. The ``trip_id`` column of the entity is
    compared against ``0..999`` once after the entity is added and once
    after a ``flights`` entity is normalized out of it.
    """
    total = 1000
    ascending_ids = list(range(total))
    same_departure = datetime(1997, 4, 1)

    columns = {
        'trip_id': list(range(total - 1, -1, -1)),
        'flight_time': [same_departure for _ in range(total)],
        'flight_id': [1] * 350 + [2] * 650,
    }
    frame = pd.DataFrame.from_dict(columns)

    es = EntitySet('flights')

    def trip_ids_in_order():
        # Element-wise comparison of the stored column with 0..999.
        return (es['trips'].df['trip_id'] == ascending_ids).all()

    es.entity_from_dataframe("trips",
                             dataframe=frame,
                             index="trip_id",
                             time_index='flight_time')
    assert trip_ids_in_order()

    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    assert trip_ids_in_order()
예제 #7
0
 def test_sets_time_when_adding_entity(self):
     """Check that the entityset time type is set once and then enforced.

     The test adds a ``transactions`` entity with an integer time index,
     expects ``time_type`` to become ``NumericTimeIndex`` and to stay that
     way after ``cards`` is normalized out, then expects ``TypeError`` when
     an ``accounts`` entity is added with a datetime or a string column as
     its time index.
     """
     account_ids = [3, 4, 5]
     transactions_df = pd.DataFrame({
         "id": [1, 2, 3, 4, 5, 6],
         "card_id": [1, 2, 1, 3, 4, 5],
         "transaction_time": [10, 12, 13, 20, 21, 20],
         "fraud": [True, False, False, False, True, True],
     })
     accounts_df = pd.DataFrame({
         "id": account_ids,
         "signup_date": [
             datetime(2002, 5, 1),
             datetime(2006, 3, 20),
             datetime(2011, 11, 11),
         ],
     })
     accounts_df_string = pd.DataFrame({
         "id": account_ids,
         "signup_date": ["element", "exporting", "editable"],
     })
     # Keyword arguments shared by both rejected ``accounts`` attempts.
     accounts_kwargs = {"index": "id", "time_index": "signup_date"}

     # create empty entityset
     es = EntitySet("fraud")
     # assert it's not set
     assert getattr(es, "time_type", None) is None

     # add entity
     es.entity_from_dataframe(
         "transactions",
         transactions_df,
         index="id",
         time_index="transaction_time",
     )
     # assert time_type is set
     assert es.time_type == variable_types.NumericTimeIndex

     # add another entity
     es.normalize_entity("transactions", "cards", "card_id",
                         make_time_index=True)
     # assert time_type unchanged
     assert es.time_type == variable_types.NumericTimeIndex

     # add wrong time type entity
     with pytest.raises(TypeError):
         es.entity_from_dataframe("accounts", accounts_df,
                                  **accounts_kwargs)

     # add non time type as time index
     with pytest.raises(TypeError):
         es.entity_from_dataframe("accounts", accounts_df_string,
                                  **accounts_kwargs)
예제 #8
0
def test_operations_invalidate_metadata(es):
    """Check that mutating an EntitySet drops and lazily rebuilds metadata.

    After each mutating operation (adding entities, adding a relationship,
    normalizing an entity, adding last time indexes, adding interesting
    values) the test asserts that ``_data_description`` is ``None``, that
    accessing ``metadata`` returns something, and that
    ``_data_description`` is populated afterwards.

    Args:
        es: Source entityset fixture; its ``customers`` and ``sessions``
            entities supply the dataframes, indexes and variable types.
    """
    def assert_metadata_rebuilt(entityset):
        # The order matters: reading ``metadata`` is what repopulates
        # ``_data_description``.
        assert entityset._data_description is None
        assert entityset.metadata is not None  # generated after access
        assert entityset._data_description is not None

    new_es = EntitySet(id="test")
    # test metadata gets created on access
    assert_metadata_rebuilt(new_es)

    # Variable types are only passed explicitly for non-pandas dataframes.
    customers = es["customers"]
    customers_vtypes = None
    if not isinstance(customers.df, pd.DataFrame):
        customers_vtypes = customers.variable_types
        customers_vtypes['signup_date'] = variable_types.Datetime
    new_es.entity_from_dataframe(
        "customers",
        customers.df,
        index=customers.index,
        variable_types=customers_vtypes,
    )

    sessions = es["sessions"]
    sessions_vtypes = (
        None if isinstance(sessions.df, pd.DataFrame)
        else sessions.variable_types
    )
    new_es.entity_from_dataframe(
        "sessions",
        sessions.df,
        index=sessions.index,
        variable_types=sessions_vtypes,
    )
    assert_metadata_rebuilt(new_es)

    parent_var = new_es["customers"]["id"]
    child_var = new_es["sessions"]["customer_id"]
    new_es = new_es.add_relationship(Relationship(parent_var, child_var))
    assert_metadata_rebuilt(new_es)

    new_es = new_es.normalize_entity("customers", "cohort", "cohort")
    assert_metadata_rebuilt(new_es)

    new_es.add_last_time_indexes()
    assert_metadata_rebuilt(new_es)

    # automatically adding interesting values not supported in Dask or Koalas
    if any(isinstance(entity.df, pd.DataFrame) for entity in new_es.entities):
        new_es.add_interesting_values()
        assert_metadata_rebuilt(new_es)