예제 #1
0
    def test_multiple_children_both_missing(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index when a session has no events.

        ``extra_session_df`` is added to the sessions entity, ``wishlist_log``
        is registered as a child of sessions, and the recomputed last time
        index is compared entry by entry with ``true_sessions_lti`` (with
        entry 6 expected to be ``NaT``).
        """
        # Extend sessions first so that one session has no child rows at all.
        entityset['sessions'].update_data(extra_session_df)

        # Register the wishlist log and link it to sessions.
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=wishlist_df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        parent_variable = entityset['sessions']['id']
        child_variable = entityset['wishlist_log']['session_id']
        entityset.add_relationship(Relationship(parent_variable,
                                                child_variable))
        entityset.add_last_time_indexes()
        sessions = entityset['sessions']

        # The wishlist contributes two newer events; entry 6 stays NaT.
        expected_overrides = (
            (1, pd.Timestamp("2011-4-9 10:31:30")),
            (3, pd.Timestamp("2011-4-10 10:41:00")),
            (6, pd.NaT),
        )
        for position, timestamp in expected_overrides:
            true_sessions_lti[position] = timestamp

        assert len(sessions.last_time_index) == 7
        actual_lti = sessions.last_time_index.sort_index()
        for actual, expected in zip(actual_lti, true_sessions_lti):
            # Two missing values count as a match; otherwise compare values.
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #2
0
def test_add_relationship_errors_on_dtype_mismatch(es):
    """Relating variables with different pandas dtypes raises ValueError.

    A copy of the ``log`` dataframe is registered as ``log2`` with explicit
    variable types; relating ``regions.id`` to ``log2.session_id`` must then
    fail with the exact message asserted below.
    """
    expected_message = ("Unable to add relationship because id in "
                        "regions is Pandas dtype object and "
                        "session_id in log2 is Pandas dtype int64.")

    log_copy = es['log'].df.copy()
    types_by_column = dict(
        id=variable_types.Categorical,
        session_id=variable_types.Id,
        product_id=variable_types.Id,
        datetime=variable_types.Datetime,
        value=variable_types.Numeric,
        value_2=variable_types.Numeric,
        latlong=variable_types.LatLong,
        latlong2=variable_types.LatLong,
        value_many_nans=variable_types.Numeric,
        priority_level=variable_types.Ordinal,
        purchased=variable_types.Boolean,
        comments=variable_types.Text,
    )
    es.entity_from_dataframe(entity_id='log2',
                             dataframe=log_copy,
                             index='id',
                             variable_types=types_by_column,
                             time_index='datetime',
                             encoding='utf-8')

    with pytest.raises(ValueError) as excinfo:
        parent_variable = es['regions']['id']
        child_variable = es['log2']['session_id']
        es.add_relationship(Relationship(parent_variable, child_variable))

    assert str(excinfo.value) == expected_message
예제 #3
0
    def test_multiple_children_left_missing(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index with an extra wishlist event.

        ``extra_session_df`` is added to sessions and a matching row for
        session id 6 is added to the wishlist data before ``wishlist_log`` is
        registered as a child of sessions. The recomputed last time index is
        compared entry by entry with ``true_sessions_lti``.
        """
        # test all instances in right child
        sessions = entityset['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist df so the new session instance is in wishlist_log
        row_values = {'session_id': 6,
                      'datetime': pd.Timestamp("2011-04-11 11:11:11"),
                      'product_id': 'toothpaste'}
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack the two frames.
        df = pd.concat([wishlist_df, row])
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        relationship = Relationship(entityset['sessions']['id'],
                                    entityset['wishlist_log']['session_id'])
        entityset.add_relationship(relationship)
        entityset.add_last_time_indexes()

        # now wishlist_log has newer events for 3 session ids
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = sessions.last_time_index.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as equal; otherwise values must match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
예제 #4
0
    def test_multiple_children_all_combined(self, entityset, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index when children overlap partially.

        ``extra_session_df`` is added to sessions, a wishlist row for session
        id 6 is added, and wishlist row 4 is dropped before ``wishlist_log``
        is registered as a child of sessions. The recomputed last time index
        is compared entry by entry with ``true_sessions_lti``.
        """
        # test some instances in right, some in left, all when combined
        sessions = entityset['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist_log so extra session has child instance
        row_values = {'session_id': 6,
                      'datetime': pd.Timestamp("2011-04-11 11:11:11"),
                      'product_id': 'toothpaste'}
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack the two frames.
        df = pd.concat([wishlist_df, row])

        # drop instance 4 so wishlist_log does not have session id 3 instance
        df.drop(4, inplace=True)
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        relationship = Relationship(entityset['sessions']['id'],
                                    entityset['wishlist_log']['session_id'])
        entityset.add_relationship(relationship)
        entityset.add_last_time_indexes()

        # wishlist has newer events for 2 sessions
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = sessions.last_time_index.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as equal; otherwise values must match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
예제 #5
0
    def test_multiple_children(self, es, wishlist_df, true_sessions_lti):
        """Check the sessions last time index with two child entities.

        ``wishlist_log`` is registered as a child of sessions (converted to a
        dask dataframe when the entity set's first entity uses dask) and the
        recomputed last time index is compared with ``true_sessions_lti``.
        """
        # Match the wishlist dataframe type to the entity set's backend.
        if isinstance(es.entities[0].df, dd.DataFrame):
            wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)

        variable_module = ft.variable_types.variable
        column_types = {
            'id': variable_module.Index,
            'session_id': variable_module.Numeric,
            'datetime': variable_module.DatetimeTimeIndex,
            'product_id': variable_module.Categorical
        }
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=wishlist_df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=column_types)
        es.add_relationship(Relationship(es['sessions']['id'],
                                         es['wishlist_log']['session_id']))
        es.add_last_time_indexes()
        sessions = es['sessions']

        # The wishlist holds more recent events for two session ids.
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")

        assert len(sessions.last_time_index) == 6
        actual_lti = sessions.last_time_index
        if isinstance(actual_lti, dd.Series):
            # Materialize the dask series before sorting and iterating.
            actual_lti = actual_lti.compute()
        for actual, expected in zip(actual_lti.sort_index(),
                                    true_sessions_lti):
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #6
0
def test_add_relationship_errors_on_dtype_mismatch(es):
    """Relating ``régions.id`` to ``log2.session_id`` raises ValueError.

    ``log2`` is a copy of the ``log`` dataframe registered with explicit
    variable types.
    """
    log_copy = es['log'].df.copy()
    types_by_column = dict(
        id=variable_types.Categorical,
        session_id=variable_types.Id,
        product_id=variable_types.Id,
        datetime=variable_types.Datetime,
        value=variable_types.Numeric,
        value_2=variable_types.Numeric,
        latlong=variable_types.LatLong,
        latlong2=variable_types.LatLong,
        value_many_nans=variable_types.Numeric,
        priority_level=variable_types.Ordinal,
        purchased=variable_types.Boolean,
        comments=variable_types.Text,
    )
    es.entity_from_dataframe(entity_id='log2',
                             dataframe=log_copy,
                             index='id',
                             variable_types=types_by_column,
                             time_index='datetime',
                             encoding='utf-8')

    with pytest.raises(ValueError):
        parent_variable = es[u'régions']['id']
        child_variable = es['log2']['session_id']
        es.add_relationship(Relationship(parent_variable, child_variable))
    def test_multiple_children_right_missing(self, es, wishlist_df,
                                             true_sessions_lti):
        """Check the sessions last time index after dropping a wishlist row.

        Wishlist row 4 is dropped, ``wishlist_log`` is registered as a child
        of sessions (converted to dask or koalas to match the entity set) and
        the recomputed last time index is compared with ``true_sessions_lti``.
        """
        sessions = es['sessions']

        # Remove the wishlist instance related to id 3 so it is only in log.
        wishlist_df.drop(4, inplace=True)

        # Match the wishlist dataframe type to the entity set's backend.
        if isinstance(es.entities[0].df, dd.DataFrame):
            wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)
        if ks and isinstance(es.entities[0].df, ks.DataFrame):
            wishlist_df = ks.from_pandas(wishlist_df)

        variable_module = ft.variable_types.variable
        column_types = {
            'id': variable_module.Index,
            'session_id': variable_module.Numeric,
            'datetime': variable_module.DatetimeTimeIndex,
            'product_id': variable_module.Categorical
        }
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=wishlist_df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=column_types)
        es.add_relationship(Relationship(es['sessions']['id'],
                                         es['wishlist_log']['session_id']))
        es.add_last_time_indexes()

        # Only session id 1 still has a newer event in wishlist_log.
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")

        assert len(sessions.last_time_index) == 6
        actual_lti = to_pandas(sessions.last_time_index).sort_index()
        for actual, expected in zip(actual_lti, true_sessions_lti):
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #8
0
    def from_dictionary(cls, arguments, entityset, dependencies,
                        primitives_deserializer):
        """Rebuild an instance of ``cls`` from its serialized ``arguments``.

        Args:
            arguments: Mapping with the keys ``base_features``,
                ``relationship_path``, ``primitive``, ``use_previous``,
                ``where`` and ``name``.
            entityset: Entity set passed to ``Relationship.from_dictionary``.
            dependencies: Mapping from feature name to an already built
                feature; used for the base features and the ``where`` feature.
            primitives_deserializer: Object whose ``deserialize_primitive``
                turns the serialized primitive into a primitive.

        Returns:
            The result of calling ``cls`` with the reconstructed arguments.
        """
        base_features = []
        for feature_name in arguments['base_features']:
            base_features.append(dependencies[feature_name])

        relationships = []
        for serialized in arguments['relationship_path']:
            relationships.append(
                Relationship.from_dictionary(serialized, entityset))
        # The parent entity is taken from the first relationship in the path.
        parent_entity = relationships[0].parent_entity
        relationship_path = RelationshipPath(
            [(False, relationship) for relationship in relationships])

        primitive = primitives_deserializer.deserialize_primitive(
            arguments['primitive'])

        # A falsy serialized value is passed through unchanged.
        serialized_use_previous = arguments['use_previous']
        use_previous = (serialized_use_previous and
                        Timedelta.from_dictionary(serialized_use_previous))

        # Likewise, a falsy ``where`` name is passed through unchanged.
        where_feature_name = arguments['where']
        where = where_feature_name and dependencies[where_feature_name]

        init_kwargs = {
            'base_features': base_features,
            'parent_entity': parent_entity,
            'primitive': primitive,
            'relationship_path': relationship_path,
            'use_previous': use_previous,
            'where': where,
            'name': arguments['name'],
        }
        return cls(**init_kwargs)
예제 #9
0
 def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
     """Rebuild an instance of ``cls`` from its serialized ``arguments``.

     Reads the ``base_feature``, ``relationship`` and ``name`` keys of
     ``arguments``; the child entity is taken from the deserialized
     relationship. ``primitives_deserializer`` is not used here.
     """
     base_feature_name = arguments['base_feature']
     base_feature = dependencies[base_feature_name]
     relationship = Relationship.from_dictionary(arguments['relationship'],
                                                 entityset)
     init_kwargs = {
         'base_feature': base_feature,
         'child_entity': relationship.child_entity,
         'relationship': relationship,
         'name': arguments['name'],
     }
     return cls(**init_kwargs)
예제 #10
0
    def test_last_time_index(self, entityset):
        """Check last time indexes for values, customers and regions.

        A ``values`` entity is normalized out of ``log``, the last time
        indexes are computed and compared with hard-coded timestamps. A
        ``promotions`` entity is then added as a child of stores and the
        regions index is checked again.
        """
        es = entityset
        es.normalize_entity('log',
                            'values',
                            'value',
                            make_time_index=True,
                            new_entity_time_index="value_time",
                            convert_links_to_integers=True)
        es.add_last_time_indexes()
        assert es["values"].last_time_index is not None

        expected_values = [
            datetime(2011, 4, 10, 10, 41, 0),
            datetime(2011, 4, 10, 10, 40, 1),
            datetime(2011, 4, 9, 10, 30, 12),
            datetime(2011, 4, 9, 10, 30, 18),
            datetime(2011, 4, 9, 10, 30, 24),
            datetime(2011, 4, 9, 10, 31, 9),
            datetime(2011, 4, 9, 10, 31, 18),
            datetime(2011, 4, 9, 10, 31, 27),
            datetime(2011, 4, 10, 10, 41, 3),
            datetime(2011, 4, 10, 10, 41, 6),
            datetime(2011, 4, 10, 11, 10, 3),
        ]
        expected_customers = [
            datetime(2011, 4, 9, 10, 40, 0),
            datetime(2011, 4, 10, 10, 41, 6),
            datetime(2011, 4, 10, 11, 10, 3),
        ]
        expected_regions = pd.Series(
            {'United States': datetime(2011, 4, 10, 11, 10, 3)})

        actual_values = es["values"].last_time_index.sort_index()
        actual_customers = es["customers"].last_time_index.sort_index()
        actual_regions = es["regions"].last_time_index.sort_index()
        assert (actual_values == pd.Series(expected_values)).all()
        assert (actual_customers == pd.Series(expected_customers)).all()
        assert (actual_regions == expected_regions).all()

        # Add a promotions entity whose single event is newer than the rest.
        promotion_time = datetime(2011, 4, 10, 11, 12, 6)
        promotions_df = pd.DataFrame({
            "start_date": [promotion_time],
            "store_id": [4],
            "product_id": ['coke zero']
        })
        es.entity_from_dataframe(entity_id="promotions",
                                 dataframe=promotions_df,
                                 index='id',
                                 make_index=True,
                                 time_index='start_date')
        es.add_relationship(Relationship(es['stores']['id'],
                                         es['promotions']['store_id']))
        es.add_last_time_indexes()

        # The promotion time is now expected for the 'Mexico' region.
        expected_regions['Mexico'] = promotion_time
        actual_regions = es["regions"].last_time_index.sort_index()
        assert (actual_regions == expected_regions.sort_index()).all()
예제 #11
0
 def from_dictionary(cls, arguments, entityset, dependencies,
                     primitives_deserializer):
     """Rebuild an instance of ``cls`` from its serialized ``arguments``.

     Reads the ``base_feature`` and ``relationship_path`` keys of
     ``arguments``; the child entity is taken from the first relationship
     in the path. ``primitives_deserializer`` is not used here.
     """
     base_feature = dependencies[arguments['base_feature']]

     relationship_path = []
     for serialized in arguments['relationship_path']:
         relationship_path.append(
             Relationship.from_dictionary(serialized, entityset))

     first_relationship = relationship_path[0]
     return cls(base_feature,
                first_relationship.child_entity,
                relationship_path=relationship_path)
예제 #12
0
def main():
    """Print ``to_json()`` output for a sample holding and an entity set.

    The entity set has one ``parent`` entity and two child entities
    (``child`` and ``child_0``), each related to the parent through
    ``user_id``.
    """
    holding = Holding('HDFCBANK', Price('buy', 97.89), 5, 112.1, 95.4)
    print(holding.to_json())

    ids = [10, 14, 24, 34, 54, 64, 84]
    flags = [0, 1, 0, 1, 1, 0, 1]

    parent_df = pd.DataFrame({
        'id': list(ids),
        'age': list(ids),
        'salary': [1, 1, 10000, 25000, 50000, 60000, 80000],
    })
    es = EntitySet('user').entity_from_dataframe(entity_id='parent',
                                                 dataframe=parent_df)

    # Each child entity is added and then linked to the parent, in order.
    for child_id, flag_column in (('child', 'sex'), ('child_0', 'diet')):
        child_df = pd.DataFrame({
            'id': list(ids),
            'user_id': list(ids),
            flag_column: list(flags),
        })
        es.entity_from_dataframe(entity_id=child_id, dataframe=child_df)
        es.add_relationship(
            Relationship(es['parent']['id'], es[child_id]['user_id']))

    e_s = CustomEntity(entity_set=es)
    print(e_s.to_json())
예제 #13
0
    def test_multiple_children_all_combined(self, es, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index when children overlap partially.

        ``extra_session_df`` is added to sessions, a wishlist row for session
        id 6 is added, and wishlist row 4 is dropped before ``wishlist_log``
        is registered as a child of sessions (as a dask dataframe when the
        entity set uses dask). The recomputed last time index is compared
        entry by entry with ``true_sessions_lti``.
        """
        # test some instances in right, some in left, all when combined
        sessions = es['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist_log so extra session has child instance
        row_values = {
            'session_id': 6,
            'datetime': pd.Timestamp("2011-04-11 11:11:11"),
            'product_id': 'toothpaste'
        }
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack the two frames.
        df = pd.concat([wishlist_df, row])

        # drop instance 4 so wishlist_log does not have session id 3 instance
        df.drop(4, inplace=True)
        if isinstance(es.entities[0].df, dd.DataFrame):
            df = dd.from_pandas(df, npartitions=2)
        variable_types = {
            'id': ft.variable_types.variable.Index,
            'session_id': ft.variable_types.variable.Numeric,
            'datetime': ft.variable_types.variable.DatetimeTimeIndex,
            'product_id': ft.variable_types.variable.Categorical
        }
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=variable_types)
        relationship = Relationship(es['sessions']['id'],
                                    es['wishlist_log']['session_id'])
        es.add_relationship(relationship)
        es.add_last_time_indexes()

        # wishlist has newer events for 2 sessions
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        lti = sessions.last_time_index
        if isinstance(lti, dd.Series):
            # Materialize the dask series before sorting and iterating.
            lti = lti.compute()
        sorted_lti = lti.sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as equal; otherwise values must match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
    def test_multiple_children_left_missing(self, es, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index with an extra wishlist event.

        ``extra_session_df`` is added to sessions and a matching row for
        session id 6 is added to the wishlist data before ``wishlist_log`` is
        registered as a child of sessions (converted to dask or koalas to
        match the entity set). The recomputed last time index is compared
        entry by entry with ``true_sessions_lti``.
        """
        # test all instances in right child
        sessions = es['sessions']

        # add row to sessions so not all session instances are in log
        sessions.update_data(extra_session_df)

        # add row to wishlist df so the new session instance is in wishlist_log
        row_values = {
            'session_id': 6,
            'datetime': pd.Timestamp("2011-04-11 11:11:11"),
            'product_id': 'toothpaste'
        }
        row = pd.DataFrame(row_values, index=pd.RangeIndex(start=7, stop=8))
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the supported way to stack the two frames.
        df = pd.concat([wishlist_df, row])
        if isinstance(es.entities[0].df, dd.DataFrame):
            df = dd.from_pandas(df, npartitions=2)
        if ks and isinstance(es.entities[0].df, ks.DataFrame):
            df = ks.from_pandas(df)
        variable_types = {
            'id': ft.variable_types.variable.Index,
            'session_id': ft.variable_types.variable.Numeric,
            'datetime': ft.variable_types.variable.DatetimeTimeIndex,
            'product_id': ft.variable_types.variable.Categorical
        }
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=variable_types)
        relationship = Relationship(es['sessions']['id'],
                                    es['wishlist_log']['session_id'])
        es.add_relationship(relationship)
        es.add_last_time_indexes()

        # now wishlist_log has newer events for 3 session ids
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.Timestamp("2011-04-11 11:11:11")

        assert len(sessions.last_time_index) == 7
        sorted_lti = to_pandas(sessions.last_time_index).sort_index()
        for v1, v2 in zip(sorted_lti, true_sessions_lti):
            # Two missing values count as equal; otherwise values must match.
            assert (pd.isnull(v1) and pd.isnull(v2)) or v1 == v2
예제 #15
0
    def test_multiple_children(self, es, wishlist_df, true_sessions_lti):
        """Check the sessions last time index with two child entities.

        ``wishlist_log`` is registered as a child of sessions and the
        recomputed last time index is compared entry by entry with
        ``true_sessions_lti``.
        """
        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=wishlist_df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime')
        parent_variable = es['sessions']['id']
        child_variable = es['wishlist_log']['session_id']
        es.add_relationship(Relationship(parent_variable, child_variable))
        es.add_last_time_indexes()
        sessions = es['sessions']

        # The wishlist holds more recent events for two session ids.
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")

        assert len(sessions.last_time_index) == 6
        actual_lti = sessions.last_time_index.sort_index()
        for actual, expected in zip(actual_lti, true_sessions_lti):
            # Two missing values count as a match; otherwise compare values.
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #16
0
    def test_multiple_children_both_missing(self, es, extra_session_df,
                                            wishlist_df, true_sessions_lti):
        """Check the sessions last time index when a session has no events.

        ``extra_session_df`` is added to sessions, ``wishlist_log`` is
        registered as a child of sessions (as a dask dataframe when the entity
        set uses dask) and the recomputed last time index is compared entry by
        entry with ``true_sessions_lti`` (with entry 6 expected to be ``NaT``).
        """
        sessions = es['sessions']

        # Match the wishlist dataframe type to the entity set's backend.
        if isinstance(es.entities[0].df, dd.DataFrame):
            wishlist_df = dd.from_pandas(wishlist_df, npartitions=2)

        variable_module = ft.variable_types.variable
        column_types = {
            'id': variable_module.Index,
            'session_id': variable_module.Numeric,
            'datetime': variable_module.DatetimeTimeIndex,
            'product_id': variable_module.Categorical
        }

        # Extend sessions so that one session has no child rows at all.
        sessions.update_data(extra_session_df)

        es.entity_from_dataframe(entity_id="wishlist_log",
                                 dataframe=wishlist_df,
                                 index='id',
                                 make_index=True,
                                 time_index='datetime',
                                 variable_types=column_types)
        es.add_relationship(Relationship(es['sessions']['id'],
                                         es['wishlist_log']['session_id']))
        es.add_last_time_indexes()
        sessions = es['sessions']

        # The wishlist contributes two newer events; entry 6 stays NaT.
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")
        true_sessions_lti[3] = pd.Timestamp("2011-4-10 10:41:00")
        true_sessions_lti[6] = pd.NaT

        assert len(sessions.last_time_index) == 7
        actual_lti = sessions.last_time_index
        if isinstance(actual_lti, dd.Series):
            # Materialize the dask series before sorting and iterating.
            actual_lti = actual_lti.compute()
        for actual, expected in zip(actual_lti.sort_index(),
                                    true_sessions_lti):
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #17
0
    def test_multiple_children_right_missing(self, entityset, wishlist_df,
                                             true_sessions_lti):
        """Check the sessions last time index after dropping a wishlist row.

        Wishlist row 4 is dropped, ``wishlist_log`` is registered as a child
        of sessions and the recomputed last time index is compared entry by
        entry with ``true_sessions_lti``.
        """
        sessions = entityset['sessions']

        # Remove the wishlist instance related to id 3 so it is only in log.
        wishlist_df.drop(4, inplace=True)
        entityset.entity_from_dataframe(entity_id="wishlist_log",
                                        dataframe=wishlist_df,
                                        index='id',
                                        make_index=True,
                                        time_index='datetime')
        parent_variable = entityset['sessions']['id']
        child_variable = entityset['wishlist_log']['session_id']
        entityset.add_relationship(Relationship(parent_variable,
                                                child_variable))
        entityset.add_last_time_indexes()

        # Only session id 1 still has a newer event in wishlist_log.
        true_sessions_lti[1] = pd.Timestamp("2011-4-9 10:31:30")

        assert len(sessions.last_time_index) == 6
        actual_lti = sessions.last_time_index.sort_index()
        for actual, expected in zip(actual_lti, true_sessions_lti):
            # Two missing values count as a match; otherwise compare values.
            if pd.isnull(actual) and pd.isnull(expected):
                continue
            assert actual == expected
예제 #18
0
        assert (values_lti == pd.Series(times['values'])).all()
        assert (customers_lti == pd.Series(times['customers'])).all()
        assert (regions_lti == region_series).all()

        # add promotions entity
        promotions_df = pd.DataFrame({
            "start_date": [datetime(2011, 4, 10, 11, 12, 06)],
            "store_id": [4],
            "product_id": ['coke zero']
        })
        es.entity_from_dataframe(entity_id="promotions",
                                 dataframe=promotions_df,
                                 index='id',
                                 make_index=True,
                                 time_index='start_date')
        relationship = Relationship(es['stores']['id'],
                                    es['promotions']['store_id'])
        es.add_relationship(relationship)
        es.add_last_time_indexes()
        region_series['Mexico'] = datetime(2011, 4, 10, 11, 12, 06)
        regions_lti = es["regions"].last_time_index.sort_index()
        assert (regions_lti == region_series.sort_index()).all()


def test_head_of_entity(entityset):
    """``head`` returns DataFrames for the entity set, entity and variable."""
    log_entity = entityset['log']

    # Each of the three ``head`` entry points must return a DataFrame.
    assert isinstance(entityset.head('log', 3), pd.DataFrame)
    assert isinstance(log_entity.head(3), pd.DataFrame)
    assert isinstance(log_entity['product_id'].head(3), pd.DataFrame)

    # Five rows requested; nine columns expected for the log entity.
    assert log_entity.head(n=5).shape == (5, 9)
예제 #19
0
def test_add_parent_not_index_varible(es):
    """Relating ``regions.language`` to ``customers.region_id`` must raise
    AttributeError."""
    with pytest.raises(AttributeError):
        parent_variable = es['regions']['language']
        child_variable = es['customers']['region_id']
        es.add_relationship(Relationship(parent_variable, child_variable))
예제 #20
0
def test_add_parent_not_index_variable(es):
    """Relating ``régions.language`` to ``customers.région_id`` must raise
    AttributeError with a message matching the expected pattern."""
    expected_pattern = "Parent variable.*is not the index of entity Entity.*"
    with pytest.raises(AttributeError, match=expected_pattern):
        parent_variable = es[u'régions']['language']
        child_variable = es['customers'][u'région_id']
        es.add_relationship(Relationship(parent_variable, child_variable))