Пример #1
0
 def _make_entity_set(self, X):
     """Register the input data as entity ``"X"`` in a fresh ``EntitySet``.

     The entity is indexed by ``self.index``. When ``X`` has no column with
     that name, ``make_index=True`` is additionally passed to
     ``entity_from_dataframe``.

     Returns the value returned by ``entity_from_dataframe``.
     """
     entity_set = EntitySet()
     entity_kwargs = {"entity_id": "X", "dataframe": X, "index": self.index}
     if self.index not in X.columns:
         # The index column is absent from X, so request that one be made.
         entity_kwargs["make_index"] = True
     return entity_set.entity_from_dataframe(**entity_kwargs)
Пример #2
0
def test_encode_unknown_features():
    """Check the encoded column names when ``include_unknown=True``.

    The categorical column deliberately contains the literal string
    "unknown" as one of its values.
    """
    frame = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})

    entityset = EntitySet('test')
    entityset.entity_from_dataframe(entity_id='a', dataframe=frame,
                                    index='index', make_index=True)
    feature_matrix, definitions = dfs(entityset=entityset, target_entity='a')

    # Encode with the extra "is unknown" column requested
    encoded, _ = encode_features(feature_matrix, definitions, include_unknown=True)

    expected_columns = [
        'category = unknown',
        'category = e',
        'category = d',
        'category = c',
        'category = b',
        'category is unknown',
    ]
    assert list(encoded.columns) == expected_columns
def test_encode_unknown_features():
    """Verify the columns ``encode_features`` yields with ``include_unknown=True``.

    One of the category values is the literal string "unknown".
    """
    categories = ['unknown', 'b', 'c', 'd', 'e']
    data = pd.DataFrame({'category': categories})

    test_es = EntitySet('test')
    test_es.entity_from_dataframe(entity_id='a', dataframe=data, index='index', make_index=True)
    matrix, defs = dfs(entityset=test_es, target_entity='a')

    # Request the additional "is unknown" column
    matrix_enc, _ = encode_features(matrix, defs, include_unknown=True)

    actual_columns = list(matrix_enc.columns)
    assert actual_columns == ['category = unknown', 'category = e', 'category = d',
                              'category = c', 'category = b', 'category is unknown']
Пример #4
0
def test_encode_features_drop_first():
    """Check the encoded column count when ``drop_first=True``.

    With five distinct categories the test expects four encoded columns,
    and two when ``top_n=3`` is also given.
    """
    frame = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    entityset = EntitySet('test')
    entityset.entity_from_dataframe(entity_id='a', dataframe=frame,
                                    index='index', make_index=True)
    feature_matrix, definitions = dfs(entityset=entityset, target_entity='a')

    # All categories encoded, first one dropped
    encoded, _ = encode_features(feature_matrix, definitions,
                                 drop_first=True, include_unknown=False)
    assert len(encoded.columns) == 4

    # Only the top three categories encoded, first one dropped
    encoded, _ = encode_features(feature_matrix, definitions, top_n=3,
                                 drop_first=True, include_unknown=False)
    assert len(encoded.columns) == 2
def test_cfm_approximate_correct_ordering():
    """Check that ``approximate`` keeps feature matrix rows aligned and equal.

    Builds 1000 trips (two distinct flight times, random flight ids and
    durations), normalizes a ``flights`` entity out of them, and computes the
    same features twice: once exactly and once with
    ``approximate=Timedelta(2, 'd')``. For each matrix the
    (instance, time) index must match the ``trip_id`` / ``flight_time``
    identity feature columns, and the two matrices must agree element by
    element, with a pair of nulls counted as equal.
    """
    trips = {
        'trip_id': list(range(1000)),
        'flight_time': [datetime(1998, 4, 2)] * 350 + [datetime(1997, 4, 3)] * 650,
        'flight_id': [randint(1, 25) for _ in range(1000)],
        'trip_duration': [randint(1, 999) for _ in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    # Keep only direct features whose first base feature is an aggregation.
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0],
                                  AggregationPrimitive)]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])
    all_features = flight_features + [property_feature, time_feature]

    feature_matrix = calculate_feature_matrix(all_features,
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    index_values = feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values
    assert np.all(index_values == feature_matrix[['trip_id', 'flight_time']].values)

    feature_matrix_2 = calculate_feature_matrix(all_features,
                                                cutoff_time=cutoff_time,
                                                cutoff_time_in_index=True,
                                                approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    index_values_2 = feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values
    assert np.all(index_values_2 == feature_matrix_2[['trip_id', 'flight_time']].values)

    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            # Report the mismatch in the failure message instead of dropping
            # into a debugger, which blocks non-interactive test runs.
            assert (pd.isnull(x) and pd.isnull(y)) or (x == y), (
                "column %r differs between exact and approximate: %r != %r"
                % (column, x, y))
Пример #6
0
def test_serialization(es):
    """Round-trip ``es`` through ``to_pickle`` / ``read_pickle`` and compare.

    The pickle is written to ``test_entityset.p`` next to the
    ``integration_data`` module. Any previous copy is removed first, and the
    new one is removed afterwards even when the round trip or the comparison
    fails, so a failing run does not leave files behind.
    """
    dirname = os.path.dirname(integration_data.__file__)
    path = os.path.join(dirname, 'test_entityset.p')
    if os.path.exists(path):
        shutil.rmtree(path)
    try:
        es.to_pickle(path)
        new_es = EntitySet.read_pickle(path)
        # __eq__ is called explicitly so the ``deep`` keyword can be passed.
        assert es.__eq__(new_es, deep=True)
    finally:
        # Guarded so a failure before anything was written is not masked
        # by a FileNotFoundError from the cleanup itself.
        if os.path.exists(path):
            shutil.rmtree(path)
Пример #7
0
def test_raise_key_error_missing_entity(es):
    """Looking up an entity that does not exist must raise ``KeyError``.

    The message is checked for the ``es`` fixture (reported as "ecommerce",
    presumably the fixture's id) and for an ``EntitySet`` built without
    arguments (reported as "entity set").
    """
    missing_id = "this entity doesn't exist"

    expected = "Entity this entity doesn't exist does not exist in ecommerce"
    with pytest.raises(KeyError, match=expected):
        es[missing_id]

    unnamed_es = EntitySet()
    expected = "Entity this entity doesn't exist does not exist in entity set"
    with pytest.raises(KeyError, match=expected):
        unnamed_es[missing_id]
Пример #8
0
def test_serialization(es):
    """Pickle ``es`` to disk, read it back, and require deep equality.

    The target is ``test_entityset.p`` in the directory of the
    ``integration_data`` module. A stale copy is deleted before writing, and
    the fresh copy is deleted in a ``finally`` clause so that a failed
    assertion or read does not leave it behind.
    """
    data_dir = os.path.dirname(integration_data.__file__)
    path = os.path.join(data_dir, 'test_entityset.p')
    if os.path.exists(path):
        shutil.rmtree(path)
    try:
        es.to_pickle(path)
        restored = EntitySet.read_pickle(path)
        # Explicit __eq__ call: ``==`` cannot forward the ``deep`` keyword.
        assert es.__eq__(restored, deep=True)
    finally:
        # Only clean up what exists, so an earlier failure is not hidden
        # behind a cleanup error.
        if os.path.exists(path):
            shutil.rmtree(path)
Пример #9
0
def test_raise_key_error_missing_dataframe(es):
    """Looking up a dataframe that does not exist must raise ``KeyError``.

    The message is checked for the ``es`` fixture (reported as "ecommerce",
    presumably the fixture's id) and for an ``EntitySet`` built without
    arguments (reported as "entity set").
    """
    missing_name = "testing"

    expected = "DataFrame testing does not exist in ecommerce"
    with pytest.raises(KeyError, match=expected):
        es[missing_name]

    unnamed_es = EntitySet()
    expected = "DataFrame testing does not exist in entity set"
    with pytest.raises(KeyError, match=expected):
        unnamed_es[missing_name]
Пример #10
0
def test_encode_features_drop_first():
    """Check the encoded column count when ``drop_first=True``.

    With five distinct categories the test expects four encoded columns,
    and two when ``top_n=3`` is also given.
    """
    frame = pd.DataFrame({"category": ["ao", "b", "c", "d", "e"]})
    frame = frame.astype({"category": "category"})

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    feature_matrix, definitions = dfs(entityset=entityset, target_dataframe_name="a")

    # All categories encoded, first one dropped
    encoded, _ = encode_features(
        feature_matrix, definitions, drop_first=True, include_unknown=False
    )
    assert len(encoded.columns) == 4

    # Only the top three categories encoded, first one dropped
    encoded, _ = encode_features(
        feature_matrix, definitions, top_n=3, drop_first=True, include_unknown=False
    )
    assert len(encoded.columns) == 2
Пример #11
0
def test_encode_features_matches_calculate_feature_matrix():
    """Encoded output must equal a recalculation from the encoded definitions.

    The matrix returned by ``encode_features`` is compared, both as a frame
    and by its ``ww._schema``, with the matrix that
    ``calculate_feature_matrix`` produces from the encoded feature
    definitions.
    """
    frame = pd.DataFrame({"category": ["b", "c", "d", "e"]})
    frame = frame.astype({"category": "category"})

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    matrix, definitions = dfs(entityset=entityset, target_dataframe_name="a")

    encoded_matrix, encoded_defs = encode_features(
        matrix, definitions, to_encode=["category"]
    )
    recalculated = calculate_feature_matrix(encoded_defs, entityset=entityset)

    pd.testing.assert_frame_equal(encoded_matrix, recalculated)
    assert recalculated.ww._schema == encoded_matrix.ww._schema
Пример #12
0
def test_encode_features_matches_calculate_feature_matrix():
    """Encoded column dtype must match a recalculation from the encoded definitions.

    The ``'category = e'`` column returned by ``encode_features`` must be
    boolean, and must have the same dtype as the column that
    ``calculate_feature_matrix`` produces from the encoded feature
    definitions.
    """
    frame = pd.DataFrame({'category': ['b', 'c', 'd', 'e']})

    entityset = EntitySet('test')
    entityset.entity_from_dataframe(entity_id='a', dataframe=frame,
                                    index='index', make_index=True)
    matrix, definitions = dfs(entityset=entityset, target_entity='a')

    encoded_matrix, encoded_defs = encode_features(matrix, definitions,
                                                   to_encode=['category'])
    recalculated = calculate_feature_matrix(encoded_defs, entityset=entityset)

    encoded_dtype = encoded_matrix['category = e'].dtypes
    assert encoded_dtype == bool
    assert encoded_dtype == recalculated['category = e'].dtypes
Пример #13
0
def test_encode_features_matches_calculate_feature_matrix():
    """Compare ``encode_features`` output with a recalculated feature matrix.

    The encoded matrix and the matrix computed by
    ``calculate_feature_matrix`` from the encoded definitions must be equal
    as frames and have equal ``ww._schema`` values.
    """
    raw = pd.DataFrame({'category': ['b', 'c', 'd', 'e']})
    data = raw.astype({'category': 'category'})

    test_es = EntitySet('test')
    test_es.add_dataframe(dataframe_name='a', dataframe=data,
                          index='index', make_index=True)
    matrix, defs = dfs(entityset=test_es, target_dataframe_name='a')

    matrix_enc, defs_enc = encode_features(matrix, defs, to_encode=['category'])
    matrix_calc = calculate_feature_matrix(defs_enc, entityset=test_es)

    pd.testing.assert_frame_equal(matrix_enc, matrix_calc)
    assert matrix_calc.ww._schema == matrix_enc.ww._schema
Пример #14
0
def main():
    """Print the JSON of a sample ``Holding`` and of a small entity set.

    The entity set has a ``parent`` entity and two child entities
    (``child`` and ``child_0``), each related to the parent through
    ``parent.id`` -> ``<child>.user_id``. It is wrapped in ``CustomEntity``
    before being printed.
    """
    sample_holding = Holding('HDFCBANK', Price('buy', 97.89), 5, 112.1, 95.4)
    print(sample_holding.to_json())

    # The same seven values are used for every id-like column.
    keys = [10, 14, 24, 34, 54, 64, 84]
    flags = [0, 1, 0, 1, 1, 0, 1]

    parent_frame = pd.DataFrame({
        'id': list(keys),
        'age': list(keys),
        'salary': [1, 1, 10000, 25000, 50000, 60000, 80000],
    })
    # The return value of entity_from_dataframe is used as the entity set.
    es = EntitySet('user').entity_from_dataframe(entity_id='parent',
                                                 dataframe=parent_frame)

    # Each child entity is added and then linked to the parent.
    for child_id, attribute in (('child', 'sex'), ('child_0', 'diet')):
        child_frame = pd.DataFrame({
            'id': list(keys),
            'user_id': list(keys),
            attribute: list(flags),
        })
        es.entity_from_dataframe(entity_id=child_id, dataframe=child_frame)
        link = Relationship(es['parent']['id'], es[child_id]['user_id'])
        es.add_relationship(link)

    wrapped = CustomEntity(entity_set=es)
    print(wrapped.to_json())
Пример #15
0
def test_encode_unknown_features():
    """Check the encoded column names when ``include_unknown=True``.

    The categorical column deliberately contains the literal string
    "unknown" as one of its values.
    """
    frame = pd.DataFrame({"category": ["unknown", "b", "c", "d", "e"]})
    frame = frame.astype({"category": "category"})

    entityset = EntitySet("test")
    entityset.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    feature_matrix, definitions = dfs(entityset=entityset, target_dataframe_name="a")

    # Encode with the extra "is unknown" column requested
    encoded, _ = encode_features(feature_matrix, definitions, include_unknown=True)

    expected_columns = ["category = unknown"]
    expected_columns += ["category = e", "category = d", "category = c", "category = b"]
    expected_columns.append("category is unknown")
    assert list(encoded.columns) == expected_columns
def test_cfm_approximate_correct_ordering():
    """Check that ``approximate`` keeps feature matrix rows aligned and equal.

    Builds 1000 trips (two distinct flight times, random flight ids and
    durations), normalizes a ``flights`` entity out of them, and computes the
    same features twice: once exactly and once with
    ``approximate=Timedelta(2, 'd')``. For each matrix the
    (instance, time) index must match the ``trip_id`` / ``flight_time``
    identity feature columns, and the two matrices must agree element by
    element, with a pair of nulls counted as equal.
    """
    trips = {
        'trip_id': list(range(1000)),
        'flight_time':
        [datetime(1998, 4, 2)] * 350 + [datetime(1997, 4, 3)] * 650,
        'flight_id': [randint(1, 25) for _ in range(1000)],
        'trip_duration': [randint(1, 999) for _ in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips",
                             dataframe=df,
                             index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips",
                        new_entity_id="flights",
                        index="flight_id",
                        make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    # Keep only direct features whose first base feature is an aggregation.
    flight_features = [
        feature for feature in features if isinstance(feature, DirectFeature)
        and isinstance(feature.base_features[0], AggregationPrimitive)
    ]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    cutoff_time = pd.DataFrame.from_dict({
        'instance_id': df['trip_id'],
        'time': df['flight_time']
    })
    time_feature = IdentityFeature(es['trips']['flight_time'])
    all_features = flight_features + [property_feature, time_feature]

    feature_matrix = calculate_feature_matrix(all_features,
                                              es,
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    index_values = feature_matrix.reset_index('time').reset_index()[[
        'instance', 'time'
    ]].values
    assert np.all(
        index_values == feature_matrix[['trip_id', 'flight_time']].values)

    feature_matrix_2 = calculate_feature_matrix(
        all_features,
        es,
        cutoff_time=cutoff_time,
        cutoff_time_in_index=True,
        approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    index_values_2 = feature_matrix_2.reset_index('time').reset_index()[[
        'instance', 'time'
    ]].values
    assert np.all(
        index_values_2 == feature_matrix_2[['trip_id', 'flight_time']].values)

    for column in feature_matrix:
        for x, y in zip(feature_matrix[column], feature_matrix_2[column]):
            # Report the mismatch in the failure message instead of dropping
            # into a debugger, which blocks non-interactive test runs.
            assert (pd.isnull(x) and pd.isnull(y)) or (x == y), (
                "column %r differs between exact and approximate: %r != %r"
                % (column, x, y))
Пример #17
0
def test_encode_features_drop_first():
    """Verify how many columns ``encode_features`` emits with ``drop_first=True``.

    Five distinct categories are expected to give four columns, and two
    columns once ``top_n=3`` is added.
    """
    raw = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    data = raw.astype({'category': 'category'})

    test_es = EntitySet('test')
    test_es.add_dataframe(dataframe_name='a', dataframe=data,
                          index='index', make_index=True)
    matrix, defs = dfs(entityset=test_es, target_dataframe_name='a')

    # Every category encoded, minus the dropped first one
    matrix_enc, _ = encode_features(matrix, defs,
                                    drop_first=True, include_unknown=False)
    assert len(matrix_enc.columns) == 4

    # Top three categories only, minus the dropped first one
    matrix_enc, _ = encode_features(matrix, defs, top_n=3,
                                    drop_first=True, include_unknown=False)
    assert len(matrix_enc.columns) == 2