def _make_entity_set(self, X):
    """Build and return an EntitySet wrapping the input data.

    A single entity named "X" is created from the dataframe; a surrogate
    index is requested only when ``self.index`` is not already present
    among the columns of ``X``.
    """
    entity_set = EntitySet()
    kwargs = dict(entity_id="X", dataframe=X, index=self.index)
    if self.index not in X.columns:
        # Configured index column is missing: ask featuretools to create one.
        kwargs["make_index"] = True
    return entity_set.entity_from_dataframe(**kwargs)
def test_encode_unknown_features():
    """include_unknown=True appends an "is unknown" indicator column while a
    literal "unknown" category is still encoded as an ordinary value."""
    # Dataframe with categorical column with "unknown" string
    frame = pd.DataFrame({'category': ['unknown', 'b', 'c', 'd', 'e']})
    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=frame, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')
    # Specify unknown token for replacement
    encoded, _encoded_defs = encode_features(features, feature_defs, include_unknown=True)
    expected = [
        'category = unknown',
        'category = e',
        'category = d',
        'category = c',
        'category = b',
        'category is unknown',
    ]
    assert list(encoded.columns) == expected
def test_encode_features_drop_first():
    """drop_first removes one dummy column per categorical feature."""
    frame = pd.DataFrame({'category': ['ao', 'b', 'c', 'd', 'e']})
    es = EntitySet('test')
    es.entity_from_dataframe(entity_id='a', dataframe=frame, index='index', make_index=True)
    features, feature_defs = dfs(entityset=es, target_entity='a')
    # Five categories -> four dummies once the first is dropped.
    encoded, _ = encode_features(features, feature_defs, drop_first=True,
                                 include_unknown=False)
    assert len(encoded.columns) == 4
    # top_n=3 keeps three categories; dropping the first leaves two.
    encoded, feature_defs = encode_features(features, feature_defs, top_n=3,
                                            drop_first=True, include_unknown=False)
    assert len(encoded.columns) == 2
def test_cfm_approximate_correct_ordering():
    """calculate_feature_matrix must keep rows aligned with the cutoff-time
    index, both for exact cutoffs and for approximate (2-day bucketed) ones.

    Fix: the assertion loop previously dropped into ``pdb.set_trace()`` on
    the first mismatch — a leftover debugging breakpoint that would hang any
    non-interactive test run. The loop now asserts directly.
    """
    trips = {
        'trip_id': list(range(1000)),
        'flight_time': [datetime(1998, 4, 2)] * 350 + [datetime(1997, 4, 3)] * 650,
        'flight_id': [randint(1, 25) for _ in range(1000)],
        'trip_duration': [randint(1, 999) for _ in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips", dataframe=df, index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips", new_entity_id="flights",
                        index="flight_id", make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    # Only direct features built on aggregations are affected by approximation.
    flight_features = [feature for feature in features
                       if isinstance(feature, DirectFeature) and
                       isinstance(feature.base_features[0], AggregationPrimitive)]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    cutoff_time = pd.DataFrame.from_dict({'instance_id': df['trip_id'],
                                          'time': df['flight_time']})
    time_feature = IdentityFeature(es['trips']['flight_time'])
    feature_matrix = calculate_feature_matrix(
        flight_features + [property_feature, time_feature],
        cutoff_time_in_index=True,
        cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert np.all(
        feature_matrix.reset_index('time').reset_index()[['instance', 'time']].values ==
        feature_matrix[['trip_id', 'flight_time']].values)
    feature_matrix_2 = calculate_feature_matrix(
        flight_features + [property_feature, time_feature],
        cutoff_time=cutoff_time,
        cutoff_time_in_index=True,
        approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert np.all(
        feature_matrix_2.reset_index('time').reset_index()[['instance', 'time']].values ==
        feature_matrix_2[['trip_id', 'flight_time']].values)
    # Every cell must match the exact computation; two NaNs count as equal.
    for column in feature_matrix:
        for exact, approx in zip(feature_matrix[column], feature_matrix_2[column]):
            assert (pd.isnull(exact) and pd.isnull(approx)) or (exact == approx)
def test_serialization(es):
    """An entity set round-trips through pickle with deep equality."""
    data_dir = os.path.dirname(integration_data.__file__)
    path = os.path.join(data_dir, 'test_entityset.p')
    # Clear any leftover artifact from a previous run before writing.
    if os.path.exists(path):
        shutil.rmtree(path)
    es.to_pickle(path)
    reloaded = EntitySet.read_pickle(path)
    assert es.__eq__(reloaded, deep=True)
    shutil.rmtree(path)
def test_raise_key_error_missing_entity(es):
    """Indexing a nonexistent entity raises KeyError; the message names the
    entity set when it has an id and uses a generic phrase otherwise."""
    missing = "this entity doesn't exist"
    error_text = "Entity this entity doesn't exist does not exist in ecommerce"
    with pytest.raises(KeyError, match=error_text):
        es[missing]
    # An entity set created without an id falls back to "entity set".
    es_without_id = EntitySet()
    error_text = "Entity this entity doesn't exist does not exist in entity set"
    with pytest.raises(KeyError, match=error_text):
        es_without_id[missing]
def test_raise_key_error_missing_dataframe(es):
    """Indexing a nonexistent dataframe raises KeyError; the message names
    the entity set when it has an id and uses a generic phrase otherwise."""
    missing = "testing"
    with pytest.raises(KeyError, match="DataFrame testing does not exist in ecommerce"):
        es[missing]
    # An entity set created without an id falls back to "entity set".
    es_without_id = EntitySet()
    with pytest.raises(KeyError, match="DataFrame testing does not exist in entity set"):
        es_without_id[missing]
def test_encode_features_drop_first():
    """drop_first removes one dummy column per categorical feature."""
    frame = pd.DataFrame({"category": ["ao", "b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )
    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")
    # Five categories -> four dummies once the first is dropped.
    encoded, _ = encode_features(
        features, feature_defs, drop_first=True, include_unknown=False
    )
    assert len(encoded.columns) == 4
    # top_n=3 keeps three categories; dropping the first leaves two.
    encoded, feature_defs = encode_features(
        features, feature_defs, top_n=3, drop_first=True, include_unknown=False
    )
    assert len(encoded.columns) == 2
def test_encode_features_matches_calculate_feature_matrix():
    """encode_features output must equal recomputing the encoded feature
    definitions with calculate_feature_matrix, schema included."""
    frame = pd.DataFrame({"category": ["b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )
    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")
    encoded, encoded_defs = encode_features(
        features, feature_defs, to_encode=["category"]
    )
    recomputed = calculate_feature_matrix(encoded_defs, entityset=pd_es)
    pd.testing.assert_frame_equal(encoded, recomputed)
    assert recomputed.ww._schema == encoded.ww._schema
def test_encode_features_matches_calculate_feature_matrix():
    """Encoded dummy columns carry the same bool dtype whether produced by
    encode_features or by calculate_feature_matrix."""
    frame = pd.DataFrame({'category': ['b', 'c', 'd', 'e']})
    pd_es = EntitySet('test')
    pd_es.entity_from_dataframe(entity_id='a', dataframe=frame, index='index',
                                make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_entity='a')
    encoded, encoded_defs = encode_features(features, feature_defs,
                                            to_encode=['category'])
    recomputed = calculate_feature_matrix(encoded_defs, entityset=pd_es)
    assert encoded['category = e'].dtypes == bool
    assert encoded['category = e'].dtypes == recomputed['category = e'].dtypes
def test_encode_features_matches_calculate_feature_matrix():
    """encode_features output must equal recomputing the encoded feature
    definitions with calculate_feature_matrix, schema included."""
    frame = pd.DataFrame(
        {'category': ['b', 'c', 'd', 'e']}
    ).astype({'category': 'category'})
    pd_es = EntitySet('test')
    pd_es.add_dataframe(dataframe_name='a',
                        dataframe=frame,
                        index='index',
                        make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name='a')
    encoded, encoded_defs = encode_features(features,
                                            feature_defs,
                                            to_encode=['category'])
    recomputed = calculate_feature_matrix(encoded_defs, entityset=pd_es)
    pd.testing.assert_frame_equal(encoded, recomputed)
    assert recomputed.ww._schema == encoded.ww._schema
def main():
    """Demo: serialize a Holding, then build a small parent/child entity
    set and serialize it through CustomEntity."""
    holding = Holding(
        'HDFCBANK',
        Price('buy', 97.89),
        5,
        112.1,
        95.4
    )
    print(holding.to_json())
    ids = [10, 14, 24, 34, 54, 64, 84]
    es = EntitySet('user').entity_from_dataframe(
        entity_id='parent',
        dataframe=pd.DataFrame({
            'id': ids,
            'age': ids,
            'salary': [1, 1, 10000, 25000, 50000, 60000, 80000],
        }))
    # Two child entities, each keyed back to the parent through user_id.
    for child_id, column, values in (
        ('child', 'sex', [0, 1, 0, 1, 1, 0, 1]),
        ('child_0', 'diet', [0, 1, 0, 1, 1, 0, 1]),
    ):
        es.entity_from_dataframe(
            entity_id=child_id,
            dataframe=pd.DataFrame({
                'id': ids,
                'user_id': ids,
                column: values,
            }))
        es.add_relationship(
            Relationship(
                es['parent']['id'],
                es[child_id]['user_id']
            )
        )
    e_s = CustomEntity(entity_set=es)
    print(e_s.to_json())
def test_encode_unknown_features():
    """include_unknown=True appends an "is unknown" indicator column while a
    literal "unknown" category is still encoded as an ordinary value."""
    # Dataframe with categorical column with "unknown" string
    frame = pd.DataFrame({"category": ["unknown", "b", "c", "d", "e"]}).astype(
        {"category": "category"}
    )
    pd_es = EntitySet("test")
    pd_es.add_dataframe(
        dataframe_name="a", dataframe=frame, index="index", make_index=True
    )
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name="a")
    # Specify unknown token for replacement
    encoded, _ = encode_features(features, feature_defs, include_unknown=True)
    expected = [
        "category = unknown",
        "category = e",
        "category = d",
        "category = c",
        "category = b",
        "category is unknown",
    ]
    assert list(encoded.columns) == expected
def test_cfm_approximate_correct_ordering():
    """calculate_feature_matrix must keep rows aligned with the cutoff-time
    index, both for exact cutoffs and for approximate (2-day bucketed) ones.

    Fix: the assertion loop previously dropped into ``pdb.set_trace()`` on
    the first mismatch — a leftover debugging breakpoint that would hang any
    non-interactive test run. The loop now asserts directly.
    """
    trips = {
        'trip_id': list(range(1000)),
        'flight_time': [datetime(1998, 4, 2)] * 350 + [datetime(1997, 4, 3)] * 650,
        'flight_id': [randint(1, 25) for _ in range(1000)],
        'trip_duration': [randint(1, 999) for _ in range(1000)]
    }
    df = pd.DataFrame.from_dict(trips)
    es = EntitySet('flights')
    es.entity_from_dataframe("trips", dataframe=df, index="trip_id",
                             time_index='flight_time')
    es.normalize_entity(base_entity_id="trips", new_entity_id="flights",
                        index="flight_id", make_time_index=True)
    features = dfs(entityset=es, target_entity='trips', features_only=True)
    # Only direct features built on aggregations are affected by approximation.
    flight_features = [
        feature for feature in features
        if isinstance(feature, DirectFeature)
        and isinstance(feature.base_features[0], AggregationPrimitive)
    ]
    property_feature = IdentityFeature(es['trips']['trip_id'])
    cutoff_time = pd.DataFrame.from_dict({
        'instance_id': df['trip_id'],
        'time': df['flight_time']
    })
    time_feature = IdentityFeature(es['trips']['flight_time'])
    feature_matrix = calculate_feature_matrix(
        flight_features + [property_feature, time_feature],
        es,
        cutoff_time_in_index=True,
        cutoff_time=cutoff_time)
    feature_matrix.index.names = ['instance', 'time']
    assert np.all(
        feature_matrix.reset_index('time').reset_index()[['instance', 'time']]
        .values == feature_matrix[['trip_id', 'flight_time']].values)
    feature_matrix_2 = calculate_feature_matrix(
        flight_features + [property_feature, time_feature],
        es,
        cutoff_time=cutoff_time,
        cutoff_time_in_index=True,
        approximate=Timedelta(2, 'd'))
    feature_matrix_2.index.names = ['instance', 'time']
    assert np.all(
        feature_matrix_2.reset_index('time').reset_index()[[
            'instance', 'time'
        ]].values == feature_matrix_2[['trip_id', 'flight_time']].values)
    # Every cell must match the exact computation; two NaNs count as equal.
    for column in feature_matrix:
        for exact, approx in zip(feature_matrix[column], feature_matrix_2[column]):
            assert (pd.isnull(exact) and pd.isnull(approx)) or (exact == approx)
def test_encode_features_drop_first():
    """drop_first removes one dummy column per categorical feature."""
    frame = pd.DataFrame(
        {'category': ['ao', 'b', 'c', 'd', 'e']}
    ).astype({'category': 'category'})
    pd_es = EntitySet('test')
    pd_es.add_dataframe(dataframe_name='a',
                        dataframe=frame,
                        index='index',
                        make_index=True)
    features, feature_defs = dfs(entityset=pd_es, target_dataframe_name='a')
    # Five categories -> four dummies once the first is dropped.
    encoded, _ = encode_features(features,
                                 feature_defs,
                                 drop_first=True,
                                 include_unknown=False)
    assert len(encoded.columns) == 4
    # top_n=3 keeps three categories; dropping the first leaves two.
    encoded, feature_defs = encode_features(features,
                                            feature_defs,
                                            top_n=3,
                                            drop_first=True,
                                            include_unknown=False)
    assert len(encoded.columns) == 2