def test2_no_pruning_correlation(self):
    """Check the hierarchy filter with the correlation metric and no pruning.

    Builds a small frame of DBpedia links, expands it with
    ``direct_type_generator`` (``hierarchy=True``), runs
    ``hierarchy_based_filter`` with ``metric="correlation"`` and
    ``pruning=False``, and compares the result with the stored expected CSV
    (column/row order is ignored via ``check_like=True``).
    """
    df = pd.DataFrame({
        'entities': ['Paris', 'Buenos Aires', 'Mannheim', "München"],
        'link': [
            'http://dbpedia.org/resource/Paris',
            'http://dbpedia.org/resource/Buenos_Aires',
            'http://dbpedia.org/resource/Mannheim',
            'http://dbpedia.org/resource/Munich',
        ],
    })

    # Forward slashes, matching the other tests in this file. The earlier
    # backslash-separated, non-raw literal contained "\f", which Python
    # parses as a form-feed character, so the path could never resolve.
    expected_df = pd.read_csv(
        "test/data/feature_selection/hierarchy_based_test2_expected.csv")

    input_df = direct_type_generator(
        df, ["link"], regex_filter=['A'], result_type="boolean",
        bundled_mode=True, hierarchy=True)

    # NOTE(review): input_DG is not assigned anywhere in this method as
    # shown; presumably it should be the hierarchy graph produced by
    # direct_type_generator above -- confirm where it is meant to come from.
    output_df = hierarchy_based_filter(
        input_df, "link", threshold=0.99, G=input_DG,
        metric="correlation", pruning=False)

    pd.testing.assert_frame_equal(output_df, expected_df, check_like=True)
def test8_nan(self):
    """Run the info-gain hierarchy filter with pruning on the test3 input.

    The class hierarchy is built by hand as a directed graph and passed to
    ``hierarchy_based_filter``; the outcome is compared with the stored
    expected CSV (column/row order is ignored via ``check_like=True``).
    """
    input_df = pd.read_csv(
        "test/data/feature_selection/hill_climbing_test3_input.csv")

    # Node and edge insertion order is kept exactly as listed.
    node_uris = [
        'http://chancellor',
        'http://president',
        'http://European_politician',
        'http://head_of_state',
        'http://politician',
        'http://man',
        'http://person',
        'http://being',
    ]
    hierarchy_edges = [
        ('http://chancellor', 'http://politician'),
        ('http://president', 'http://politician'),
        ('http://chancellor', 'http://head_of_state'),
        ('http://president', 'http://head_of_state'),
        ('http://head_of_state', 'http://person'),
        ('http://European_politician', 'http://politician'),
        ('http://politician', 'http://person'),
        ('http://man', 'http://person'),
        ('http://person', 'http://being'),
    ]
    hierarchy = nx.DiGraph()
    hierarchy.add_nodes_from(node_uris)
    hierarchy.add_edges_from(hierarchy_edges)

    expected_df = pd.read_csv(
        "test/data/feature_selection/hierarchy_based_test8_expected.csv")

    filtered_df = hierarchy_based_filter(
        input_df,
        'class',
        G=hierarchy,
        threshold=0.99,
        metric="info_gain",
        pruning=True,
    )

    pd.testing.assert_frame_equal(filtered_df, expected_df, check_like=True)
def test9_callable_function(self):
    """Run the hierarchy filter with a user-supplied callable as the metric.

    A hand-built class hierarchy and a simple custom metric are passed to
    ``hierarchy_based_filter`` with pruning enabled; the outcome is compared
    with the stored expected CSV (column/row order is ignored via
    ``check_like=True``).
    """
    def fake_metric(df_from_hierarchy, l, d):
        # Fraction of rows in which columns ``l`` and ``d`` hold equal values.
        left_column = df_from_hierarchy[l]
        right_column = df_from_hierarchy[d]
        equivalence = left_column == right_column
        return equivalence.sum() / len(equivalence)

    input_df = pd.read_csv(
        "test/data/feature_selection/hill_climbing_test1_input.csv")

    # Node and edge insertion order is kept exactly as listed.
    node_uris = [
        'http://chancellor',
        'http://president',
        'http://European_politician',
        'http://head_of_state',
        'http://politician',
        'http://man',
        'http://person',
        'http://being',
    ]
    hierarchy_edges = [
        ('http://chancellor', 'http://politician'),
        ('http://president', 'http://politician'),
        ('http://chancellor', 'http://head_of_state'),
        ('http://president', 'http://head_of_state'),
        ('http://head_of_state', 'http://person'),
        ('http://European_politician', 'http://politician'),
        ('http://politician', 'http://person'),
        ('http://man', 'http://person'),
        ('http://person', 'http://being'),
    ]
    hierarchy = nx.DiGraph()
    hierarchy.add_nodes_from(node_uris)
    hierarchy.add_edges_from(hierarchy_edges)

    expected_df = pd.read_csv(
        "test/data/feature_selection/hierarchy_based_test9_expected.csv")

    filtered_df = hierarchy_based_filter(
        input_df,
        'uri_bool_http://class',
        G=hierarchy,
        threshold=0.99,
        metric=fake_metric,
        pruning=True,
    )

    pd.testing.assert_frame_equal(filtered_df, expected_df, check_like=True)
def transform(self, X, y=None):
    """Apply ``hierarchy_based_filter`` to ``X`` with this object's settings.

    Args:
        X: Data handed to ``hierarchy_based_filter`` as its first argument.
        y: Not used by this method.

    Returns:
        The value returned by ``hierarchy_based_filter``.
    """
    # Arguments are forwarded positionally, in the order the filter expects.
    return hierarchy_based_filter(
        X,
        self.label_column,
        self.G,
        self.threshold,
        self.metric,
        self.pruning,
        self.all_remove,
        self.progress,
    )