Exemplo n.º 1
0
    def test1_default_boolean(self):
        """Compare ``data_fuser`` output with the expected test1 fixture.

        The input frame is loaded from CSV and one column is cast to
        ``bool``. Clusters come from ``get_fusion_clusters`` applied to the
        match fixture with ``threshold=0.85``. ``data_fuser`` is called with
        only the frame and the clusters, so all of its other parameters
        keep their defaults.
        """
        bool_column = 'new_link_in_boolean_http://www.wikidata.org/entity/Q4587626'

        # Load the input and cast the boolean column explicitly.
        source_frame = pd.read_csv(
            "test/data/fusion/input_df_test1.csv"
        ).astype({bool_column: bool})

        match_frame = pd.read_csv("test/data/fusion/input_matches_test1.csv")
        fusion_clusters = get_fusion_clusters(match_frame, threshold=0.85)

        expected_frame = pd.read_csv(
            "test/data/fusion/fused_expected_test1.csv")

        fused_frame = data_fuser(source_frame, fusion_clusters)

        pd.testing.assert_frame_equal(fused_frame, expected_frame)
Exemplo n.º 2
0
    def test3_default_string(self):
        """Compare ``data_fuser`` output with the expected test3 fixture.

        Three hand-written clusters of URIs are passed to ``data_fuser``
        together with the test3 input frame; no fusion methods are given,
        so ``data_fuser`` runs with its defaults. The comparison uses
        ``check_like=True``.
        """
        source_frame = pd.read_csv("test/data/fusion/input_df_test3.csv")

        # One tuple per cluster; each is turned into a set below.
        cluster_members = (
            ('http://a', 'http://b', 'http://c'),
            ('http://d', 'http://e'),
            ('http://f', 'http://g', 'http://h'),
        )
        fusion_clusters = [set(members) for members in cluster_members]

        expected_frame = pd.read_csv(
            "test/data/fusion/fused_expected_test3.csv")

        fused_frame = data_fuser(source_frame, fusion_clusters)

        pd.testing.assert_frame_equal(
            fused_frame, expected_frame, check_like=True)
Exemplo n.º 3
0
    def test4_callable_function(self):
        """Compare ``data_fuser`` output when given a callable string method.

        The test3 input frame and three hand-written URI clusters are fused
        with ``string_method_multiple`` set to a local function, and the
        result is compared with the test4 expected fixture using
        ``check_like=True``.
        """
        source_frame = pd.read_csv("test/data/fusion/input_df_test3.csv")

        # One tuple per cluster; each is turned into a set below.
        cluster_members = (
            ('http://a', 'http://b', 'http://c'),
            ('http://d', 'http://e'),
            ('http://f', 'http://g', 'http://h'),
        )
        fusion_clusters = [set(members) for members in cluster_members]

        expected_frame = pd.read_csv(
            "test/data/fusion/fused_expected_test4.csv")

        def own_function(x):
            # Drop missing entries, then return NaN if nothing is left,
            # otherwise the result of ``sum()`` on the remaining values.
            remaining = x.dropna()
            return np.nan if remaining.empty else remaining.sum()

        fused_frame = data_fuser(
            source_frame,
            fusion_clusters,
            string_method_multiple=own_function,
        )

        pd.testing.assert_frame_equal(
            fused_frame, expected_frame, check_like=True)
 def transform(self, X, y=None):
     """Match, cluster and fuse ``X`` using the configured settings.

     Each callable in ``self.matching_functions`` is applied to ``X``; the
     results are combined by ``matching_combiner``, clusters are derived
     from the combined result by ``get_fusion_clusters`` using
     ``self.threshold``, and ``data_fuser`` is then called with ``X``, the
     clusters and the fusion settings stored on the instance.

     Args:
         X: Input handed to every matching function and to ``data_fuser``.
         y: Not used by this method.

     Returns:
         Whatever ``data_fuser`` returns.
     """
     # TODO: Add progress & caching attribute to matching_function call?
     per_matcher_results = [matcher(X) for matcher in self.matching_functions]

     combined_matches = matching_combiner(
         matching_result_dfs=per_matcher_results,
         method=self.method,
         columns=self.columns,
         ignore_single_missings=self.ignore_single_missings,
         weights=self.weights,
         merge_on=self.merge_on,
     )

     fusion_clusters = get_fusion_clusters(
         combined_matches, self.threshold, progress=self.progress)

     # Passed positionally, in exactly this order, after X and the clusters.
     positional_fusion_args = (
         self.boolean_method_single,
         self.boolean_method_multiple,
         self.numeric_method_single,
         self.numeric_method_multiple,
         self.string_method_single,
         self.string_method_multiple,
         self.provenance_regex,
     )
     return data_fuser(
         X, fusion_clusters, *positional_fusion_args, progress=self.progress)