def test2_nan(self):
        """Check provenance on column labels that include a NaN label.

        Called with its default arguments, provenance is expected to return
        the 'http://dbpedia.org//b' label.
        """
        data = {
            np.nan: ['haus', 'usa', 'frauenkirche', 'aua'],
            'http://dbpedia.org//b': ['chicoree', 'usa', 'mantra', 'mantel'],
            'www.test_ernst.de/c': ['haus', True, 'mantra', 'sonnengruss']
        }
        labels = pd.DataFrame(data).columns

        selected = provenance(labels)

        assert selected == 'http://dbpedia.org//b'
    def test3_no_match(self):
        """Check that provenance raises when no label matches the regex.

        None of the three labels contains 'bananaboat', so a RuntimeError
        carrying the "No column satisfies the regex." message is expected.
        """
        data = {
            'www.test_spass.de/a': ['haus', 'usa', 'frauenkirche', 'aua'],
            'http://dbpedia.org//b': ['chicoree', 'usa', 'mantra', 'mantel'],
            'www.test_ernst.de/c': ['haus', True, 'mantra', 'sonnengruss']
        }
        labels = pd.DataFrame(data).columns

        with pytest.raises(RuntimeError) as raised:
            provenance(labels, regex='bananaboat')

        message = str(raised.value)
        assert "No column satisfies the regex." in message
    def test4_multiple_matches(self):
        """Check that provenance raises when several labels match the regex.

        Two of the three labels contain 'bananaboat', so a RuntimeError
        carrying the "More than one of the matches" message is expected.
        """
        data = {
            'www.bananaboat.de/a': ['haus', 'usa', 'frauenkirche', 'aua'],
            'http://dbpedia.org//b': ['chicoree', 'usa', 'mantra', 'mantel'],
            'www.test_ernst.bananaboat/c':
            ['haus', True, 'mantra', 'sonnengruss']
        }
        labels = pd.DataFrame(data).columns

        with pytest.raises(RuntimeError) as raised:
            provenance(labels, regex='bananaboat')

        message = str(raised.value)
        assert "More than one of the matches" in message
Exemplo n.º 4
0
def _detect_cluster_type(cluster_in_df):
    """Classify the values of a cluster as "boolean", "numeric" or "string".

    The checks are tried in that order and the first one that holds wins:

    * boolean: every column contains at least one ``bool`` value
    * numeric: every single value is an ``int`` or a ``float``
    * string: every column contains at least one ``str`` value

    Args:
        cluster_in_df (pd.DataFrame): The columns belonging to one cluster.

    Returns:
        str or None: The detected type name, or None if no check holds.
    """
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides.
    if hasattr(type(cluster_in_df), "map"):
        elementwise = cluster_in_df.map
    else:
        elementwise = cluster_in_df.applymap

    # Exact type comparison on purpose: bool is a subclass of int, so
    # isinstance() would classify boolean values as numeric.
    if elementwise(lambda x: type(x) is bool).any().all():
        return "boolean"
    if elementwise(lambda x: type(x) in (int, float)).all().all():
        return "numeric"
    if elementwise(lambda x: type(x) is str).any().all():
        return "string"
    return None


def data_fuser(df,
               clusters,
               boolean_method_single="provenance",
               boolean_method_multiple="voting",
               numeric_method_single="average",
               numeric_method_multiple="average",
               string_method_single="longest",
               string_method_multiple="longest",
               provenance_regex="http://dbpedia.org/",
               progress=True):
    """Fuses the columns in the "match" sets of the clusters. Determines type
    and size and automatically detects which of the functions to use. If a
    fusion match is a pair, the "single" function is used, otherwise the
    "multiple" function.
    Available functions are first, last, longest, shortest, random.choice,
    voting and provenance. Other existing and user-defined functions can be
    passed as well, they should be applicable to pd.DataFrame.apply(axis=1).

    A cluster is skipped with a warning (and its columns are left untouched)
    if its value type cannot be detected or if no fusion function is
    available for its type and size.

    Args:
        df (pd.DataFrame): The DataFrame where schema matches are to be fused
        clusters (list): contains the clusters with the matching column names
            as sets
        boolean_method_single (str, optional): Method for single matches with
            boolean type. Defaults to "provenance".
        boolean_method_multiple (str, optional): Method for multiple matches
            with boolean type. Defaults to "voting".
        numeric_method_single (str, optional): Method for single matches with
            numeric type. Defaults to "average".
        numeric_method_multiple (str, optional): Method for multiple matches
            with numeric type. Defaults to "average".
        string_method_single (str, optional): Method for single matches with
            string type. Defaults to "longest".
        string_method_multiple (str, optional): Method for multiple matches
            with string type. Defaults to "longest".
        provenance_regex (str, optional): Pattern after which provenance is
            selected. Defaults to "http://dbpedia.org/".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.

    Returns:
        pd.DataFrame: DataFrame with fused columns.

    Raises:
        AssertionError: If "voting" (or, for numeric columns, "median") is
            requested as a method for single matches.
    """

    df = df.copy()

    # assert that no wrong fusion methods are chosen for specific types and
    # group sizes
    assert boolean_method_single != "voting",\
         "Voting will not work for single matches"
    assert numeric_method_single not in [
        "voting", "median"], \
            "Voting and median will not work for single matches"
    assert string_method_single != "voting",\
         "Voting will not work for single matches"

    function_lookup = fusion_function_lookup(
        boolean_method_single, boolean_method_multiple, numeric_method_single,
        numeric_method_multiple, string_method_single, string_method_multiple)

    # reattach full column prefixes to column names in cluster; non-string
    # column labels (e.g. NaN) can never carry a link and are ignored
    cat_cols = [
        col for col in df.columns if isinstance(col, str) and "http:" in col
    ]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]
    cat_col_lookup_stripped_to_full = dict(zip(cat_cols_stripped, cat_cols))
    cat_col_lookup_full_to_stripped = dict(zip(cat_cols, cat_cols_stripped))

    # build new, sorted lists so the caller's clusters are never modified
    clusters = [
        sorted(cat_col_lookup_stripped_to_full[link] for link in cluster)
        for cluster in clusters
    ]

    # fuse every cluster iteratively; passing the list itself (instead of an
    # enumerate object) lets tqdm show the total number of clusters
    if progress:
        iterator = tqdm(clusters, desc="Data Fuser - Fusing.")
    else:
        iterator = clusters

    for cluster in iterator:
        cluster_in_df = df.loc[:, cluster]

        # detect type of columns to merge and use the appropriate function
        type_ = _detect_cluster_type(cluster_in_df)
        if type_ is None:
            warnings.warn(
                "The type of the cluster {cluster} could not be determined, the cluster cannot be fused"
                .format(cluster=cluster))
            continue

        # detect if single match or multiple matches
        size = "_single" if len(cluster) == 2 else "_multiple"

        # look up function to use for fusion
        method = type_ + size
        function = function_lookup[method]

        if not function:
            warnings.warn(
                "No correct function for {method} was specified, the cluster {cluster} cannot be fused"
                .format(method=method, cluster=cluster))
            continue

        # create newly fused column and drop the old ones

        if function == provenance:
            # special case because it refers to provenance in column names not
            # to single values
            column_to_keep = provenance(cluster_in_df.columns,
                                        provenance_regex)
            df = df.rename(columns={column_to_keep: "fused_" + column_to_keep})
            columns_to_drop = [
                col for col in cluster_in_df.columns if col != column_to_keep
            ]
            df = df.drop(columns_to_drop, axis=1)

        else:
            # use names without prefixes to generate name of fused column by
            # combining them
            suffix = "_".join(
                cat_col_lookup_full_to_stripped[name] for name in cluster)
            fused_name = "fused_" + suffix
            df[fused_name] = cluster_in_df.apply(function, axis=1)
            df = df.drop(cluster_in_df.columns, axis=1)

    return df