Example #1
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

# DBpedia, endpoint_wrapper, uri_querier, regex_string_generator and the other
# helpers used below are provided by the surrounding package.

def label_linker(
    df, column, new_attribute_name="new_link", progress=True, endpoint=DBpedia,
    result_filter=None, language="en", max_hits=1, label_property="rdfs:label",
    prefix_lookup=False, caching=True):
    """Label Linker takes attributes from a column and adds a new column with
    the respective knowledge graph links based on the provided label_property
    (rdfs:label by default).

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column whose entities should be found.
        new_attribute_name (str, optional): Name of column containing the link 
            to the knowledge graph. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        endpoint (Endpoint, optional): Choose SPARQL endpoint connection. 
            Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        language (str, optional): Restrict search to labels with a certain 
            language tag. Set to None if no restriction is needed. Defaults to 
            "en".
        max_hits (int, optional): Maximum number of URIs that should be 
            returned per entity. Defaults to 1.
        label_property (str, optional): Specifies the label property that 
            should be used in the query. Defaults to "rdfs:label".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.
            
    Returns:
        pd.DataFrame: Dataframe with a new column containing the links to the
        knowledge graph.
    """

    df = df.copy()

    result_df = pd.DataFrame()

    if progress:
        iterator = tqdm(df[column].items(), total=df.shape[0])
    else:
        iterator = df[column].items()

    for _, label in iterator:

        if not pd.isnull(label):
            # Match resources whose label equals the cell value, optionally
            # restricted to a language tag and filtered by URI regexes.
            query = "SELECT DISTINCT ?label ?uri WHERE { ?uri "+label_property+" ?label . filter"

            if language is not None:
                query = query + "(?label =\"" + label + "\"@" + language
            else:
                query = query + "(str(?label) =\"" + label + "\""

            if result_filter is not None:
                query = query + \
                        " && ("+regex_string_generator("?uri",
                                                        result_filter)+")"

            query = query + ")}"

            if max_hits:
                query = query + " LIMIT " + str(max_hits)

            result = endpoint_wrapper(query, endpoint,
                                      prefix_lookup=prefix_lookup,
                                      caching=caching)
            result_df = pd.concat([result_df, result])

    result_df = result_df.reset_index(drop=True)

    if result_df.empty:

        df[new_attribute_name+"_1"] = np.nan

        return df

    else:

        result_df_grouped = result_df.groupby("label")["uri"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1) for i in range(result_df_grouped.shape[1])})
        result_df_grouped = result_df_grouped.reset_index()

        df = pd.merge(df, result_df_grouped.drop_duplicates(), left_on=column,
                      right_on="label", how="outer").drop("label", axis=1)

    return df
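
A minimal usage sketch for label_linker; the kgextension import paths below are assumptions based on the surrounding package, and the DataFrame content is illustrative:

# Hypothetical usage; "kgextension.linking" / "kgextension.endpoints" are
# assumed module paths, not confirmed by this excerpt.
import pandas as pd
from kgextension.endpoints import DBpedia
from kgextension.linking import label_linker

cities = pd.DataFrame({"city": ["Berlin", "Mannheim"]})

# Looks up each cell via rdfs:label@en on DBpedia and appends up to
# max_hits link columns named new_link_1, new_link_2, ...
cities_linked = label_linker(cities, column="city", endpoint=DBpedia,
                             max_hits=2)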
Example #2
def sameas_linker(
    df, column, new_attribute_name="new_link", progress=True, endpoint=DBpedia, 
    result_filter=None, uri_data_model=False, bundled_mode=True, 
    prefix_lookup=False, caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for ressources which are connected to these URIs via
    owl:sameAs. Found ressources are added as new columns to the dataframe and
    the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)  
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method); requires a 
            SPARQL 1.1 implementation. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found resources.
    """

    df = df.copy()

    if bundled_mode and not uri_data_model:

        values = " ( <"+df[column].str.cat(sep="> ) ( <")+"> ) "

        query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {" + \
            values+"} ?value owl:sameAs ?sameas_uris . "

        if result_filter is not None:

            query = query + \
                "FILTER("+regex_string_generator("?sameas_uris", result_filter)+") "

        query = query+"}"

        result_df = endpoint_wrapper(
            query, endpoint, prefix_lookup=prefix_lookup, caching=caching).drop_duplicates()

    else:

        result_df = pd.DataFrame()

        if uri_data_model:

            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "

            if result_filter is not None:

                query = query + \
                    "FILTER("+regex_string_generator("str(?sameas_uris)",
                                                     result_filter)+") "

            query = query+"}"

            result_df = uri_querier(
                df, column, query, prefix_lookup=prefix_lookup, progress=progress, caching=caching)

        else:

            if progress:
                iterator = tqdm(df[column].items(), total=df.shape[0])
            else:
                iterator = df[column].items()

            for _, uri in iterator:

                if pd.isna(uri):
                    continue

                query = " SELECT DISTINCT ?value ?sameas_uris WHERE {?value owl:sameAs ?sameas_uris. FILTER (?value = <"+uri+">"

                if result_filter is not None:
                    query = query + \
                        " && ("+regex_string_generator("?sameas_uris",
                                                       result_filter)+")"

                query = query+") }"

                result = endpoint_wrapper(query, endpoint,
                                          prefix_lookup=prefix_lookup,
                                          caching=caching)

                result_df = pd.concat([result_df, result])

        result_df = result_df.rename(
            {"callret-0": "value"}, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:

        df[new_attribute_name+"_1"] = np.nan

        return df

    else:

        result_df_grouped = result_df.groupby("value")

        result_df_grouped = result_df_grouped["sameas_uris"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1) for i in range(result_df_grouped.shape[1])})

        df = pd.merge(df, result_df_grouped, left_on=column,
                      right_on="value", how="outer")

        return df
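
A usage sketch for sameas_linker under the same assumption about the module path; the result_filter keeps only Wikidata URIs:

# Hypothetical usage; the import path is an assumption.
import pandas as pd
from kgextension.linking import sameas_linker

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# One bundled VALUES query (SPARQL 1.1) fetches all owl:sameAs resources;
# the regex filter restricts results to wikidata.org URIs.
df_sameas = sameas_linker(df, column="uri", result_filter=["wikidata.org"])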
Example #3
def unqualified_relation_generator(df,
                                   columns,
                                   endpoint=DBpedia,
                                   uri_data_model=False,
                                   progress=True,
                                   prefix="Link",
                                   direction="Out",
                                   regex_filter=None,
                                   result_type="boolean",
                                   prefix_lookup=False,
                                   caching=True):
    """Unqualified relation generator creates attributes from the existence of 
    relations and adds boolean, counts, relative counts or tfidf-values features
    for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "Link".
        direction (str, optional): Direction of the properties to consider: 
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering 
            properties. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf values ("tfidf"). Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of 
        properties to the knowledge graph.
    """

    df = df.copy()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + values + "} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + values + "} ?s ?p ?value "

            if regex_filter is not None:

                regex_string = regex_string_generator("?p", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "

            if regex_filter is not None:

                regex_string = regex_string_generator("str(?p)", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    # Only post-process when the queries returned a non-empty DataFrame.
    if isinstance(result_df, pd.DataFrame) and not result_df.empty:

        result_df_dummies = result_df.join(
            result_df["p"].str.get_dummies()).drop("p", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    return result_df
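
A usage sketch for unqualified_relation_generator (import path assumed):

# Hypothetical usage; the import path is an assumption.
import pandas as pd
from kgextension.generator import unqualified_relation_generator

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# Adds one boolean column per outgoing property, named
# Link_Out_boolean_<property URI>.
df_rel = unqualified_relation_generator(df, columns="uri",
                                        direction="Out",
                                        result_type="boolean")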
Example #4
def qualified_relation_generator(df,
                                 columns,
                                 endpoint=DBpedia,
                                 uri_data_model=False,
                                 progress=True,
                                 prefix="Link",
                                 direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean",
                                 hierarchy=False,
                                 prefix_lookup=False,
                                 caching=True):
    """Qualified relation generator considers not only relations, but also the 
    related types, adding boolean, counts, relative counts or tfidf-values 
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "Link".
        direction (str, optional): Direction of the properties to consider: 
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for 
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering 
            types. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf values ("tfidf"). Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of 
        properties to the knowledge graph.
    """

    df = df.copy()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter is not None:

                regex_string = regex_string_generator("?p",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter is not None:

                regex_string = regex_string_generator("?type",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter is not None:

                regex_string = regex_string_generator("str(?p)",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter is not None:

                regex_string = regex_string_generator("str(?type)",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    # Only post-process when the queries returned a non-empty DataFrame.
    if isinstance(result_df, pd.DataFrame) and not result_df.empty:
        if hierarchy:

            hierarchy_col = hierarchy_graph_generator(
                result_df["type"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        result_df[
            "link_with_type"] = result_df["p"] + "_type_" + result_df["type"]

        result_df = result_df[["value", "link_with_type"]]

        result_df_dummies = result_df.join(
            result_df["link_with_type"].str.get_dummies()).drop(
                "link_with_type", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
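
A usage sketch for qualified_relation_generator (import path assumed):

# Hypothetical usage; the import path is an assumption.
import pandas as pd
from kgextension.generator import qualified_relation_generator

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# Counts each (property, object type) combination per row; with
# hierarchy=True the superclass graph is attached as
# result.attrs["hierarchy"].
df_qual = qualified_relation_generator(df, columns="uri", direction="Out",
                                       result_type="counts")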
Example #5
def direct_type_generator(df,
                          columns,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          progress=True,
                          prefix="",
                          regex_filter=None,
                          result_type="boolean",
                          bundled_mode=True,
                          hierarchy=False,
                          prefix_lookup=False,
                          caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph and
    queries the type(s) of the linked ressources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults 
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "".
        regex_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method); requires a 
            SPARQL 1.1 implementation. Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the 
        found types.
    """

    df = df.copy()

    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column

    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:

        # If bundled_mode is selected, all necessary queries for a column are
        # bundled into one query (using the VALUES method). -> Much faster,
        # but less compatible.

        if bundled_mode and not uri_data_model:

            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values+"} ?value rdf:type ?types . "

            if regex_filter is not None:

                regex_string = regex_string_generator("?types", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "

                if regex_filter is not None:

                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)

                    query = query + "FILTER(" + regex_string + ") "

                query = query + "}"

                result_df = uri_querier(df,
                                        column,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:

                for _, uri in df[column].items():

                    if pd.notna(uri):

                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri + ">"

                        if regex_filter is not None:

                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"

                        query = query + ") }"

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)

                        result_df = pd.concat([result_df, result])

            result_df = result_df.rename(
                {
                    "callret-0": "value"
                }, axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:

            result_columns = []

        else:

            # Results are transformed to a sparse dataframe (rows: looked-up uris; columns: types) with dummy-encoding (0/1) -> Each result is one row

            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri

            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get prefix (format depends on single or multiple columns)

            if len(columns) > 1:

                result_df_grouped = result_df_grouped.add_prefix("type_")

            else:

                result_df_grouped = result_df_grouped.add_prefix(column +
                                                                 "_type_")

            # Results are merged onto the queried columns (used as identifiers)

            result_df_merged = pd.merge(df[columns],
                                        result_df_grouped,
                                        left_on=column,
                                        right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results are merged with the results of previous passes of the loop

            final_result_df = pd.concat([final_result_df, result_df_merged],
                                        sort=False).groupby(
                                            columns,
                                            dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct dtype

            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:

        # If result_type is boolean, all values greater than 0 become True, all others False

        if result_type == "boolean":

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")

        # If result_type is "relative" or "tfidf", calculate the relative counts per row

        elif result_type in ["relative", "tfidf"]:

            # Calculate the relative counts by dividing each row by its sum, fillna(0) to replace missings created by division by zero (when sum=0)
            final_result_df_relative = final_result_df.copy()

            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to create the table of tfidf-values

            if result_type == "tfidf":

                # Calculate idf values

                N = len(final_result_df[result_columns])

                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)

                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values

                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query-results get appended to the original dataframe

        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
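
A usage sketch for direct_type_generator (import path assumed):

# Hypothetical usage; the import path is an assumption.
import pandas as pd
from kgextension.generator import direct_type_generator

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# Adds one boolean column per rdf:type of each linked resource, e.g.
# uri_type_http://dbpedia.org/ontology/City.
df_types = direct_type_generator(df, columns="uri", result_type="boolean")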
Example #6
    def test5_wrongconnector(self):

        with pytest.raises(ValueError):
            regex_string_generator("?type", ["[^i*&2@]"], "NOR")
Example #7
    def test4_and(self):

        assert regex_string_generator(
            "?type", ["[2-9]|[12]\\d|3[0-6]", "^dog", "b[aeiou]bble"], "AND"
        ) == "regex(?type, \"[2-9]|[12]\\d|3[0-6]\") && regex(?type, \"^dog\") && regex(?type, \"b[aeiou]bble\")"
Example #8
    def test3_and(self):

        assert regex_string_generator("?type", ["[^i*&2@]"],
                                      "AND") == "regex(?type, \"[^i*&2@]\")"
Example #9
    def test1(self):

        assert regex_string_generator(
            "?type", ["[^i*&2@]"]) == "regex(?type, \"[^i*&2@]\")"