def label_linker(df, column, new_attribute_name="new_link", progress=True,
                 endpoint=DBpedia, result_filter=None, language="en",
                 max_hits=1, label_property="rdfs:label", prefix_lookup=False,
                 caching=True):
    """Label Linker takes attributes from a column and adds a new column with
    the respective knowledge graph links based on the provided label_property
    (rdfs:label by default).

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column whose entities should be found.
        new_attribute_name (str, optional): Name of column containing the
            link to the knowledge graph. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        endpoint (Endpoint, optional): Choose SPARQL endpoint connection.
            Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        language (str, optional): Restrict search to labels with a certain
            language tag. Set to None if no restriction is needed. Defaults to
            "en".
        max_hits (int, optional): Maximal number of URI's that should be
            returned per entity. Defaults to 1.
        label_property (str, optional): Specifies the label_property that
            should be used in the query. Defaults to "rdfs:label".
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column containing the links to the
        knowledge graph.
    """
    df = df.copy()
    result_df = pd.DataFrame()

    # iterate over the column's (index, value) pairs; Series.items() replaces
    # the deprecated Series.iteritems() (removed in pandas 2.0)
    if progress:
        iterator = tqdm(df[column].items(), total=df.shape[0])
    else:
        iterator = df[column].items()

    for col in iterator:

        if not pd.isnull(col[1]):

            # build one lookup query per non-null label
            query = "SELECT DISTINCT ?label ?uri WHERE { ?uri "+label_property+" ?label . filter"

            if language is not None:
                # restrict matches to labels with the given language tag
                query = query + "(?label =\"" + col[1] + "\"@" + language
            else:
                # no language restriction: compare on the plain string value
                query = query + "(str(?label) =\"" + col[1] + "\""

            if result_filter is not None:
                query = query + \
                    " && ("+regex_string_generator("?uri", result_filter)+")"

            query = query + ")}"

            if max_hits:
                query = query + " LIMIT " + str(max_hits)

            result = endpoint_wrapper(query, endpoint,
                                      prefix_lookup=prefix_lookup,
                                      caching=caching)

            # pd.concat replaces the removed DataFrame.append (pandas 2.0)
            result_df = pd.concat([result_df, result])

    result_df = result_df.reset_index(drop=True)

    if result_df.empty:
        # no links found at all: still add the first result column (all NaN)
        df[new_attribute_name+"_1"] = np.nan
        return df
    else:
        # pivot: one row per label, one column per found URI (up to max_hits)
        result_df_grouped = result_df.groupby("label")["uri"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1)
                     for i in range(result_df_grouped.shape[1])})
        result_df_grouped = result_df_grouped.reset_index()

        # attach the found URIs to the original dataframe via the label column
        df = pd.merge(df, result_df_grouped.drop_duplicates(), left_on=column,
                      right_on="label", how="outer").drop("label", axis=1)

    return df
def sameas_linker(df, column, new_attribute_name="new_link", progress=True,
                  endpoint=DBpedia, result_filter=None, uri_data_model=False,
                  bundled_mode=True, prefix_lookup=False, caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for ressources which are connected to these URIs
    via owl:sameAs. Found ressources are added as new columns to the
    dataframe and the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found ressources.
    """
    df = df.copy()

    if bundled_mode and not uri_data_model:
        # one bundled SPARQL 1.1 query for all URIs via the VALUES clause
        values = " ( <"+df[column].str.cat(sep="> ) ( <")+"> ) "

        query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {" + \
            values+"} ?value owl:sameAs ?sameas_uris . "

        if result_filter is not None:
            query = query + \
                "FILTER("+regex_string_generator("?sameas_uris", result_filter)+") "

        query = query+"}"

        result_df = endpoint_wrapper(
            query, endpoint, prefix_lookup=prefix_lookup,
            caching=caching).drop_duplicates()

    else:
        result_df = pd.DataFrame()

        if uri_data_model:
            # **URI** placeholder gets substituted per row by uri_querier
            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "

            if result_filter is not None:
                query = query + \
                    "FILTER("+regex_string_generator("str(?sameas_uris)", result_filter)+") "

            query = query+"}"

            result_df = uri_querier(df, column, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)
        else:
            # fallback: one endpoint query per URI; Series.items() replaces
            # the deprecated Series.iteritems() (removed in pandas 2.0)
            if progress:
                iterator = tqdm(df[column].items(), total=df.shape[0])
            else:
                iterator = df[column].items()

            for uri in iterator:
                if pd.isna(uri[1]):
                    # skip missing URIs
                    continue

                query = " SELECT DISTINCT ?value ?sameas_uris WHERE {?value owl:sameAs ?sameas_uris. FILTER (?value = <"+uri[1]+">"

                if result_filter is not None:
                    query = query + \
                        " && ("+regex_string_generator("?sameas_uris", result_filter)+")"

                query = query+") }"

                result = endpoint_wrapper(query, endpoint,
                                          prefix_lookup=prefix_lookup,
                                          caching=caching)

                # pd.concat replaces the removed DataFrame.append (pandas 2.0)
                result_df = pd.concat([result_df, result])

    # some endpoints name the first projected variable "callret-0"
    result_df = result_df.rename(
        {"callret-0": "value"},
        axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:
        # nothing found: still add the first result column (all NaN)
        df[new_attribute_name+"_1"] = np.nan
        return df
    else:
        # pivot: one row per queried URI, one column per found sameAs URI
        result_df_grouped = result_df.groupby("value")
        result_df_grouped = result_df_grouped["sameas_uris"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1)
                     for i in range(result_df_grouped.shape[1])})

        df = pd.merge(df, result_df_grouped, left_on=column,
                      right_on="value", how="outer")

    return df
def unqualified_relation_generator(df, columns, endpoint=DBpedia,
                                   uri_data_model=False, progress=True,
                                   prefix="Link", direction="Out",
                                   regex_filter=None, result_type="boolean",
                                   prefix_lookup=False, caching=True):
    """Unqualified relation generator creates attributes from the existence
    of relations and adds boolean, counts, relative counts or tfidf-values
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defauls
            to "Link".
        direction (str, optional): The direction for properties which choose
            from Incoming, Outgoing (In and Out). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf") Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph
    """
    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:
            # bundle all URIs of the column into a single VALUES clause
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + values + "} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + values + "} ?s ?p ?value "

            if regex_filter is not None:
                regex_string = regex_string_generator("?p", regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:
            # per-URI querying; **URI** is substituted by uri_querier
            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "

            if regex_filter is not None:
                regex_string = regex_string_generator("str(?p)", regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df, col, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)

        # BUGFIX: the original `if type(result_df) != type(pd.DataFrame()):
        # pass` was a no-op, so a non-DataFrame result (e.g. from
        # uri_querier) crashed on `.empty` below; skip such columns instead
        if not isinstance(result_df, pd.DataFrame):
            continue

        if not result_df.empty:
            # dummy-encode the relation column (one 0/1 column per property)
            result_df_dummies = result_df.join(
                result_df["p"].str.get_dummies()).drop("p", axis=1)

    # aggregate the dummy table into the requested result_type features
    result_df = get_result_df(
        result_df_dummies, result_type,
        prefix + "_" + direction + "_" + result_type + "_", df, columns)

    return result_df
def qualified_relation_generator(df, columns, endpoint=DBpedia,
                                 uri_data_model=False, progress=True,
                                 prefix="Link", direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean", hierarchy=False,
                                 prefix_lookup=False, caching=True):
    """Qualified relation generator considers not only relations, but also
    the related types, adding boolean, counts, relative counts or
    tfidf-values features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defauls
            to "Link".
        direction (str, optional): The direction for properties which choose
            from Incoming, Outgoing (In and Out). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering
            types. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf") Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph
    """
    df = df.copy()

    if hierarchy:
        # collects superclass edges of all found types across all columns
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:
            # bundle all URIs of the column into a single VALUES clause
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter is not None:
                regex_string = regex_string_generator(
                    "?p", properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter is not None:
                regex_string = regex_string_generator(
                    "?type", types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:
            # per-URI querying; **URI** is substituted by uri_querier
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter is not None:
                regex_string = regex_string_generator(
                    "str(?p)", properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter is not None:
                regex_string = regex_string_generator(
                    "str(?type)", types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df, col, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)

        # BUGFIX: the original `if type(result_df) != type(pd.DataFrame()):
        # pass` was a no-op, so a non-DataFrame result crashed on `.empty`
        # below; skip such columns instead
        if not isinstance(result_df, pd.DataFrame):
            continue

        if not result_df.empty:
            if hierarchy:
                hierarchy_col = hierarchy_graph_generator(
                    result_df["type"],
                    hierarchy_relation=
                    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                    max_hierarchy_depth=None, endpoint=endpoint,
                    uri_data_model=uri_data_model, progress=progress,
                    caching=caching)
                hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

            # qualify each relation with the type of the related resource
            result_df["link_with_type"] = result_df["p"] + "_type_" + result_df["type"]
            result_df = result_df[["value", "link_with_type"]]

            # dummy-encode the qualified relations (one 0/1 column each)
            result_df_dummies = result_df.join(
                result_df["link_with_type"].str.get_dummies()).drop(
                    "link_with_type", axis=1)

    # aggregate the dummy table into the requested result_type features
    result_df = get_result_df(
        result_df_dummies, result_type,
        prefix + "_" + direction + "_" + result_type + "_", df, columns)

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
def direct_type_generator(df, columns, endpoint=DBpedia,
                          uri_data_model=False, progress=True, prefix="",
                          regex_filter=None, result_type="boolean",
                          bundled_mode=True, hierarchy=False,
                          prefix_lookup=False, caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph
    and queries the type(s) of the linked ressources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL . Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process . Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults
            to "".
        regex_filter (list, optional): A list filled with regexes (as
            strings) to filter the results . Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf") . Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation! . Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found types.
    """
    df = df.copy()

    final_result_df = pd.DataFrame()

    if hierarchy:
        # collects superclass edges of all found types across all columns
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:

        # If bundled_mode is selected all necessary queries for a column are
        # bundled into one query (using the VALUES method). -> Way faster but
        # less compatible.
        if bundled_mode and not uri_data_model:

            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values+"} ?value rdf:type ?types . "

            if regex_filter is not None:
                regex_string = regex_string_generator("?types", regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:
            result_df = pd.DataFrame()

            if uri_data_model:
                # **URI** placeholder gets substituted per row by uri_querier
                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "

                if regex_filter is not None:
                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)
                    query = query + "FILTER(" + regex_string + ") "

                query = query + "}"

                result_df = uri_querier(df, column, query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress, caching=caching)
            else:
                # fallback: one endpoint query per URI; Series.items()
                # replaces the deprecated Series.iteritems() (pandas 2.0)
                for uri in df[column].items():
                    if pd.notna(uri[1]):
                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1]+">"

                        if regex_filter is not None:
                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"

                        query = query + ") }"

                        result = endpoint_wrapper(
                            query, endpoint, prefix_lookup=prefix_lookup,
                            caching=caching)

                        # pd.concat replaces the removed DataFrame.append
                        result_df = pd.concat([result_df, result])

        # some endpoints name the first projected variable "callret-0"
        result_df = result_df.rename(
            {"callret-0": "value"},
            axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None, endpoint=endpoint,
                uri_data_model=uri_data_model, progress=progress,
                caching=caching)
            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:
            result_columns = []
        else:
            # Results are transformed to a sparse dataframe (rows: looked-up
            # uris; columns: types) with dummy-encoding (0/1) -> Each result
            # is one row
            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri
            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get prefix (format depends on single or multiple
            # columns)
            if len(columns) > 1:
                result_df_grouped = result_df_grouped.add_prefix("type_")
            else:
                result_df_grouped = result_df_grouped.add_prefix(
                    column + "_type_")

            # Results get concatenated to the queried columns (to be used as
            # identifiers)
            result_df_merged = pd.merge(df[columns], result_df_grouped,
                                        left_on=column, right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results
            # are merged with the results of previous passes of the loop
            final_result_df = pd.concat(
                [final_result_df, result_df_merged], sort=False).groupby(
                    columns, dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct
            # dtype
            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:

        # If result_type is boolean, all values greater 0 are changed to
        # True all others to False
        if result_type == "boolean":
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")

        # If result_type is "relative" or "tfidf", calculate the relative
        # counts per row
        elif result_type in ["relative", "tfidf"]:

            # Calculate the relative counts by dividing each row by its sum,
            # fillna(0) to replace missings created by division by zero
            # (when sum=0)
            final_result_df_relative = final_result_df.copy()

            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to
            # create the table of tfidf-values
            if result_type == "tfidf":

                # Calculate idf values
                N = len(final_result_df[result_columns])
                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)
                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values
                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query-results get appended to the original dataframe
        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning
        # but works
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
def test5_wrongconnector(self):
    # An unsupported connector keyword must be rejected with a ValueError.
    bad_connector = "NOR"
    with pytest.raises(ValueError):
        regex_string_generator("?type", ["[^i*&2@]"], bad_connector)
def test4_and(self):
    # Several regexes joined with the "AND" connector produce a
    # "&&"-chained filter expression.
    patterns = ["[2-9]|[12]\\d|3[0-6]", "^dog", "b[aeiou]bble"]
    expected = (
        'regex(?type, "[2-9]|[12]\\d|3[0-6]")'
        ' && regex(?type, "^dog")'
        ' && regex(?type, "b[aeiou]bble")'
    )
    assert regex_string_generator("?type", patterns, "AND") == expected
def test3_and(self):
    # A single regex with the explicit "AND" connector yields one plain
    # regex() filter term.
    expected = 'regex(?type, "[^i*&2@]")'
    assert regex_string_generator("?type", ["[^i*&2@]"], "AND") == expected
def test1(self):
    # Default connector: a single regex yields one plain regex() filter term.
    expected = 'regex(?type, "[^i*&2@]")'
    assert regex_string_generator("?type", ["[^i*&2@]"]) == expected