def test2_timeout_nocache(self):
    """A 1-second timeout with no retries should make the wrapper print
    'timed out' instead of raising."""
    dbpedia = RemoteEndpoint("http://dbpedia.org/sparql/", timeout=1,
                             retries=0)
    query = "SELECT ?label ?uri WHERE { ?uri rdfs:label ?label . filter (str(?label) =\"test\")}"
    captured_output = io.StringIO()
    old_stdout = sys.stdout
    sys.stdout = captured_output
    try:
        endpoint_wrapper(query, dbpedia, caching=False)
    finally:
        # Bug fix: sys.stdout was previously left redirected to the
        # StringIO buffer, silently swallowing all output of every test
        # that ran afterwards.
        sys.stdout = old_stdout
    assert captured_output.getvalue() == 'timed out\n'
def test6_prefix_lookup_true(self):
    """With prefix_lookup=True the unknown dbp: prefix is resolved online."""
    sparql = "SELECT DISTINCT ?name WHERE {<http://dbpedia.org/resource/Bavaria> dbp:name ?name }"
    wanted = pd.DataFrame({"name": ["Free State of Bavaria"]})
    frame = endpoint_wrapper(sparql, DBpedia, prefix_lookup=True)
    pd.testing.assert_frame_equal(frame, wanted, check_like=True)
def test4_initial_offset_nocache(self):
    """A query with its own LIMIT/OFFSET must be respected when paging
    with page_size=1, without using the cache."""
    dbpedia = RemoteEndpoint("http://dbpedia.org/sparql/", page_size=1)
    query = "SELECT DISTINCT ?uri WHERE { ?uri rdfs:label ?label . filter(?label =\"Bayern\"@en)} LIMIT 1 OFFSET 2"
    expected_result = pd.DataFrame(
        {"uri": ["http://www.wikidata.org/entity/Q4874432"]})
    # Bug fix: the test is named "nocache" but previously passed
    # caching=True; disable caching so it matches its stated intent.
    result = endpoint_wrapper(query, dbpedia, caching=False)
    pd.testing.assert_frame_equal(result, expected_result, check_like=True)
def test7_prefix_lookup_json(self):
    """Prefixes supplied via a JSON file are injected into the query."""
    sparql = "SELECT DISTINCT ?homepage WHERE {<http://dbpedia.org/resource/Michael_Wendler> der-wendler:homepage ?homepage }"
    wanted = pd.DataFrame({"homepage": ["http://www.michaelwendler.de/"]})
    frame = endpoint_wrapper(
        sparql,
        DBpedia,
        prefix_lookup="test/data/sparql_helper/prefixes_test7.json")
    pd.testing.assert_frame_equal(frame, wanted, check_like=True)
def test8_prefix_lookup_dict(self):
    """Prefixes supplied as a dict are injected into the query."""
    namespaces = {"someprefix": "http://www.w3.org/2000/01/rdf-schema#"}
    sparql = "SELECT DISTINCT ?we_need WHERE {<http://dbpedia.org/resource/Beer> someprefix:label ?we_need}"
    wanted = pd.DataFrame({
        "we_need": [
            'Bier', 'ビール', 'Beer', 'جعة', 'Cerveza', 'Bière', 'Birra',
            'Bier', 'Piwo', 'Cerveja', 'Пиво', '啤酒'
        ]
    })
    frame = endpoint_wrapper(sparql, DBpedia, prefix_lookup=namespaces)
    pd.testing.assert_frame_equal(frame, wanted, check_like=True)
def test1_pagesize(self):
    """Paging with page_size=1 must still collect the complete result set."""
    paged_endpoint = RemoteEndpoint("http://dbpedia.org/sparql/", page_size=1)
    sparql = "SELECT DISTINCT ?uri WHERE { ?uri rdfs:label ?label . filter(?label =\"Bayern\"@en)}"
    wanted = pd.DataFrame({
        "uri": [
            "http://dbpedia.org/resource/Bayern",
            "http://www.wikidata.org/entity/Q255654",
            "http://www.wikidata.org/entity/Q4874432",
            "http://www.wikidata.org/entity/Q18148056"
        ],
    })
    frame = endpoint_wrapper(sparql, paged_endpoint)
    pd.testing.assert_frame_equal(frame, wanted, check_like=True)
def relational_matching(df,
                        endpoints=None,
                        uri_data_model=False,
                        match_score=1,
                        progress=True,
                        caching=True):
    """Creates a mapping of matching attributes in the schema by checking for
    owl:sameAs, owl:equivalentClass, owl:Equivalent and wdt:P1628 links
    between them.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed
            to be found.
        endpoints (list, optional): SPARQL Endpoint(s) to be queried.
            Defaults to None, which is interpreted as [DBpedia, WikiData].
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        match_score (int, optional): Score of the match:
            0 < match_score <= 1. Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process
            (if "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the score, which is always one in case of the relational matching
        unless specified otherwise.
    """
    # Bug fix: the previous signature used the mutable default
    # endpoints=[DBpedia, WikiData]; a None sentinel avoids the shared
    # mutable-default pitfall and is backward compatible.
    if endpoints is None:
        endpoints = [DBpedia, WikiData]

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "value"])

    # determine attribute columns (those that embed an http URI)
    cat_cols = [col for col in df.columns if re.findall("http:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]
    if not cat_cols:
        return matches

    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # formulate query; each attribute URI is substituted for **URI**
        query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
        query += "SELECT ?value ?object WHERE {VALUES (?value) { (<**URI**>)}"
        query += (" ?value (owl:equivalentProperty|owl:equivalentClass|"
                  "owl:sameAs|wdt:P1628) ?object. }")

        temp_df = pd.DataFrame(cat_cols_stripped, columns=["values"])
        same_cats = uri_querier(temp_df,
                                "values",
                                query,
                                caching=caching,
                                progress=progress)
        if same_cats.empty:
            return matches
        # drop trivial self-links
        same_cats = same_cats.drop(
            same_cats[same_cats["value"] == same_cats["object"]].index)
    else:
        if not isinstance(endpoints, list):
            endpoints = [endpoints]
        same_cats = pd.DataFrame(columns=["value", "object"])
        for endpoint in endpoints:
            # formulate query with a single bundled VALUES clause
            query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
            query += "SELECT ?value ?object WHERE {VALUES (?value) {"
            query += values
            query += ("} ?value (owl:equivalentProperty|owl:equivalentClass|"
                      "owl:sameAs|wdt:P1628) ?object. }")
            # query the equivalent classes/properties
            query_result = endpoint_wrapper(query, endpoint, caching=caching)
            if not query_result.empty:
                query_result = query_result.drop_duplicates().reset_index(
                    drop=True)
            # DataFrame.append was removed in pandas 2.0 -> pd.concat
            same_cats = pd.concat([same_cats, query_result],
                                  ignore_index=True)

    if same_cats.empty:
        return matches

    combinations = list(itertools.combinations(cat_cols_stripped, 2))
    combinations_sorted = pd.DataFrame([sorted(x) for x in combinations],
                                       columns=["uri_1", "uri_2"])

    # detect matches in the attributes; collect rows first and build the
    # frame once (row-wise DataFrame.append was removed in pandas 2.0)
    new_matches = []
    for _, row in same_cats.iterrows():
        if row["object"] in cat_cols_stripped:
            # insert each match in alphabetical order
            new_matches.append({
                "uri_1": min(row["value"], row["object"]),
                "uri_2": max(row["value"], row["object"]),
                "value": match_score
            })
    if new_matches:
        matches = pd.concat([matches, pd.DataFrame(new_matches)],
                            ignore_index=True)
    matches = matches.drop_duplicates()

    # pairs without a detected link get score 0
    full_matches = combinations_sorted.merge(matches,
                                             on=["uri_1", "uri_2"],
                                             how="outer")
    full_matches["value"] = np.where(full_matches["value"].isna(), 0,
                                     full_matches["value"])
    return full_matches
def custom_sparql_generator(df, link_attribute, query, endpoint=DBpedia, progress=True, attribute_generation_strategy="first", prefix_lookup=False, caching=True): """This generator issues a custom SPARQL query and creates additional attributes from the query results. Args: df (pd.DataFrame): Dataframe to which links are added link_attribute (str): Name of column containing the link to the knowledge graph. query (str): Custom SPARQL query which returns attributes to be appended. endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored when "uri_data_model" = True. Defaults to DBpedia. progress (bool, optional): If True, progress bars will be shown to inform the user about the progress made by the process. Defaults to True. prefix_lookup (bool/str/dict, optional): True: Namespaces of prefixes will be looked up at prefix.cc and added to the sparql query. str: User provides the path to a json-file with prefixes and namespaces. dict: User provides a dictionary with prefixes and namespaces. Defaults to False. caching (bool, optional): Turn result-caching for queries issued during the execution on or off. Defaults to True. Returns: pd.DataFrame: Dataframe with new columns containing the query results. 
""" # TODO: Add attribute generation strategy to Docstring variable = re.search(r"\*.*\*", query).group().replace("*", "") var_index = df.columns.get_loc(variable) df_result = pd.DataFrame() if progress: iterator = tqdm(df.iterrows(), total=df.shape[0], desc="Row") else: iterator = df.iterrows() for row in iterator: query_temp = re.sub(r"\*.*\*", "<" + str(row[1].iloc[var_index]) + ">", query) df_temp = pd.DataFrame([row[1].iloc[var_index]], columns=["link_attribute"]) df_temp = pd.concat([ df_temp, endpoint_wrapper(query_temp, endpoint, caching=caching).head(1) ], axis=1) df_result = pd.concat([df_result, df_temp], ignore_index=True, sort=True) df = pd.merge(df, df_result.drop_duplicates(), left_on=link_attribute, right_on="link_attribute", how="left") df.drop("link_attribute", axis=1, inplace=True) return df
def specific_relation_generator(
        df, columns, endpoint=DBpedia, uri_data_model=False, progress=True,
        direct_relation="http://purl.org/dc/terms/subject",
        hierarchy_relation=None, max_hierarchy_depth=1, prefix_lookup=False,
        caching=True):
    """Creates attributes from a specific direct relation. Additionally, it
    is possible to append a hierarchy with a user-defined hierarchy relation.

    Args:
        df (pd.DataFrame): the dataframe to extend
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        direct_relation (str, optional): Direct relation used to create
            features. Defaults to "http://purl.org/dc/terms/subject".
        hierarchy_relation (str, optional): Hierarchy relation used to
            connect categories, e.g.
            http://www.w3.org/2004/02/skos/core#broader. Defaults to None.
        max_hierarchy_depth (int, optional): Maximal number of hierarchy
            steps taken. Defaults to 1.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at
            prefix.cc and added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: The dataframe with additional features.
    """
    df = df.copy()
    if hierarchy_relation:
        # normalise an https:// relation URI to http://
        hierarchy_relation = re.sub(r"^.*?https://", "http://",
                                    hierarchy_relation)
        hierarchy = nx.DiGraph()
    direct_relation = re.sub(r"^.*?https://", "http://", direct_relation)
    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]
    # nothing to do if every link cell is NaN
    if df[columns].isna().all().item():
        return df
    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns
    for col in iterator:
        if not uri_data_model:
            # Create Sparql Query (all URIs bundled into one VALUES clause)
            values = "(<" + df[col].str.cat(sep=">) (<") + ">) "
            query = "SELECT ?value ?object "
            query += " WHERE {VALUES (?value) {" + values
            query += "} ?value (<" + direct_relation + ">) ?object. }"
            # Retrieve query results from endpoint
            query_result = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).\
                drop_duplicates().reset_index(drop=True)
        else:
            # Create URI Query (the **URI** placeholder is substituted
            # per row by uri_querier)
            query = "SELECT ?value ?object WHERE {VALUES (?value) {(<**URI**>)}"
            query += " ?value (<" + direct_relation + ">) ?object. }"
            query_result = uri_querier(df,
                                       col,
                                       query,
                                       prefix_lookup=prefix_lookup,
                                       progress=progress,
                                       caching=caching)
        # delete empty columns (for example when hierarchy relation returns
        # nothing)
        query_result = query_result.dropna(how="all", axis=1)
        # check if there are valid results, if not return the original frame
        if query_result.empty:
            continue
        # extract hierarchy
        if hierarchy_relation:
            hierarchy_col = hierarchy_graph_generator(
                query_result["object"],
                hierarchy_relation=hierarchy_relation,
                max_hierarchy_depth=max_hierarchy_depth,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchy = nx.compose(hierarchy, hierarchy_col)
        # one list of related objects per looked-up URI
        query_grouped = query_result.groupby("value")["object"].apply(list)
        # bundle the unique new features
        new_cols = pd.Series(query_grouped.values.sum()).unique()
        # create shape of result dataframe to fill
        df_to_append = pd.DataFrame(columns=new_cols)
        df_to_append["value"] = query_grouped.index
        # check for each URI if it belongs to the category and tick True/False
        for row, new_col in itertools.product(df_to_append.index, new_cols):
            df_to_append.loc[row, new_col] = np.where(
                new_col in query_grouped[df_to_append.loc[row, "value"]],
                True, False).item()
        # merge the new column with the original dataframe
        df_to_append.rename({"value": col}, axis=1, inplace=True)
        df = pd.merge(df, df_to_append, how="left", on=col)
        # rename columns
        if new_cols.any():
            df.columns = [
                col + "_in_boolean_" + name if name in new_cols else name
                for name in df.columns
            ]
    # append hierarchy to df as attribute, this will generate a warning but
    # works
    if hierarchy_relation:
        df.attrs = {"hierarchy": hierarchy}
    return df
def qualified_relation_generator(df,
                                 columns,
                                 endpoint=DBpedia,
                                 uri_data_model=False,
                                 progress=True,
                                 prefix="Link",
                                 direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean",
                                 hierarchy=False,
                                 prefix_lookup=False,
                                 caching=True):
    """Qualified relation generator considers not only relations, but also
    the related types, adding boolean, counts, relative counts or
    tfidf-values features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        prefix (str, optional): Custom prefix for the SPARQL query.
            Defauls to "Link".
        direction (str, optional): The direction for properties which choose
            from Incoming, Outgoing (In and Out). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering
            types. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf") Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at
            prefix.cc and added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph
    """
    df = df.copy()
    if hierarchy:
        hierarchyGraph = nx.DiGraph()
    #convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]
    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns
    for col in iterator:
        if not uri_data_model:
            # bundle all URIs of the column into one VALUES clause
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "
            if properties_regex_filter != None:
                regex_string = regex_string_generator("?p",
                                                      properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter != None:
                regex_string = regex_string_generator("?type",
                                                      types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            # per-URI variant; the **URI** placeholder is substituted by
            # uri_querier
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "
            if properties_regex_filter != None:
                regex_string = regex_string_generator("str(?p)",
                                                      properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter != None:
                regex_string = regex_string_generator("str(?type)",
                                                      types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)
        # NOTE(review): this check is a no-op ("pass" falls through to the
        # next statement either way) — presumably meant to skip non-frame
        # results; verify intent.
        if type(result_df) != type(pd.DataFrame()):
            pass
        if result_df.empty:
            pass
        else:
            if hierarchy:
                # collect superclass hierarchy of all returned types
                hierarchy_col = hierarchy_graph_generator(
                    result_df["type"],
                    hierarchy_relation=
                    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                    max_hierarchy_depth=None,
                    endpoint=endpoint,
                    uri_data_model=uri_data_model,
                    progress=progress,
                    caching=caching)
                hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)
            # combine property and type into one qualified feature name
            result_df[
                "link_with_type"] = result_df["p"] + "_type_" + result_df["type"]
            result_df = result_df[["value", "link_with_type"]]
            # dummy-encode (one column per property/type combination)
            result_df_dummies = result_df.join(
                result_df["link_with_type"].str.get_dummies()).drop(
                    "link_with_type", axis=1)
            # convert dummies into the requested result_type and merge back
            result_df = get_result_df(
                result_df_dummies, result_type,
                prefix + "_" + direction + "_" + result_type + "_", df,
                columns)
    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning
        # but works
        result_df.attrs = {"hierarchy": hierarchyGraph}
    return result_df
def unqualified_relation_generator(df,
                                   columns,
                                   endpoint=DBpedia,
                                   uri_data_model=False,
                                   progress=True,
                                   prefix="Link",
                                   direction="Out",
                                   regex_filter=None,
                                   result_type="boolean",
                                   prefix_lookup=False,
                                   caching=True):
    """Unqualified relation generator creates attributes from the existence
    of relations and adds boolean, counts, relative counts or tfidf-values
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        prefix (str, optional): Custom prefix for the SPARQL query.
            Defauls to "Link".
        direction (str, optional): The direction for properties which choose
            from Incoming, Outgoing (In and Out). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf") Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at
            prefix.cc and added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph
    """
    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns
    for col in iterator:
        if not uri_data_model:
            # bundle all URIs of the column into one VALUES clause
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + values + "} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + values + "} ?s ?p ?value "
            # idiom fix: "is not None" instead of "!= None"
            if regex_filter is not None:
                regex_string = regex_string_generator("?p", regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            # per-URI variant; **URI** is substituted by uri_querier
            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "
            if regex_filter is not None:
                regex_string = regex_string_generator("str(?p)", regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)
        # Removed a dead "if type(result_df) != type(pd.DataFrame()): pass"
        # check that had no effect.
        if not result_df.empty:
            # dummy-encode (one column per property)
            result_df_dummies = result_df.join(
                result_df["p"].str.get_dummies()).drop("p", axis=1)
            # convert dummies into the requested result_type and merge back
            result_df = get_result_df(
                result_df_dummies, result_type,
                prefix + "_" + direction + "_" + result_type + "_", df,
                columns)
    return result_df
def sameas_linker(df,
                  column,
                  new_attribute_name="new_link",
                  progress=True,
                  endpoint=DBpedia,
                  result_filter=None,
                  uri_data_model=False,
                  bundled_mode=True,
                  prefix_lookup=False,
                  caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for ressources which are connected to these URIs
    via owl:sameAs. Found ressources are added as new columns to the
    dataframe and the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process
            (if "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are
            boundled into one querie (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at
            prefix.cc and added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing
        the found ressources.
    """
    df = df.copy()
    if bundled_mode and not uri_data_model:
        # single query with all URIs bundled into one VALUES clause
        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "
        query = (" SELECT DISTINCT ?value ?sameas_uris WHERE "
                 "{VALUES (?value) {" + values +
                 "} ?value owl:sameAs ?sameas_uris . ")
        if result_filter is not None:
            query = query + "FILTER(" + regex_string_generator(
                "?sameas_uris", result_filter) + ") "
        query = query + "}"
        result_df = endpoint_wrapper(query,
                                     endpoint,
                                     prefix_lookup=prefix_lookup,
                                     caching=caching).drop_duplicates()
    else:
        result_df = pd.DataFrame()
        if uri_data_model:
            # **URI** is substituted per row by uri_querier
            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "
            if result_filter is not None:
                query = query + "FILTER(" + regex_string_generator(
                    "str(?sameas_uris)", result_filter) + ") "
            query = query + "}"
            result_df = uri_querier(df,
                                    column,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)
        else:
            # one query per URI; Series.iteritems was removed in
            # pandas 2.0 -> use items()
            if progress:
                iterator = tqdm(df[column].items(), total=df.shape[0])
            else:
                iterator = df[column].items()
            partial_results = []
            for uri in iterator:
                if pd.isna(uri[1]):
                    continue
                query = (" SELECT DISTINCT ?value ?sameas_uris WHERE "
                         "{?value owl:sameAs ?sameas_uris. "
                         "FILTER (?value = <" + uri[1] + ">")
                if result_filter is not None:
                    query = query + " && (" + regex_string_generator(
                        "?sameas_uris", result_filter) + ")"
                query = query + ") }"
                # DataFrame.append was removed in pandas 2.0 -> collect
                # partial frames and concat once
                partial_results.append(
                    endpoint_wrapper(query,
                                     endpoint,
                                     prefix_lookup=prefix_lookup,
                                     caching=caching))
            if partial_results:
                result_df = pd.concat(partial_results)
    result_df = result_df.rename(
        {"callret-0": "value"},
        axis="columns").drop_duplicates().reset_index(drop=True)
    if result_df.empty:
        df[new_attribute_name + "_1"] = np.nan
        return df
    # spread the found URIs over numbered columns, one per hit
    result_df_grouped = result_df.groupby("value")["sameas_uris"].apply(
        lambda x: pd.Series(x.values)).unstack()
    result_df_grouped = result_df_grouped.rename(
        columns={
            i: new_attribute_name + "_{}".format(i + 1)
            for i in range(result_df_grouped.shape[1])
        })
    df = pd.merge(df,
                  result_df_grouped,
                  left_on=column,
                  right_on="value",
                  how="outer")
    return df
def test5_wrong_endpointtype(self):
    """Passing a plain string instead of an Endpoint must raise TypeError."""
    not_an_endpoint = "http://dbpedia.org/sparql/"
    with pytest.raises(TypeError):
        endpoint_wrapper("test_query", not_an_endpoint)
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method by checking for attribute -- rdfs:label
    between links.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are
            supposed to be found.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried.
            Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefices of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from
            queried strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurences after
            which a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process
            (if "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the overlapped label.
    """
    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for the label (per-URI; **URI** substituted by
        # uri_querier)
        query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                             0,
                             query,
                             progress=progress,
                             caching=caching).drop_duplicates().set_index(
                                 "value")
    else:
        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        # query the equivalent classes/properties
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation
    ))

    # Create a dictionary: attribute URI -> list of cleaned labels
    if labels.index.name == "value":
        labels.reset_index(inplace=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # check if there are no matches: if every label is unique there can be
    # no overlap, so every pair scores 0
    tmp = set()
    for v in labels_dict.values():
        tmp.update(v)
    if len(labels_dict) == len(tmp):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]
        matches = pd.DataFrame(combinations_sorted,
                               columns=["uri_1", "uri_2"])
        matches["same_label"] = 0
        return matches
    else:
        # Combine the uris that have the same labels into a DataFrame
        new_labels_dict = collections.defaultdict(list)
        for key, values in labels_dict.items():
            for i in values:
                new_labels_dict[i].append(key)
        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        #df_labels["same_label"] = pd.DataFrame(list(new_labels_dict.keys()))
        df_labels.dropna(inplace=True)

        # restrict the order of uris in one row (alphabetical)
        # NOTE(review): DataFrame.append was removed in pandas 2.0; this
        # requires pandas < 2.0 or a migration to pd.concat.
        for _, row in df_labels.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 1
            }
            matches = matches.append(new_match, ignore_index=True)

        # Get back the uris that are not quired by rdfs:label and turn df
        # into dict
        no_label = pd.DataFrame({
            "value": [
                x for x in cat_cols_stripped
                if x not in list(labels["value"])
            ],
            "o": np.nan
        })
        labels = labels.append(no_label, ignore_index=True)
        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations from the URIs, order them
        # alphabetically and turn them into a DataFrame
        combinations = list(
            itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]
        result = pd.DataFrame(combinations_sorted,
                              columns=["uri_1", "uri_2"])

        # merged with the non_matched combinations and drop duplicates
        # (keep="first" keeps the score-1 rows appended above)
        for _, row in result.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 0
            }
            matches = matches.append(new_match, ignore_index=True)
        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)
        return matches
def data_properties_generator(df,
                              columns,
                              endpoint=DBpedia,
                              uri_data_model=False,
                              progress=True,
                              type_filter=None,
                              regex_filter=None,
                              bundled_mode=True,
                              prefix_lookup=False,
                              caching=True):
    """Generator that takes a dataset with a link to a knowledge graph and
    creates a new feature for each data property of the given resource.

    Args:
        df (pd.DataFrame): Dataframe to which the features will be added
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): Base string to the knowledge graph;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        type_filter (str, optional): Property datatype to be selected from
            results (e.g. xsd:string). If a specific datatype should be
            excluded a "- " needs to be prepended (e.g. - xsd:string).
            Defaults to None.
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation! . Defaults to True.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at
            prefix.cc and added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column for each property.
    """
    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Prepare Type Filter Statement (Decode Include/Exclude)
    if type_filter is not None:
        if type_filter[0:2] == "- ":
            type_filter_str = " && DATATYPE(?v) != " + type_filter[2:]
        else:
            type_filter_str = " && DATATYPE(?v) = " + type_filter

    # Create SPARQL query for each user-specified column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns
    for col in iterator:
        if bundled_mode and not uri_data_model:
            # all URIs bundled into one VALUES clause
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            query = "SELECT ?value ?p ?v WHERE {VALUES (?value) {" + \
                values + "} ?value ?p ?v FILTER(isLITERAL(?v)"
            if type_filter is not None:
                query = query + type_filter_str
            if regex_filter is not None:
                query = query + " && regex(?p, \"" + regex_filter + "\")"
            query = query + ")}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            result_df = pd.DataFrame()
            if uri_data_model:
                # **URI** is substituted per row by uri_querier
                query = "SELECT DISTINCT ?value ?p ?v WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?v FILTER(isLITERAL(?v)"
                if type_filter is not None:
                    query = query + type_filter_str
                if regex_filter is not None:
                    query = query + " && regex(?p, \"" + regex_filter + "\")"
                query = query + ")}"
                result_df = uri_querier(df,
                                        col,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)
            else:
                # one query per URI; Series.iteritems was removed in
                # pandas 2.0 -> use items()
                partial_results = []
                for uri in df[col].items():
                    if pd.isna(uri[1]):
                        continue
                    query = ("SELECT DISTINCT ?value ?p ?v WHERE "
                             "{?value ?p ?v . FILTER (?value = <" + uri[1] +
                             "> && (isLITERAL(?v))")
                    if type_filter is not None:
                        query = query + type_filter_str
                    if regex_filter is not None:
                        query = query + " && regex(?p, \"" + regex_filter + "\")"
                    query = query + ")} "
                    # DataFrame.append was removed in pandas 2.0 -> collect
                    # partial frames and concat once
                    partial_results.append(
                        endpoint_wrapper(query,
                                         endpoint,
                                         prefix_lookup=prefix_lookup,
                                         caching=caching))
                if partial_results:
                    result_df = pd.concat(partial_results)
        if result_df.empty:
            continue
        # Results are transformed to a sparse dataframe (rows: looked-up
        # uris; columns: properties) -> one new column per data property
        result_df["p"] = col + "_data_" + result_df["p"]
        # transform values into new columns; NOTE: np.random.choice picks
        # an arbitrary value when a property occurs multiple times, so
        # results are non-deterministic for multi-valued properties
        result_df = result_df.pivot_table(values="v",
                                          index="value",
                                          columns="p",
                                          aggfunc=np.random.choice)
        # append properties to dataframe
        df = pd.merge(df,
                      result_df,
                      how="left",
                      left_on=col,
                      right_on="value")
    return df
def label_linker(
        df,
        column,
        new_attribute_name="new_link",
        progress=True,
        endpoint=DBpedia,
        result_filter=None,
        language="en",
        max_hits=1,
        label_property="rdfs:label",
        prefix_lookup=False,
        caching=True):
    """Label Linker takes attributes from a column and adds a new column with
    the respective knowledge graph links based on the provided label_property
    (rdfs:label by default).

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column whose entities should be found.
        new_attribute_name (str, optional): Name of column containing the
            link to the knowledge graph. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        endpoint (Endpoint, optional): Choose SPARQL endpoint connection.
            Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        language (str, optional): Restrict search to labels with a certain
            language tag. Set to None if no restriction is needed. Defaults
            to "en".
        max_hits (int, optional): Maximal number of URI's that should be
            returned per entity. Defaults to 1.
        label_property (str, optional): Specifies the label_property that
            should be used in the query. Defaults to "rdfs:label".
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column containing the links to the
        knowledge graph.
    """

    df = df.copy()

    if progress:
        iterator = tqdm(df[column].items(), total=df.shape[0])
    else:
        iterator = df[column].items()

    # One lookup query per non-missing label; partial results are collected
    # in a list and concatenated once (avoids the deprecated and quadratic
    # DataFrame.append-in-a-loop pattern).
    partial_results = []

    for _, label in iterator:

        if pd.isnull(label):
            continue

        query = "SELECT DISTINCT ?label ?uri WHERE { ?uri "+label_property+" ?label . filter"

        # Match either language-tagged labels or the plain string form.
        if language is not None:
            query = query + "(?label =\"" + label + "\"@" + language
        else:
            query = query + "(str(?label) =\"" + label + "\""

        if result_filter is not None:
            query = query + \
                " && ("+regex_string_generator("?uri", result_filter)+")"

        query = query + ")}"

        if max_hits:
            query = query + " LIMIT " + str(max_hits)

        partial_results.append(
            endpoint_wrapper(query, endpoint, prefix_lookup=prefix_lookup,
                             caching=caching))

    if partial_results:
        result_df = pd.concat(partial_results).reset_index(drop=True)
    else:
        result_df = pd.DataFrame()

    if result_df.empty:
        # No label matched anything: still expose the first result column so
        # the output schema is predictable for callers.
        df[new_attribute_name+"_1"] = np.nan
        return df

    # Spread the up-to-max_hits URIs found per label over numbered columns
    # (new_link_1, new_link_2, ...).
    result_df_grouped = result_df.groupby("label")["uri"].apply(
        lambda x: pd.Series(x.values)).unstack()
    result_df_grouped = result_df_grouped.rename(
        columns={i: new_attribute_name+"_{}".format(i + 1)
                 for i in range(result_df_grouped.shape[1])})
    result_df_grouped = result_df_grouped.reset_index()

    # Attach the found links back onto the original dataframe via the label.
    df = pd.merge(df,
                  result_df_grouped.drop_duplicates(),
                  left_on=column,
                  right_on="label",
                  how="outer").drop("label", axis=1)

    return df
def direct_type_generator(df,
                          columns,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          progress=True,
                          prefix="",
                          regex_filter=None,
                          result_type="boolean",
                          bundled_mode=True,
                          hierarchy=False,
                          prefix_lookup=False,
                          caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph
    and queries the type(s) of the linked ressources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults
            to "".
        regex_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!. Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found types.
    """

    df = df.copy()

    # Accumulates the per-column type tables across loop passes.
    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:

        # If bundled_mode is selected all necessary queries for a column are
        # bundled into one query (using the VALUES method). -> Way faster but
        # less compatible.
        if bundled_mode and not uri_data_model:

            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values+"} ?value rdf:type ?types . "

            if regex_filter != None:
                regex_string = regex_string_generator("?types", regex_filter)
                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                # **URI** is a placeholder substituted per-row by uri_querier.
                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "

                if regex_filter != None:
                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)
                    query = query + "FILTER(" + regex_string + ") "

                query = query + "}"

                result_df = uri_querier(df,
                                        column,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:

                # Fallback: issue one query per (non-missing) URI.
                for uri in df[column].iteritems():

                    if pd.notna(uri[1]):

                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1]+">"

                        if regex_filter != None:
                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"

                        query = query + ") }"

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)
                        result_df = result_df.append(result)

                    else:
                        pass

        # Some endpoints return the projected variable as "callret-0";
        # normalise it to "value".
        result_df = result_df.rename(
            {
                "callret-0": "value"
            }, axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            # Build the superclass hierarchy for the types found in this
            # column and merge it into the overall hierarchy graph.
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:
            result_columns = []
            pass
        else:
            # Results are transformed to a sparse dataframe (rows: looked-up
            # uris; columns: types) with dummy-encoding (0/1) -> Each result
            # is one row
            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri
            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get prefix (format depends on single or multiple
            # columns)
            if len(columns) > 1:
                result_df_grouped = result_df_grouped.add_prefix("type_")
            else:
                result_df_grouped = result_df_grouped.add_prefix(column +
                                                                 "_type_")

            # Results get concatenated to the queried columns (to be used as
            # identifiers)
            result_df_merged = pd.merge(df[columns],
                                        result_df_grouped,
                                        left_on=column,
                                        right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results
            # are merged with the results of previous passes of the loop
            final_result_df = pd.concat(
                [final_result_df, result_df_merged],
                sort=False).groupby(columns, dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct
            # dtype
            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:

        # If result_type is boolean, all values greater 0 are changed to True
        # all others to False
        if result_type == "boolean":
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")

        # If result_type is "relative" or "tfidf", calculate the relative
        # counts per row
        elif result_type in ["relative", "tfidf"]:
            # Calculate the relative counts by dividing each row by its sum,
            # fillna(0) to replace missings created by division by zero (when
            # sum=0)
            final_result_df_relative = final_result_df.copy()

            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to
            # create the table of tfidf-values
            if result_type == "tfidf":

                # Calculate idf values
                N = len(final_result_df[result_columns])
                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)
                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values
                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query-results get appended to the original dataframe
        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        # NOTE(review): df.attrs is experimental in pandas and may be lost by
        # some operations — the hierarchy graph rides along as metadata.
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
def hierarchy_graph_generator(
        col,
        hierarchy_relation="http://www.w3.org/2000/01/rdf-schema#subClassOf",
        max_hierarchy_depth=None,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=False,
        caching=True):
    """Computes a hierarchy graph from an original set of features, where
    directed edges symbolise a hierarchy relation from subclass to
    superclass.

    Args:
        col (pd.Series): The classes/categories for which the hierarchy graph
            is generated.
        hierarchy_relation (str, optional): The hierarchy relation to be
            used. Defaults to
            "http://www.w3.org/2000/01/rdf-schema#subClassOf".
        max_hierarchy_depth (int, optional): Number of jumps in hierarchy. If
            None, transitive jumps are used. Defaults to None.
        endpoint (Endpoint, optional): Link to the SPARQL endpoint that
            should be queried. Defaults to DBpedia.
        uri_data_model (bool, optional): Whether to use the sparql querier or
            the uri data model. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off.

    Returns:
        nx.DirectedGraph: Graph where edges point to direct superclasses of
        nodes.
    """

    # warn if wrong configurations are used and correct them
    cond_subclass = hierarchy_relation ==\
        "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    if cond_subclass and max_hierarchy_depth:
        warnings.warn("""If you use subClass with a maximum hierarchy depth, meaningless superclasses are generated. Max_hierarchy_depth is set to None instead""")
        max_hierarchy_depth = None

    cond_broader = hierarchy_relation ==\
        "http://www.w3.org/2004/02/skos/core#broader"
    if cond_broader and max_hierarchy_depth is None:
        warnings.warn("""Transitive superclass generation does not work for categories. Max_hierarchy_depth is set to 1. For higher depths, set max_hierarchy_depth to a higher integer""")
        max_hierarchy_depth = 1

    # Initialise the graph
    DG = nx.DiGraph()

    # if column contains only missings return empty graph
    if col.isna().all():
        return DG

    current_level = col.copy()

    # in this case the query contains all future hierarchy levels and queries
    # them directly
    if max_hierarchy_depth and not uri_data_model:
        query = hierarchy_query_creator(col, hierarchy_relation,
                                        max_hierarchy_depth, uri_data_model)
        results = endpoint_wrapper(query, endpoint, return_XML=True,
                                   caching=caching)
        DG, _ = create_graph_from_raw(DG, results, max_hierarchy_depth, None,
                                      uri_data_model)

    # here the "broader" steps have to be added sequentially from level to
    # level until the max_hierarchy_depth is reached
    elif max_hierarchy_depth and uri_data_model:
        hierarchy_level = 0
        while not current_level.empty and hierarchy_level < max_hierarchy_depth:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            temp_frame = pd.DataFrame(current_level)
            results = uri_querier(temp_frame, current_level.name, query,
                                  progress=progress, caching=caching)
            # create_graph_from_raw returns the next (super-)level to expand
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)
            hierarchy_level += 1

    # iteratively loop from hierarchy level to hierarchy level until no
    # more superclasses are found --> transitive without maximum
    else:
        while not current_level.empty:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            if uri_data_model:
                temp_frame = pd.DataFrame(current_level)
                results = uri_querier(temp_frame, current_level.name, query,
                                      progress=progress, caching=caching)
            else:
                results = endpoint_wrapper(query, endpoint, return_XML=True,
                                           caching=caching)
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

    # Find cycles and break them: repeatedly remove the second edge of each
    # detected cycle until the graph is a DAG.
    while not nx.is_directed_acyclic_graph(DG):
        try:
            cycle = nx.find_cycle(DG)
            backwards_path = cycle[1]
            DG.remove_edge(*backwards_path)
        except nx.NetworkXNoCycle:
            pass

    return DG
def check_uri_redirects(
        df,
        column,
        replace=True,
        custom_name_postfix=None,
        redirection_property="http://dbpedia.org/ontology/wikiPageRedirects",
        endpoint=DBpedia,
        regex_filter="dbpedia",
        bundled_mode=True,
        uri_data_model=False,
        progress=True,
        caching=True):
    """Takes a column of URIs from a DataFrame and checks for each if it has
    a redirection set by the endpoint. If this is the case, the URI it
    redirects to is either added in a new column or replaces the original
    URI.

    Args:
        df (pd.DataFrame): Dataframe for which the URIs should be inspected.
        column (str): Name of the column that contains the URIs that should
            be checked.
        replace (bool, optional): If True: URIs that get redirected will be
            replaced with the new URI; If False: A new column, containing the
            result for each URI, is added to the DataFrame. Defaults to True.
        custom_name_postfix (str, optional): Custom postfix for the newly
            created column (in case "replace" is set to False). Defaults to
            None.
        redirection_property (str, optional): Relation/Property URI that
            signals a redirect for this endpoint. Defaults to
            "http://dbpedia.org/ontology/wikiPageRedirects".
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        regex_filter (str, optional): Just URIs matching the specified RegEx
            are checked for redirects. Defaults to "dbpedia".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!; ignored when "uri_data_model" = True.
            Defaults to True.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Raises:
        ValueError: Raised if 'custom_name_postfix' is set to "" instead of
            None.

    Returns:
        pd.DataFrame: Returns dataframe with cleaned links / a new column.
    """

    if custom_name_postfix == "":
        raise ValueError(
            "'custom_name_postfix' can't be an empty string. If you don't want to use a custom_name_postfix, please set the attribute to None"
        )

    df = df.copy()

    if bundled_mode and not uri_data_model:
        # All URIs are packed into a single VALUES clause (SPARQL 1.1).
        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

        query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {" + values + "} ?value <" + redirection_property + "> ?redirect . }"

        result_df = endpoint_wrapper(
            query, endpoint,
            caching=caching).drop_duplicates().reset_index(drop=True)

    elif uri_data_model:
        # **URI** is a placeholder substituted per-row by uri_querier.
        query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {(<**URI**>)} ?value <" + redirection_property + "> ?redirect . }"

        result_df = uri_querier(df,
                                column,
                                query,
                                regex_filter=regex_filter,
                                progress=progress,
                                caching=caching)

    else:
        # Fallback: one query per non-missing URI; results are collected in a
        # list and concatenated once (avoids the deprecated and quadratic
        # DataFrame.append-in-a-loop pattern).
        partial_results = []

        for _, uri in df[column].items():
            if pd.notna(uri):
                query = "SELECT DISTINCT ?value ?redirect WHERE {?value <" + redirection_property + "> ?redirect . FILTER (?value = <" + uri + ">) }"
                partial_results.append(
                    endpoint_wrapper(query, endpoint, caching=caching))

        if partial_results:
            result_df = pd.concat(partial_results)
        else:
            result_df = pd.DataFrame()

    # Some endpoints return the projected variable as "callret-0"; normalise
    # it to "value".
    result_df = result_df.rename({
        "callret-0": "value"
    }, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:
        # No redirects found: hand back the (copied) input unchanged.
        return df

    if custom_name_postfix is None:
        new_attribute_name = column + "_redirect"
    else:
        new_attribute_name = column + custom_name_postfix

    result_df = pd.merge(df,
                         result_df,
                         how="left",
                         left_on=column,
                         right_on="value").drop("value", axis=1).rename(
                             columns={"redirect": new_attribute_name})

    if replace:
        # Fill non-redirected rows with their original URI, then swap the new
        # column in under the original column name.
        result_df.loc[(pd.isnull(result_df[new_attribute_name])),
                      new_attribute_name] = result_df[column]
        result_df.drop(column, axis=1, inplace=True)
        result_df.rename(columns={new_attribute_name: column}, inplace=True)

    return result_df