def test3_nocaching(self):
    # Not all attributes (here "adresse") are present for all of the URIs.

    input = pd.DataFrame({
        'uris': [
            'http://dbpedia.org/resource/Berlin',
            'http://dbpedia.org/resource/Darmstadt',
            'http://dbpedia.org/resource/London',
            'http://dbpedia.org/resource/Munich'
        ]
    })

    query = "SELECT DISTINCT ?uri ?adresse ?lat WHERE {VALUES (?uri) {(<**URI**>)} ?uri dbp:adresse ?adresse. ?uri geo:lat ?lat} ORDER BY ?adresse LIMIT 2"

    result = uri_querier(input, "uris", query, caching=False)

    expected_result_df = pd.DataFrame({
        'uri': [
            'http://dbpedia.org/resource/Darmstadt',
            'http://dbpedia.org/resource/Darmstadt',
            'http://dbpedia.org/resource/Munich',
            'http://dbpedia.org/resource/Munich'
        ],
        'adresse': ['Luisenplatz 5', '64283', 'Marienplatz 8', '80331'],
        'lat': [49.8667, 49.8667, 48.1333, 48.1333]
    })

    pd.testing.assert_frame_equal(result, expected_result_df, check_like=True)
def test4_brokenuris(self):
    # Some of the URIs are broken, not dereferencable or missing entirely;
    # uri_querier should warn about each of them and still return results
    # for the valid URI.

    input = pd.DataFrame({
        'uris': [
            'https://www.dd',
            'www.google.de',
            'https://www.google.de',
            'http://dbpedia.org/resource/Munich',
            np.nan
        ]
    })

    query = "SELECT DISTINCT ?uri ?adresse ?lat WHERE {VALUES (?uri) {(<**URI**>)} ?uri dbp:adresse ?adresse. ?uri geo:lat ?lat} ORDER BY ?adresse LIMIT 2"

    with pytest.warns(UserWarning) as record:
        result = uri_querier(input, "uris", query, progress=True)

    assert len(record) == 3
    assert record[0].message.args[0] == "https://www.dd is not a valid URI."
    assert record[1].message.args[0] == "www.google.de might not be a valid URI."
    assert record[2].message.args[0] == "https://www.google.de might not be dereferencable."

    expected_result_df = pd.DataFrame({
        'uri': [
            'http://dbpedia.org/resource/Munich',
            'http://dbpedia.org/resource/Munich'
        ],
        'adresse': ['Marienplatz 8', '80331'],
        'lat': [48.1333, 48.1333]
    })

    pd.testing.assert_frame_equal(result, expected_result_df, check_like=True)
def hierarchy_graph_generator(
        col,
        hierarchy_relation="http://www.w3.org/2000/01/rdf-schema#subClassOf",
        max_hierarchy_depth=None,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=False,
        caching=True):
    """Computes a hierarchy graph from an original set of features, where
    directed edges symbolise a hierarchy relation from subclass to
    superclass.

    Args:
        col (pd.Series): The classes/categories for which the hierarchy
            graph is generated.
        hierarchy_relation (str, optional): The hierarchy relation to be
            used. Defaults to "http://www.w3.org/2000/01/rdf-schema#subClassOf".
        max_hierarchy_depth (int, optional): Number of jumps in the
            hierarchy. If None, transitive jumps are used. Defaults to None.
        endpoint (Endpoint, optional): Link to the SPARQL endpoint that
            should be queried. Defaults to DBpedia.
        uri_data_model (bool, optional): Whether to use the SPARQL querier
            or the URI data model. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        nx.DiGraph: Graph where edges point to direct superclasses of nodes.
    """
    # warn if wrong configurations are used and correct them
    cond_subclass = hierarchy_relation ==\
        "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    if cond_subclass and max_hierarchy_depth:
        warnings.warn("""If you use subClass with a maximum hierarchy depth,
        meaningless superclasses are generated.
        Max_hierarchy_depth is set to None instead.""")
        max_hierarchy_depth = None

    cond_broader = hierarchy_relation ==\
        "http://www.w3.org/2004/02/skos/core#broader"
    if cond_broader and max_hierarchy_depth is None:
        warnings.warn("""Transitive superclass generation does not work for
        categories. Max_hierarchy_depth is set to 1. For higher depths, set
        max_hierarchy_depth to a higher integer.""")
        max_hierarchy_depth = 1

    # Initialise the graph
    DG = nx.DiGraph()

    # if the column contains only missings, return an empty graph
    if col.isna().all():
        return DG
    current_level = col.copy()

    # in this case the query contains all future hierarchy levels and
    # queries them directly
    if max_hierarchy_depth and not uri_data_model:
        query = hierarchy_query_creator(col, hierarchy_relation,
                                        max_hierarchy_depth, uri_data_model)
        results = endpoint_wrapper(query, endpoint, return_XML=True,
                                   caching=caching)
        DG, _ = create_graph_from_raw(DG, results, max_hierarchy_depth,
                                      None, uri_data_model)

    # here the "broader" steps have to be added sequentially from level to
    # level until the max_hierarchy_depth is reached
    elif max_hierarchy_depth and uri_data_model:
        hierarchy_level = 0
        while not current_level.empty and hierarchy_level < max_hierarchy_depth:
            query = hierarchy_query_creator(current_level,
                                            hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            temp_frame = pd.DataFrame(current_level)
            results = uri_querier(temp_frame, current_level.name, query,
                                  progress=progress, caching=caching)
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)
            hierarchy_level += 1

    # iteratively loop from hierarchy level to hierarchy level until no
    # more superclasses are found --> transitive without maximum
    else:
        while not current_level.empty:
            query = hierarchy_query_creator(current_level,
                                            hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            if uri_data_model:
                temp_frame = pd.DataFrame(current_level)
                results = uri_querier(temp_frame, current_level.name, query,
                                      progress=progress, caching=caching)
            else:
                results = endpoint_wrapper(query, endpoint, return_XML=True,
                                           caching=caching)
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

    # Find cycles and break them by removing one backwards edge per cycle
    while not nx.is_directed_acyclic_graph(DG):
        try:
            cycle = nx.find_cycle(DG)
            backwards_path = cycle[1]
            DG.remove_edge(*backwards_path)
        except nx.NetworkXNoCycle:
            pass

    return DG
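# Hedged usage sketch for hierarchy_graph_generator (kept as a comment so the
# module has no side effects on import). The class URIs are illustrative and
# results depend on the live DBpedia endpoint:
#
#   classes = pd.Series(["http://dbpedia.org/ontology/City",
#                        "http://dbpedia.org/ontology/Town"], name="classes")
#   dg = hierarchy_graph_generator(classes)
#   list(dg.edges)  # each edge points from a class to its direct superclass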
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method that checks whether attributes share the
    same rdfs:label.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are
            supposed to be found.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried.
            Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to
            lowercase. Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from
            queried strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurrences after
            which a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the overlapped label.
    """
    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # transform the attributes into a SPARQL VALUES list
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for the label
        query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        labels = uri_querier(
            pd.DataFrame(cat_cols_stripped), 0, query, progress=progress,
            caching=caching).drop_duplicates().set_index("value")
    else:
        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        # query the endpoint for the labels
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes,
        remove_punctuation))

    # Create a dictionary
    if labels.index.name == "value":
        labels.reset_index(inplace=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # check if there are no matches
    tmp = set()
    for v in labels_dict.values():
        tmp.update(v)
    if len(labels_dict) == len(tmp):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]
        matches = pd.DataFrame(combinations_sorted,
                               columns=["uri_1", "uri_2"])
        matches["same_label"] = 0
        return matches
    else:
        # Combine the URIs that share a label into a DataFrame
        new_labels_dict = collections.defaultdict(list)
        for key, values in labels_dict.items():
            for i in values:
                new_labels_dict[i].append(key)
        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        #df_labels["same_label"] = pd.DataFrame(list(new_labels_dict.keys()))
        df_labels.dropna(inplace=True)

        # restrict the order of URIs in one row
        for _, row in df_labels.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 1
            }
            matches = matches.append(new_match, ignore_index=True)

        # Get back the URIs that are not queried by rdfs:label and turn the
        # df into a dict
        no_label = pd.DataFrame({
            "value": [
                x for x in cat_cols_stripped
                if x not in list(labels["value"])
            ],
            "o": np.nan
        })
        labels = labels.append(no_label, ignore_index=True)
        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations from the URIs, order them
        # alphabetically and turn them into a DataFrame
        combinations = list(
            itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]
        result = pd.DataFrame(combinations_sorted,
                              columns=["uri_1", "uri_2"])

        # merge with the non-matched combinations and drop duplicates
        for _, row in result.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 0
            }
            matches = matches.append(new_match, ignore_index=True)
        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)
        return matches
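# Hedged usage sketch for label_schema_matching: the matcher expects a
# dataframe whose column names embed attribute URIs (as produced by the
# generators in this package). The column names below are hypothetical:
#
#   df_links = pd.DataFrame(columns=[
#       "city_type_http://dbpedia.org/ontology/City",
#       "city_type_http://www.wikidata.org/entity/Q515"])
#   matches = label_schema_matching(df_links, uri_data_model=True)
#   # -> columns "uri_1", "uri_2", "same_label" (1 = identical rdfs:label)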
def sameas_linker(df,
                  column,
                  new_attribute_name="new_link",
                  progress=True,
                  endpoint=DBpedia,
                  result_filter=None,
                  uri_data_model=False,
                  bundled_mode=True,
                  prefix_lookup=False,
                  caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for resources which are connected to these URIs
    via owl:sameAs. Found resources are added as new columns to the
    dataframe and the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation! Defaults to True.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing
        the found resources.
    """
    df = df.copy()

    if bundled_mode and not uri_data_model:
        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "
        query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {" + \
            values + "} ?value owl:sameAs ?sameas_uris . "
        if result_filter != None:
            query = query + "FILTER(" + \
                regex_string_generator("?sameas_uris", result_filter) + ") "
        query = query + "}"
        result_df = endpoint_wrapper(
            query, endpoint, prefix_lookup=prefix_lookup,
            caching=caching).drop_duplicates()
    else:
        result_df = pd.DataFrame()
        if uri_data_model:
            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "
            if result_filter != None:
                query = query + "FILTER(" + \
                    regex_string_generator("str(?sameas_uris)",
                                           result_filter) + ") "
            query = query + "}"
            result_df = uri_querier(df, column, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)
        else:
            if progress:
                iterator = tqdm(df[column].iteritems(), total=df.shape[0])
            else:
                iterator = df[column].iteritems()
            for uri in iterator:
                if pd.isna(uri[1]):
                    pass
                else:
                    query = " SELECT DISTINCT ?value ?sameas_uris WHERE {?value owl:sameAs ?sameas_uris. FILTER (?value = <" + \
                        uri[1] + ">"
                    if result_filter != None:
                        query = query + " && (" + \
                            regex_string_generator("?sameas_uris",
                                                   result_filter) + ")"
                    query = query + ") }"
                    result = endpoint_wrapper(query, endpoint,
                                              prefix_lookup=prefix_lookup,
                                              caching=caching)
                    result_df = result_df.append(result)
        result_df = result_df.rename(
            {"callret-0": "value"},
            axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:
        df[new_attribute_name + "_1"] = np.nan
        return df
    else:
        result_df_grouped = result_df.groupby("value")
        result_df_grouped = result_df_grouped["sameas_uris"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={
                i: new_attribute_name + "_{}".format(i + 1)
                for i in range(result_df_grouped.shape[1])
            })
        df = pd.merge(df, result_df_grouped, left_on=column,
                      right_on="value", how="outer")
        return df
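# Hedged usage sketch for sameas_linker (URI and filter regex are
# illustrative; results depend on the live endpoint):
#
#   cities = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Mannheim"]})
#   linked = sameas_linker(cities, "uri", new_attribute_name="same_as",
#                          result_filter=["wikidata"])
#   # -> adds columns same_as_1, same_as_2, ... with the found links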
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity of the text fields obtained by
    querying the attributes for the given predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed
            to be found.
        predicate (str, optional): Defaults to "rdfs:label".
        to_lowercase (bool, optional): Converts queried strings to
            lowercase. Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from
            queried strings. Defaults to True.
        similarity_metric (str, optional): Norm by which strings are
            compared. Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurrences after
            which a prefix is considered "common". Defaults to 1.
        n (int, optional): Parameter for n-gram and Jaccard similarities.
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the string similarity score.
    """
    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label)
    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. FILTER (lang(?o) = 'en') }"
    labels = uri_querier(pd.DataFrame(cat_cols_stripped), 0, query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes,
        remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label)
    labels.reset_index(inplace=True)
    no_label = pd.DataFrame({
        "value":
        [x for x in cat_cols_stripped if x not in list(labels["value"])],
        "o": np.nan
    })
    labels = labels.append(no_label, ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")
    #labels_dict = labels.to_dict(orient="index")

    # Create all unique combinations from the URIs, order them
    # alphabetically and turn them into a DataFrame
    combinations = list(itertools.combinations(labels_dict.keys(), 2))
    combinations_sorted = [sorted(x) for x in combinations]
    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string
    # similarity of their results (i.e. labels)
    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        result["value_string"] = result.progress_apply(
            lambda x: calc_string_similarity(x["uri_1"], x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric, n=n),
            axis=1)
    else:
        result["value_string"] = result.apply(
            lambda x: calc_string_similarity(x["uri_1"], x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric, n=n),
            axis=1)

    return result
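# Hedged usage sketch for string_similarity_matching, reusing the
# hypothetical df_links from the label_schema_matching sketch above:
#
#   sims = string_similarity_matching(df_links,
#                                     similarity_metric="norm_levenshtein")
#   # -> columns "uri_1", "uri_2", "value_string" with a similarity score
#   #    per attribute pair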
def relational_matching(df,
                        endpoints=[DBpedia, WikiData],
                        uri_data_model=False,
                        match_score=1,
                        progress=True,
                        caching=True):
    """Creates a mapping of matching attributes in the schema by checking
    for owl:sameAs, owl:equivalentClass, owl:equivalentProperty and
    wdt:P1628 links between them.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed
            to be found.
        endpoints (list, optional): SPARQL endpoints to be queried.
            Defaults to [DBpedia, WikiData].
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        match_score (int, optional): Score of the match:
            0 < match_score <= 1. Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the score, which is always one in case of the relational
        matching unless specified otherwise.
    """
    matches = pd.DataFrame(columns=["uri_1", "uri_2", "value"])

    # determine attribute columns
    cat_cols = [col for col in df.columns if re.findall("http:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]
    if not cat_cols:
        return matches

    # transform the attributes into a SPARQL VALUES list
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # formulate the query
        query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
        query += "SELECT ?value ?object WHERE {VALUES (?value) { (<**URI**>)}"
        query += " ?value\
            (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
            ?object. }"
        temp_df = pd.DataFrame(cat_cols_stripped, columns=["values"])
        same_cats = uri_querier(temp_df, "values", query,
                                caching=caching, progress=progress)
        if same_cats.empty:
            return matches
        else:
            same_cats = same_cats.drop(
                same_cats[same_cats["value"] == same_cats["object"]].index)
    else:
        if not isinstance(endpoints, list):
            endpoints = [endpoints]
        same_cats = pd.DataFrame(columns=["value", "object"])
        for endpoint in endpoints:
            # formulate the query
            query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
            query += "SELECT ?value ?object WHERE {VALUES (?value) {"
            query += values
            query += "} ?value\
                (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
                ?object. }"
            # query the equivalent classes/properties
            query_result = endpoint_wrapper(query, endpoint,
                                            caching=caching)
            if not query_result.empty:
                query_result = query_result.drop_duplicates().\
                    reset_index(drop=True)
            # group equivalent classes/properties for each original
            # attribute
            same_cats = same_cats.append(query_result, ignore_index=True)

    if same_cats.empty:
        return matches

    combinations = list(itertools.combinations(cat_cols_stripped, 2))
    combinations_sorted = pd.DataFrame([sorted(x) for x in combinations],
                                       columns=["uri_1", "uri_2"])

    # detect matches in the attributes
    for _, row in same_cats.iterrows():
        if row["object"] in cat_cols_stripped:
            # if there is a match, insert it in alphabetical order into the
            # output matches dataframe
            new_match = {
                "uri_1": min(row["value"], row["object"]),
                "uri_2": max(row["value"], row["object"]),
                "value": match_score
            }
            matches = matches.append(new_match, ignore_index=True)

    matches = matches.drop_duplicates()
    full_matches = combinations_sorted.merge(matches,
                                             on=["uri_1", "uri_2"],
                                             how="outer")
    full_matches["value"] = np.where(full_matches["value"].isna(), 0,
                                     full_matches["value"])
    return full_matches
def qualified_relation_generator(df,
                                 columns,
                                 endpoint=DBpedia,
                                 uri_data_model=False,
                                 progress=True,
                                 prefix="Link",
                                 direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean",
                                 hierarchy=False,
                                 prefix_lookup=False,
                                 caching=True):
    """Qualified relation generator considers not only relations, but also
    the related types, adding boolean, count, relative count or
    tfidf-value features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        prefix (str, optional): Custom prefix for the SPARQL query.
            Defaults to "Link".
        direction (str, optional): The direction of the properties; choose
            from incoming ("In") and outgoing ("Out"). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for
            filtering types. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all
            superclasses of the returned types is attached to the resulting
            dataframe. Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph.
    """
    df = df.copy()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if not uri_data_model:
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + \
                    values + "} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + \
                    values + "} ?s ?p ?value. ?s rdf:type ?type. "
            if properties_regex_filter != None:
                regex_string = regex_string_generator(
                    "?p", properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter != None:
                regex_string = regex_string_generator(
                    "?type", types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "
            if properties_regex_filter != None:
                regex_string = regex_string_generator(
                    "str(?p)", properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter != None:
                regex_string = regex_string_generator(
                    "str(?type)", types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = uri_querier(df, col, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)

        if type(result_df) != type(pd.DataFrame()):
            pass
        if result_df.empty:
            pass
        else:
            if hierarchy:
                hierarchy_col = hierarchy_graph_generator(
                    result_df["type"],
                    hierarchy_relation=
                    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                    max_hierarchy_depth=None,
                    endpoint=endpoint,
                    uri_data_model=uri_data_model,
                    progress=progress,
                    caching=caching)
                hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)
            result_df["link_with_type"] = result_df["p"] + "_type_" + \
                result_df["type"]
            result_df = result_df[["value", "link_with_type"]]
            result_df_dummies = result_df.join(
                result_df["link_with_type"].str.get_dummies()).drop(
                    "link_with_type", axis=1)
            result_df = get_result_df(
                result_df_dummies, result_type,
                prefix + "_" + direction + "_" + result_type + "_", df,
                columns)

    if hierarchy:
        # append the hierarchy to the df as an attribute; this will
        # generate a warning but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
def specific_relation_generator(
        df,
        columns,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=True,
        direct_relation="http://purl.org/dc/terms/subject",
        hierarchy_relation=None,
        max_hierarchy_depth=1,
        prefix_lookup=False,
        caching=True):
    """Creates attributes from a specific direct relation. Additionally, it
    is possible to append a hierarchy with a user-defined hierarchy
    relation.

    Args:
        df (pd.DataFrame): The dataframe to extend.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        direct_relation (str, optional): Direct relation used to create
            features. Defaults to "http://purl.org/dc/terms/subject".
        hierarchy_relation (str, optional): Hierarchy relation used to
            connect categories, e.g.
            http://www.w3.org/2004/02/skos/core#broader. Defaults to None.
        max_hierarchy_depth (int, optional): Maximal number of hierarchy
            steps taken. Defaults to 1.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: The dataframe with additional features.
    """
    df = df.copy()

    if hierarchy_relation:
        hierarchy_relation = re.sub(r"^.*?https://", "http://",
                                    hierarchy_relation)
        hierarchy = nx.DiGraph()
    direct_relation = re.sub(r"^.*?https://", "http://", direct_relation)

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    if df[columns].isna().all().item():
        return df

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if not uri_data_model:
            # Create the SPARQL query
            values = "(<" + df[col].str.cat(sep=">) (<") + ">) "
            query = "SELECT ?value ?object "
            query += " WHERE {VALUES (?value) {" + values
            query += "} ?value (<" + direct_relation + ">) ?object. }"
            # Retrieve the query results from the endpoint
            query_result = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).\
                drop_duplicates().reset_index(drop=True)
        else:
            # Create the URI query
            query = "SELECT ?value ?object WHERE {VALUES (?value) {(<**URI**>)}"
            query += " ?value (<" + direct_relation + ">) ?object. }"
            query_result = uri_querier(df, col, query,
                                       prefix_lookup=prefix_lookup,
                                       progress=progress, caching=caching)

        # delete empty columns (for example when the hierarchy relation
        # returns nothing)
        query_result = query_result.dropna(how="all", axis=1)

        # check if there are valid results; if not, keep the original frame
        if query_result.empty:
            continue

        # extract the hierarchy
        if hierarchy_relation:
            hierarchy_col = hierarchy_graph_generator(
                query_result["object"],
                hierarchy_relation=hierarchy_relation,
                max_hierarchy_depth=max_hierarchy_depth,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchy = nx.compose(hierarchy, hierarchy_col)

        query_grouped = query_result.groupby("value")["object"].apply(list)

        # bundle the unique new features
        new_cols = pd.Series(query_grouped.values.sum()).unique()

        # create the shape of the result dataframe to fill
        df_to_append = pd.DataFrame(columns=new_cols)
        df_to_append["value"] = query_grouped.index

        # check for each URI if it belongs to the category and tick
        # True/False
        for row, new_col in itertools.product(df_to_append.index, new_cols):
            df_to_append.loc[row, new_col] = np.where(
                new_col in query_grouped[df_to_append.loc[row, "value"]],
                True, False).item()

        # merge the new columns with the original dataframe
        df_to_append.rename({"value": col}, axis=1, inplace=True)
        df = pd.merge(df, df_to_append, how="left", on=col)

        # rename the columns
        if new_cols.any():
            df.columns = [
                col + "_in_boolean_" + name if name in new_cols else name
                for name in df.columns
            ]

    # append the hierarchy to the df as an attribute; this will generate a
    # warning but works
    if hierarchy_relation:
        df.attrs = {"hierarchy": hierarchy}

    return df
def unqualified_relation_generator(df,
                                   columns,
                                   endpoint=DBpedia,
                                   uri_data_model=False,
                                   progress=True,
                                   prefix="Link",
                                   direction="Out",
                                   regex_filter=None,
                                   result_type="boolean",
                                   prefix_lookup=False,
                                   caching=True):
    """Unqualified relation generator creates attributes from the existence
    of relations and adds boolean, count, relative count or tfidf-value
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        prefix (str, optional): Custom prefix for the SPARQL query.
            Defaults to "Link".
        direction (str, optional): The direction of the properties; choose
            from incoming ("In") and outgoing ("Out"). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph.
    """
    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if not uri_data_model:
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + \
                    values + "} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + \
                    values + "} ?s ?p ?value "
            if regex_filter != None:
                regex_string = regex_string_generator("?p", regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            if direction == "Out":
                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "
            elif direction == "In":
                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "
            if regex_filter != None:
                regex_string = regex_string_generator("str(?p)",
                                                      regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = uri_querier(df, col, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)

        if type(result_df) != type(pd.DataFrame()):
            pass
        if result_df.empty:
            pass
        else:
            result_df_dummies = result_df.join(
                result_df["p"].str.get_dummies()).drop("p", axis=1)
            result_df = get_result_df(
                result_df_dummies, result_type,
                prefix + "_" + direction + "_" + result_type + "_", df,
                columns)

    return result_df
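# Hedged usage sketch for unqualified_relation_generator (dataframe and
# filter are illustrative): one boolean column per incoming property,
# prefixed "Link_In_boolean_".
#
#   cities = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Mannheim"]})
#   df_u = unqualified_relation_generator(cities, "uri", direction="In",
#                                         result_type="boolean")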
def direct_type_generator(df,
                          columns,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          progress=True,
                          prefix="",
                          regex_filter=None,
                          result_type="boolean",
                          bundled_mode=True,
                          hierarchy=False,
                          prefix_lookup=False,
                          caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph
    and queries the type(s) of the linked resources (using rdf:type). The
    resulting types are added as new columns, which are filled either with
    a boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        prefix (str, optional): Custom prefix for the SPARQL query.
            Defaults to "".
        regex_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation! Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all
            superclasses of the returned types is attached to the resulting
            dataframe. Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing
        the found types.
    """
    df = df.copy()
    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create the SPARQL query (based on rdf:type) for each user-specified
    # column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:
        # If bundled_mode is selected, all necessary queries for a column
        # are bundled into one query (using the VALUES method). -> Way
        # faster but less compatible.
        if bundled_mode and not uri_data_model:
            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "
            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values + "} ?value rdf:type ?types . "
            if regex_filter != None:
                regex_string = regex_string_generator("?types",
                                                      regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            result_df = pd.DataFrame()
            if uri_data_model:
                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "
                if regex_filter != None:
                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)
                    query = query + "FILTER(" + regex_string + ") "
                query = query + "}"
                result_df = uri_querier(df, column, query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress, caching=caching)
            else:
                for uri in df[column].iteritems():
                    if pd.notna(uri[1]):
                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1] + ">"
                        if regex_filter != None:
                            query = query + " && (" + \
                                regex_string_generator("?types",
                                                       regex_filter) + ")"
                        query = query + ") }"
                        result = endpoint_wrapper(
                            query, endpoint, prefix_lookup=prefix_lookup,
                            caching=caching)
                        result_df = result_df.append(result)
                    else:
                        pass
            result_df = result_df.rename(
                {"callret-0": "value"},
                axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:
            result_columns = []
            pass
        else:
            # Results are transformed to a sparse dataframe (rows:
            # looked-up URIs; columns: types) with dummy-encoding (0/1) ->
            # Each result is one row
            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # The sparse dataframe is grouped by URI
            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get a prefix (format depends on single or
            # multiple columns)
            if len(columns) > 1:
                result_df_grouped = result_df_grouped.add_prefix("type_")
            else:
                result_df_grouped = result_df_grouped.add_prefix(
                    column + "_type_")

            # Results get concatenated to the queried columns (to be used
            # as identifiers)
            result_df_merged = pd.merge(df[columns],
                                        result_df_grouped,
                                        left_on=column,
                                        right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results
            # are merged with the results of previous passes of the loop
            final_result_df = pd.concat(
                [final_result_df, result_df_merged],
                sort=False).groupby(columns,
                                    dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct
            # dtype
            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:
        # If result_type is "boolean", all values greater than 0 are
        # changed to True, all others to False
        if result_type == "boolean":
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")
        # If result_type is "relative" or "tfidf", calculate the relative
        # counts per row
        elif result_type in ["relative", "tfidf"]:
            # Calculate the relative counts by dividing each row by its
            # sum; fillna(0) replaces missings created by division by zero
            # (when sum=0)
            final_result_df_relative = final_result_df.copy()
            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)
            # If result_type is "tfidf", use the table of relative counts
            # to create the table of tfidf-values
            if result_type == "tfidf":
                # Calculate the idf values
                N = len(final_result_df[result_columns])
                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)
                idf = np.log(N / nt).replace(np.inf, 0)
                # Multiply the relative counts with the idf values
                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")
            final_result_df = final_result_df_relative.copy()

    # The collected query results get appended to the original dataframe
    df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
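# Hedged usage sketch for direct_type_generator (URI and filter are
# illustrative):
#
#   cities = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Mannheim"]})
#   df_t = direct_type_generator(cities, "uri", regex_filter=["dbpedia"],
#                                result_type="boolean", hierarchy=True)
#   # -> one boolean column per rdf:type ("uri_type_..."); the subClassOf
#   #    hierarchy is attached as df_t.attrs["hierarchy"]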
def data_properties_generator(df,
                              columns,
                              endpoint=DBpedia,
                              uri_data_model=False,
                              progress=True,
                              type_filter=None,
                              regex_filter=None,
                              bundled_mode=True,
                              prefix_lookup=False,
                              caching=True):
    """Generator that takes a dataset with a link to a knowledge graph and
    creates a new feature for each data property of the given resource.

    Args:
        df (pd.DataFrame): Dataframe to which the features will be added.
        columns (str/list): Name(s) of column(s) which contain(s) the
            link(s) to the knowledge graph.
        endpoint (Endpoint, optional): Base string to the knowledge graph;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        type_filter (str, optional): Property datatype to be selected from
            the results (e.g. xsd:string). If a specific datatype should be
            excluded, a "- " needs to be prepended (e.g. "- xsd:string").
            Defaults to None.
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation! Defaults to True.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column for each property.
    """
    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Prepare the type filter statement (decode include/exclude)
    if type_filter != None:
        if type_filter[0:2] == "- ":
            type_filter_str = " && DATATYPE(?v) != " + type_filter[2:]
        else:
            type_filter_str = " && DATATYPE(?v) = " + type_filter

    # Create the SPARQL query for each user-specified column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if bundled_mode and not uri_data_model:
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            query = "SELECT ?value ?p ?v WHERE {VALUES (?value) {" + \
                values + "} ?value ?p ?v FILTER(isLITERAL(?v)"
            if type_filter != None:
                query = query + type_filter_str
            if regex_filter != None:
                query = query + " && regex(?p, \"" + regex_filter + "\")"
            query = query + ")}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            result_df = pd.DataFrame()
            if uri_data_model:
                query = "SELECT DISTINCT ?value ?p ?v WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?v FILTER(isLITERAL(?v)"
                if type_filter != None:
                    query = query + type_filter_str
                if regex_filter != None:
                    query = query + " && regex(?p, \"" + regex_filter + "\")"
                query = query + ")}"
                result_df = uri_querier(df, col, query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress, caching=caching)
            else:
                for uri in df[col].iteritems():
                    if pd.notna(uri[1]):
                        query = "SELECT DISTINCT ?value ?p ?v WHERE {?value ?p ?v . FILTER (?value = <" + \
                            uri[1] + "> && (isLITERAL(?v))"
                        if type_filter != None:
                            query = query + type_filter_str
                        if regex_filter != None:
                            query = query + " && regex(?p, \"" + \
                                regex_filter + "\")"
                        query = query + ")} "
                        result = endpoint_wrapper(
                            query, endpoint, prefix_lookup=prefix_lookup,
                            caching=caching)
                        result_df = result_df.append(result)
                    else:
                        pass

        if result_df.empty:
            pass
        else:
            # Prefix the property names with the column they were found for
            result_df["p"] = col + "_data_" + result_df["p"]
            # transform the values into new columns
            result_df = result_df.pivot_table(values="v",
                                              index="value",
                                              columns="p",
                                              aggfunc=np.random.choice)
            # append the properties to the dataframe
            df = pd.merge(df, result_df, how="left", left_on=col,
                          right_on="value")

    return df
def check_uri_redirects(
        df,
        column,
        replace=True,
        custom_name_postfix=None,
        redirection_property="http://dbpedia.org/ontology/wikiPageRedirects",
        endpoint=DBpedia,
        regex_filter="dbpedia",
        bundled_mode=True,
        uri_data_model=False,
        progress=True,
        caching=True):
    """Takes a column of URIs from a DataFrame and checks for each whether
    the endpoint has a redirection set for it. If this is the case, the URI
    it redirects to is either added in a new column or replaces the
    original URI.

    Args:
        df (pd.DataFrame): Dataframe for which the URIs should be
            inspected.
        column (str): Name of the column that contains the URIs that should
            be checked.
        replace (bool, optional): If True: URIs that get redirected will be
            replaced with the new URI; if False: a new column, containing
            the result for each URI, is added to the DataFrame. Defaults to
            True.
        custom_name_postfix (str, optional): Custom postfix for the newly
            created column (in case "replace" is set to False). Defaults to
            None.
        redirection_property (str, optional): Relation/Property URI that
            signals a redirect for this endpoint. Defaults to
            "http://dbpedia.org/ontology/wikiPageRedirects".
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried;
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        regex_filter (str, optional): Only URIs matching the specified
            regex are checked for redirects. Defaults to "dbpedia".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!; ignored when
            "uri_data_model" = True. Defaults to True.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Raises:
        ValueError: Raised if 'custom_name_postfix' is set to "" instead of
            None.

    Returns:
        pd.DataFrame: Returns the dataframe with cleaned links / a new
        column.
    """
    if custom_name_postfix == "":
        raise ValueError(
            "'custom_name_postfix' can't be an empty string. If you don't want to use a custom_name_postfix, please set the attribute to None"
        )

    df = df.copy()

    if bundled_mode and not uri_data_model:
        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "
        query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {" + \
            values + "} ?value <" + redirection_property + "> ?redirect . }"
        result_df = endpoint_wrapper(
            query, endpoint,
            caching=caching).drop_duplicates().reset_index(drop=True)
    else:
        result_df = pd.DataFrame()
        if uri_data_model:
            query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {(<**URI**>)} ?value <" + \
                redirection_property + "> ?redirect . }"
            result_df = uri_querier(df, column, query,
                                    regex_filter=regex_filter,
                                    progress=progress, caching=caching)
        else:
            for uri in df[column].iteritems():
                if pd.notna(uri[1]):
                    query = "SELECT DISTINCT ?value ?redirect WHERE {?value <" + \
                        redirection_property + "> ?redirect . FILTER (?value = <" + \
                        uri[1] + ">) }"
                    result = endpoint_wrapper(query, endpoint,
                                              caching=caching)
                    result_df = result_df.append(result)
                else:
                    pass
        result_df = result_df.rename(
            {"callret-0": "value"},
            axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:
        return df
    else:
        if custom_name_postfix == None:
            new_attribute_name = column + "_redirect"
        else:
            new_attribute_name = column + custom_name_postfix
        result_df = pd.merge(
            df, result_df, how="left", left_on=column,
            right_on="value").drop("value", axis=1).rename(
                columns={"redirect": new_attribute_name})
        if replace:
            result_df.loc[(pd.isnull(result_df[new_attribute_name])),
                          new_attribute_name] = result_df[column]
            result_df.drop(column, axis=1, inplace=True)
            result_df.rename(columns={new_attribute_name: column},
                             inplace=True)

    return result_df
def link_explorer(df,
                  base_link_column,
                  number_of_hops=1,
                  links_to_follow=["owl:sameAs"],
                  lod_sources=[],
                  exclude_sources=[],
                  prefix_lookup=False,
                  progress=True,
                  caching=True):
    """Follows the defined links starting from a base link up to a certain
    number of hops. Adds the discovered links as new columns to the
    dataframe.

    Args:
        df (pd.DataFrame): Dataframe with a base link.
        base_link_column (str): Name of the column which contains the base
            link to start with.
        number_of_hops (int, optional): Depth of exploration of the LOD
            cloud. Defaults to 1.
        links_to_follow (list, optional): Names of links that should be
            followed. Defaults to ["owl:sameAs"].
        lod_sources (list, optional): Restrict exploration to certain
            datasets. Use strings or regular expressions to define the
            allowed datasets. Defaults to [].
        exclude_sources (list, optional): Exclude certain datasets from
            exploration. Use strings or regular expressions to define the
            datasets. Defaults to [].
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the SPARQL query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column for each discovered link.
    """
    if not isinstance(links_to_follow, list):
        links_to_follow = [links_to_follow]
    if not isinstance(exclude_sources, list):
        exclude_sources = [exclude_sources]
    if not isinstance(lod_sources, list):
        lod_sources = [lod_sources]

    all_links = list(df[base_link_column])

    query_raw = " SELECT DISTINCT ?value ?uri{} WHERE {{VALUES (?value) {{(<**URI**>)}} ?value " + \
        "|".join(links_to_follow) + " ?uri{} }} "

    df_merged = pd.DataFrame()
    df_all = pd.DataFrame()

    if progress:
        iterator = tqdm(range(1, number_of_hops + 1),
                        desc="Link Explorer - Performing Hops.")
    else:
        iterator = range(1, number_of_hops + 1)

    for hop in iterator:
        query = query_raw.format(str(hop), str(hop))
        if hop == 1:
            df_result = uri_querier(df, base_link_column, query,
                                    prefix_lookup=prefix_lookup,
                                    caching=caching, progress=progress)
        else:
            df_result = uri_querier(df_result, "uri" + str(hop - 1), query,
                                    prefix_lookup=prefix_lookup,
                                    caching=caching, progress=progress)
        if df_result.empty:
            break

        # eliminate duplicate links
        df_result = df_result[~df_result["uri" + str(hop)].isin(all_links)]

        # filter sources
        if lod_sources:
            df_result = df_result[df_result["uri" + str(hop)].str.contains(
                "|".join(lod_sources))]

        # exclude certain sources defined by string or regex
        if exclude_sources:
            df_result = df_result[~df_result["uri" + str(hop)].str.contains(
                "|".join(exclude_sources))]

        if df_result.empty:
            break

        if df_merged.empty:
            df_merged = df_result
        else:
            df_merged = pd.merge(
                df_merged, df_result, left_on="uri" + str(hop - 1),
                right_on="value", how="left",
                suffixes=("", "_y")).drop("value_y", axis=1)

        df_all = df_all.append(
            df_merged[["value", "uri" + str(hop)]].rename(
                columns={"uri" + str(hop): "uri"}))
        df_all = df_all.dropna().drop_duplicates()

        all_links += df_result["uri" + str(hop)].tolist()

    if df_all.empty:
        return df

    # determine a pay-level domain ("pld") for each discovered URI by
    # extending the regex path segment by segment until every (value, pld)
    # combination occurs exactly once
    df_all["count"] = np.nan
    regex_pattern = "^http:/"
    while True:
        regex_pattern += "/[^/]*"
        df_all["pld"] = df_all.apply(
            lambda x: x["pld"] if x["count"] == 1 else re.search(
                r"{}".format(regex_pattern), x["uri"]).group(),
            axis=1)
        df_all = df_all.drop("count", axis=1)
        df_with_counts = df_all.groupby(
            ["value", "pld"]).size().reset_index(name="count")
        df_all = pd.merge(df_all, df_with_counts,
                          left_on=["value", "pld"],
                          right_on=["value", "pld"])
        # break the loop when all counts are 1
        if (df_all["count"] == 1).all():
            break

    df_pivot = df_all.pivot_table(values="uri", index="value",
                                  columns="pld",
                                  aggfunc="first").reset_index()

    df_final = pd.merge(df, df_pivot, left_on=base_link_column,
                        right_on="value", how="outer").drop("value", axis=1)

    return df_final