def test11_subclass_with_max_depth_uri(self):
    """rdfs:subClassOf hierarchy via the URI data model, capped at depth 2."""
    frame = pd.DataFrame({
        'value': ['http://dbpedia.org/resource/Explorair'] * 3
                 + ['http://dbpedia.org/resource/Buxton_Watermill'] * 2,
        'types': ['http://www.w3.org/2002/07/owl#Thing',
                  'http://dbpedia.org/ontology/Company',
                  'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent',
                  'http://www.w3.org/2002/07/owl#Thing',
                  'http://dbpedia.org/ontology/Place']
    })
    # Expected graph: edges first, then any nodes not covered by an edge.
    expected = nx.DiGraph([
        ('http://dbpedia.org/ontology/Company',
         'http://dbpedia.org/ontology/Organisation'),
        ('http://dbpedia.org/ontology/Organisation',
         'http://dbpedia.org/ontology/Agent'),
        ('http://dbpedia.org/ontology/Place',
         'http://www.w3.org/2002/07/owl#Thing'),
        ('http://dbpedia.org/ontology/Agent',
         'http://www.w3.org/2002/07/owl#Thing')])
    expected.add_nodes_from([
        'http://dbpedia.org/ontology/Company',
        'http://dbpedia.org/ontology/Organisation',
        'http://dbpedia.org/ontology/Place',
        'http://www.w3.org/2002/07/owl#Thing',
        'http://dbpedia.org/ontology/Agent'])
    result = hierarchy_graph_generator(
        frame['types'], max_hierarchy_depth=2, uri_data_model=True)
    assert nx.is_isomorphic(expected, result)
def test7_default_nan(self):
    """Default generator call on a type column containing NaN entries."""
    frame = pd.DataFrame({
        'value': ['http://dbpedia.org/resource/Explorair'] * 3
                 + ['http://dbpedia.org/resource/Buxton_Watermill'] * 2,
        'types': ['http://www.w3.org/2002/07/owl#Thing',
                  np.nan,
                  'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent',
                  np.nan,
                  'http://dbpedia.org/ontology/Place']
    })
    expected = nx.DiGraph([
        ('http://dbpedia.org/ontology/Place',
         'http://www.w3.org/2002/07/owl#Thing'),
        ('http://dbpedia.org/ontology/Place',
         'http://umbel.org/umbel/rc/Place')])
    expected.add_nodes_from([
        'http://dbpedia.org/ontology/Place',
        'http://www.w3.org/2002/07/owl#Thing',
        'http://umbel.org/umbel/rc/Place',
        'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent'])
    result = hierarchy_graph_generator(frame['types'])
    assert nx.is_isomorphic(expected, result)
def test18_skos_all_nan_uri(self):
    """An all-NaN input column yields an empty hierarchy graph."""
    frame = pd.DataFrame({'category': [np.nan, np.nan]})
    expected = nx.DiGraph()
    result = hierarchy_graph_generator(
        frame['category'],
        hierarchy_relation="http://www.w3.org/2004/02/skos/core#broader",
        max_hierarchy_depth=1,
        uri_data_model=True)
    assert nx.is_isomorphic(expected, result)
def test15_invalid_hierarchy_relation_uri(self):
    """An unknown hierarchy relation produces isolated nodes and no edges."""
    frame = pd.DataFrame({
        'category': ['http://dbpedia.org/resource/Category:1913',
                     'http://dbpedia.org/resource/Category:2020']
    })
    expected = nx.DiGraph()
    expected.add_nodes_from([
        'http://dbpedia.org/resource/Category:1913',
        'http://dbpedia.org/resource/Category:2020'])
    result = hierarchy_graph_generator(
        frame['category'],
        hierarchy_relation="http://www.w3.org/2004/02/skos/ballaballa",
        max_hierarchy_depth=None,
        uri_data_model=True)
    assert nx.is_isomorphic(expected, result)
def test17_skos_broader_nan_uri(self):
    """skos:broader hierarchy (depth 1) with a NaN mixed into the input."""
    frame = pd.DataFrame({
        'category': [np.nan, 'http://dbpedia.org/resource/Category:2020']
    })
    expected = nx.DiGraph([
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:2020s'),
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:Years'),
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:Years_in_the_future')])
    expected.add_nodes_from([
        'http://dbpedia.org/resource/Category:Years',
        'http://dbpedia.org/resource/Category:2020',
        'http://dbpedia.org/resource/Category:2020s',
        'http://dbpedia.org/resource/Category:Years_in_the_future'])
    result = hierarchy_graph_generator(
        frame['category'],
        hierarchy_relation="http://www.w3.org/2004/02/skos/core#broader",
        max_hierarchy_depth=1,
        uri_data_model=True)
    assert nx.is_isomorphic(expected, result)
def test1_default_behaviour(self):
    """Default call: full rdfs:subClassOf hierarchy for the input types."""
    frame = pd.DataFrame({
        'value': ['http://dbpedia.org/resource/Explorair'] * 3
                 + ['http://dbpedia.org/resource/Buxton_Watermill'] * 2,
        'types': ['http://www.w3.org/2002/07/owl#Thing',
                  'http://dbpedia.org/ontology/Company',
                  'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent',
                  'http://www.w3.org/2002/07/owl#Thing',
                  'http://dbpedia.org/ontology/Place']
    })
    nodes = [
        'http://dbpedia.org/ontology/Company',
        'http://purl.org/goodrelations/v1#BusinessEntity',
        'http://dbpedia.org/ontology/Organisation',
        'http://umbel.org/umbel/rc/Business',
        'http://dbpedia.org/ontology/Place',
        'http://www.w3.org/2002/07/owl#Thing',
        'http://umbel.org/umbel/rc/Place',
        'http://www.ontologydesignpatterns.org/ont/dul/DUL.owl#Agent',
        'http://xmlns.com/foaf/0.1/Organization',
        'http://www.openlinksw.com/virtpivot/icons/Business',
        'http://dbpedia.org/ontology/Agent',
        'http://umbel.org/umbel/rc/Organization',
        'http://www.openlinksw.com/virtpivot/icons/Organization',
        'http://umbel.org/umbel/rc/Agent-Generic']
    edges = [
        ('http://dbpedia.org/ontology/Company',
         'http://purl.org/goodrelations/v1#BusinessEntity'),
        ('http://dbpedia.org/ontology/Company',
         'http://dbpedia.org/ontology/Organisation'),
        ('http://dbpedia.org/ontology/Company',
         'http://umbel.org/umbel/rc/Business'),
        ('http://purl.org/goodrelations/v1#BusinessEntity',
         'http://xmlns.com/foaf/0.1/Organization'),
        ('http://purl.org/goodrelations/v1#BusinessEntity',
         'http://www.openlinksw.com/virtpivot/icons/Business'),
        ('http://dbpedia.org/ontology/Organisation',
         'http://dbpedia.org/ontology/Agent'),
        ('http://dbpedia.org/ontology/Organisation',
         'http://umbel.org/umbel/rc/Organization'),
        ('http://dbpedia.org/ontology/Place',
         'http://www.w3.org/2002/07/owl#Thing'),
        ('http://dbpedia.org/ontology/Place',
         'http://umbel.org/umbel/rc/Place'),
        ('http://xmlns.com/foaf/0.1/Organization',
         'http://www.openlinksw.com/virtpivot/icons/Organization'),
        ('http://dbpedia.org/ontology/Agent',
         'http://www.w3.org/2002/07/owl#Thing'),
        ('http://dbpedia.org/ontology/Agent',
         'http://umbel.org/umbel/rc/Agent-Generic')]
    expected = nx.DiGraph()
    expected.add_nodes_from(nodes)
    expected.add_edges_from(edges)
    result = hierarchy_graph_generator(frame['types'])
    assert nx.is_isomorphic(expected, result)
def test13_skos_broader_max_2_uri(self):
    """skos:broader hierarchy via URI data model, capped at depth 2."""
    frame = pd.DataFrame({
        'category': ['http://dbpedia.org/resource/Category:1913',
                     'http://dbpedia.org/resource/Category:2020']
    })
    nodes = [
        'http://dbpedia.org/resource/Category:1913',
        'http://dbpedia.org/resource/Category:Years',
        'http://dbpedia.org/resource/Category:Chronology',
        'http://dbpedia.org/resource/Category:Units_of_time',
        'http://dbpedia.org/resource/Category:2020',
        'http://dbpedia.org/resource/Category:1910s',
        'http://dbpedia.org/resource/Category:20th_century',
        'http://dbpedia.org/resource/Category:Decades',
        'http://dbpedia.org/resource/Category:Years_in_the_future',
        'http://dbpedia.org/resource/Category:Time_periods_in_the_future',
        'http://dbpedia.org/resource/Category:2020s',
        'http://dbpedia.org/resource/Category:21st_century',
        'http://dbpedia.org/resource/Category:Decades_in_the_future']
    edges = [
        ('http://dbpedia.org/resource/Category:1913',
         'http://dbpedia.org/resource/Category:Years'),
        ('http://dbpedia.org/resource/Category:1913',
         'http://dbpedia.org/resource/Category:1910s'),
        ('http://dbpedia.org/resource/Category:Years',
         'http://dbpedia.org/resource/Category:Chronology'),
        ('http://dbpedia.org/resource/Category:Years',
         'http://dbpedia.org/resource/Category:Units_of_time'),
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:Years'),
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:Years_in_the_future'),
        ('http://dbpedia.org/resource/Category:2020',
         'http://dbpedia.org/resource/Category:2020s'),
        ('http://dbpedia.org/resource/Category:1910s',
         'http://dbpedia.org/resource/Category:20th_century'),
        ('http://dbpedia.org/resource/Category:1910s',
         'http://dbpedia.org/resource/Category:Decades'),
        ('http://dbpedia.org/resource/Category:Years_in_the_future',
         'http://dbpedia.org/resource/Category:Years'),
        ('http://dbpedia.org/resource/Category:Years_in_the_future',
         'http://dbpedia.org/resource/Category:Time_periods_in_the_future'),
        ('http://dbpedia.org/resource/Category:2020s',
         'http://dbpedia.org/resource/Category:21st_century'),
        ('http://dbpedia.org/resource/Category:2020s',
         'http://dbpedia.org/resource/Category:Decades'),
        ('http://dbpedia.org/resource/Category:2020s',
         'http://dbpedia.org/resource/Category:Decades_in_the_future')]
    expected = nx.DiGraph()
    expected.add_nodes_from(nodes)
    expected.add_edges_from(edges)
    result = hierarchy_graph_generator(
        frame['category'],
        hierarchy_relation="http://www.w3.org/2004/02/skos/core#broader",
        max_hierarchy_depth=2,
        uri_data_model=True)
    assert nx.is_isomorphic(expected, result)
def specific_relation_generator(
        df, columns, endpoint=DBpedia, uri_data_model=False, progress=True,
        direct_relation="http://purl.org/dc/terms/subject",
        hierarchy_relation=None, max_hierarchy_depth=1, prefix_lookup=False,
        caching=True):
    """Creates attributes from a specific direct relation. Additionally, it
    is possible to append a hierarchy with a user-defined hierarchy relation.

    Args:
        df (pd.DataFrame): the dataframe to extend
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        direct_relation (str, optional): Direct relation used to create
            features. Defaults to "http://purl.org/dc/terms/subject".
        hierarchy_relation (str, optional): Hierarchy relation used to connect
            categories, e.g. http://www.w3.org/2004/02/skos/core#broader.
            Defaults to None.
        max_hierarchy_depth (int, optional): Maximal number of hierarchy steps
            taken. Defaults to 1.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: The dataframe with additional features.
    """
    df = df.copy()

    if hierarchy_relation:
        # Normalise a leading https:// scheme to http:// before querying.
        hierarchy_relation = re.sub(r"^.*?https://", "http://",
                                    hierarchy_relation)
        hierarchy = nx.DiGraph()

    direct_relation = re.sub(r"^.*?https://", "http://", direct_relation)

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Nothing to query when every link cell is missing.
    # FIX: .all(axis=None) reduces to a single scalar; the previous
    # .all().item() raised a ValueError as soon as more than one column
    # was passed (Series.item() requires exactly one element).
    if df[columns].isna().all(axis=None):
        return df

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if not uri_data_model:
            # Create Sparql Query
            values = "(<" + df[col].str.cat(sep=">) (<") + ">) "
            query = "SELECT ?value ?object "
            query += " WHERE {VALUES (?value) {" + values
            query += "} ?value (<" + direct_relation + ">) ?object. }"

            # Retrieve query results from endpoint
            query_result = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            # Create URI Query
            query = "SELECT ?value ?object WHERE {VALUES (?value) {(<**URI**>)}"
            query += " ?value (<" + direct_relation + ">) ?object. }"

            query_result = uri_querier(df, col, query,
                                       prefix_lookup=prefix_lookup,
                                       progress=progress, caching=caching)

        # delete empty columns (for example when hierarchy relation returns
        # nothing)
        query_result = query_result.dropna(how="all", axis=1)

        # check if there are valid results; if not move on to the next column
        if query_result.empty:
            continue

        # extract hierarchy
        if hierarchy_relation:
            hierarchy_col = hierarchy_graph_generator(
                query_result["object"],
                hierarchy_relation=hierarchy_relation,
                max_hierarchy_depth=max_hierarchy_depth, endpoint=endpoint,
                uri_data_model=uri_data_model, progress=progress,
                caching=caching)
            hierarchy = nx.compose(hierarchy, hierarchy_col)

        query_grouped = query_result.groupby("value")["object"].apply(list)

        # bundle the unique new features
        new_cols = pd.Series(query_grouped.values.sum()).unique()

        # create shape of result dataframe to fill
        df_to_append = pd.DataFrame(columns=new_cols)
        df_to_append["value"] = query_grouped.index

        # check for each URI if it belongs to the category and tick True/False
        for row, new_col in itertools.product(df_to_append.index, new_cols):
            df_to_append.loc[row, new_col] = np.where(
                new_col in query_grouped[df_to_append.loc[row, "value"]],
                True, False).item()

        # merge the new column with the original dataframe
        df_to_append.rename({"value": col}, axis=1, inplace=True)
        df = pd.merge(df, df_to_append, how="left", on=col)

        # rename columns
        if new_cols.any():
            df.columns = [
                col + "_in_boolean_" + name if name in new_cols else name
                for name in df.columns
            ]

    # append hierarchy to df as attribute, this will generate a warning but
    # works
    if hierarchy_relation:
        df.attrs = {"hierarchy": hierarchy}

    return df
def qualified_relation_generator(df, columns, endpoint=DBpedia,
                                 uri_data_model=False, progress=True,
                                 prefix="Link", direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean", hierarchy=False,
                                 prefix_lookup=False, caching=True):
    """Qualified relation generator considers not only relations, but also
    the related types, adding boolean, counts, relative counts or
    tfidf-values features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults
            to "Link".
        direction (str, optional): The direction for properties which choose
            from Incoming, Outgoing (In and Out). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering
            types. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph
    """
    df = df.copy()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:
        if not uri_data_model:
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "
            # FIX: idiomatic None checks (was "!= None")
            if properties_regex_filter is not None:
                regex_string = regex_string_generator("?p",
                                                      properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter is not None:
                regex_string = regex_string_generator("?type",
                                                      types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            if direction == "Out":
                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "
            elif direction == "In":
                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "
            if properties_regex_filter is not None:
                regex_string = regex_string_generator("str(?p)",
                                                      properties_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            if types_regex_filter is not None:
                regex_string = regex_string_generator("str(?type)",
                                                      types_regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = uri_querier(df, col, query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress, caching=caching)

        # NOTE(review): a no-op guard
        # `if type(result_df) != type(pd.DataFrame()): pass` was removed
        # here — both branches above always produce a DataFrame.
        if not result_df.empty:
            if hierarchy:
                hierarchy_col = hierarchy_graph_generator(
                    result_df["type"],
                    hierarchy_relation=
                    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                    max_hierarchy_depth=None, endpoint=endpoint,
                    uri_data_model=uri_data_model, progress=progress,
                    caching=caching)
                hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

            # combine property and type into one feature name
            result_df["link_with_type"] = result_df["p"] + "_type_" + result_df["type"]
            result_df = result_df[["value", "link_with_type"]]
            result_df_dummies = result_df.join(
                result_df["link_with_type"].str.get_dummies()).drop(
                    "link_with_type", axis=1)
            result_df = get_result_df(
                result_df_dummies, result_type,
                prefix + "_" + direction + "_" + result_type + "_", df,
                columns)

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning
        # but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
def direct_type_generator(df, columns, endpoint=DBpedia,
                          uri_data_model=False, progress=True, prefix="",
                          regex_filter=None, result_type="boolean",
                          bundled_mode=True, hierarchy=False,
                          prefix_lookup=False, caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph
    and queries the type(s) of the linked ressources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s)
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL . Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults
            to "".
        regex_filter (list, optional): A list filled with regexes (as
            strings) to filter the results. Defaults to None.
        result_type (str, optional): States wether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method). - Requires a
            SPARQL 1.1 implementation!. Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses
            of the returned types is attached to the resulting dataframe.
            Defaults to False.
        prefix_lookup (bool/str/dict, optional):
            True: Namespaces of prefixes will be looked up at prefix.cc and
            added to the sparql query.
            str: User provides the path to a json-file with prefixes and
            namespaces.
            dict: User provides a dictionary with prefixes and namespaces.
            Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing
        the found types.
    """
    df = df.copy()

    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:
        # If bundled_mode is selected all necessary queries for a column are
        # bundled into one query (using the VALUES method). -> Way faster but
        # less compatible.
        if bundled_mode and not uri_data_model:
            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "
            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values + "} ?value rdf:type ?types . "
            if regex_filter is not None:
                regex_string = regex_string_generator("?types", regex_filter)
                query = query + "FILTER(" + regex_string + ") "
            query = query + "}"
            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)
        else:
            result_df = pd.DataFrame()
            if uri_data_model:
                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "
                if regex_filter is not None:
                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)
                    query = query + "FILTER(" + regex_string + ") "
                query = query + "}"
                result_df = uri_querier(df, column, query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress, caching=caching)
            else:
                # One query per URI.
                # FIX: Series.iteritems() was removed in pandas 2.0 -> items()
                for uri in df[column].items():
                    if pd.notna(uri[1]):
                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1] + ">"
                        if regex_filter is not None:
                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"
                        query = query + ") }"
                        result = endpoint_wrapper(
                            query, endpoint, prefix_lookup=prefix_lookup,
                            caching=caching)
                        # FIX: DataFrame.append was removed in pandas 2.0
                        result_df = pd.concat([result_df, result])
            result_df = result_df.rename(
                {
                    "callret-0": "value"
                }, axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None, endpoint=endpoint,
                uri_data_model=uri_data_model, progress=progress,
                caching=caching)
            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:
            result_columns = []
        else:
            # Results are transformed to a sparse dataframe (rows: looked-up
            # uris; columns: types) with dummy-encoding (0/1) -> each result
            # is one row
            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri
            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get prefix (format depends on single or multiple
            # columns)
            if len(columns) > 1:
                result_df_grouped = result_df_grouped.add_prefix("type_")
            else:
                result_df_grouped = result_df_grouped.add_prefix(
                    column + "_type_")

            # Results get concatenated to the queried columns (to be used as
            # identifiers)
            result_df_merged = pd.merge(df[columns], result_df_grouped,
                                        left_on=column, right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: current results
            # are merged with the results of previous passes of the loop
            final_result_df = pd.concat(
                [final_result_df, result_df_merged],
                sort=False).groupby(columns, dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct dtype
            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:
        # If result_type is boolean, all values greater 0 are changed to True
        # all others to False
        if result_type == "boolean":
            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")
        # If result_type is "relative" or "tfidf", calculate the relative
        # counts per row
        elif result_type in ["relative", "tfidf"]:
            # Divide each row by its sum; fillna(0) replaces missings created
            # by division by zero (when sum=0)
            final_result_df_relative = final_result_df.copy()
            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to
            # create the table of tfidf-values
            if result_type == "tfidf":
                # Calculate idf values
                N = len(final_result_df[result_columns])
                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)
                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values
                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query-results get appended to the original dataframe
        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        df.attrs = {"hierarchy": hierarchyGraph}

    return df