def get_label_uri(uri):
    """Look up the English label of a resource.

    ``rdfs:label`` is queried first; ``foaf:name`` is queried only when the
    first query returns no rows.

    Arguments:
        uri {str} -- resource term, inserted verbatim into the SPARQL query

    Returns:
        the ``value`` field of the ``?l`` binding of the first row returned

    Raises:
        ValueError -- neither predicate returned a row
    """
    for predicate in ('rdfs:label', 'foaf:name'):
        rows = query_sparql(
            'select ?l where {' + uri + ' ' + predicate
            + ' ?l . FILTER(langMatches(lang(?l),"en")) }')
        if len(rows) > 0:
            return rows[0]['l']['value']
    raise ValueError("cannot find uri label")
def identify_city_uri_attribute(uri, attribute):
    """Retrieve the uris of the cities associated with a dbpedia page through
    an attribute contained in that page.

    We simply consider a city everything that is not a country. So we:
    1) Start from the values of ``attribute`` on the page ``uri``
    2) Follow redirect and disambiguation links from those values, recursively
    3) Filter out countries, US states, and pages that themselves redirect
       or disambiguate

    Arguments:
        uri {str} -- page term, inserted verbatim into the SPARQL queries
        attribute {str} -- predicate, inserted verbatim into the SPARQL queries

    Returns:
        list -- uris of the city pages, each wrapped in ``<>``; ``None`` when
            the page has no value for the attribute, or when no city could
            be associated with it (the latter case also logs a warning)
    """
    if not ask_sparql("ask { " + uri + " " + attribute + " ?s }"):
        return None

    # Adjacent literals keep the query free of the stray backslashes and
    # source-layout whitespace that in-string line continuations introduce.
    query = (
        "select distinct ?f where { "
        f"{uri} {attribute} ?s . "
        "?s (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)* ?f . "
        "filter not exists { ?f a dbo:Country } . "
        "filter not exists { ?f dct:subject dbc:States_of_the_United_States } . "
        "filter not exists { ?f dbo:wikiPageDisambiguates ?w } . "
        "filter not exists { ?f dbo:wikiPageRedirects ?w } . "
        "}"
    )
    results = query_sparql(query)
    ret = ['<' + r['f']['value'] + '>' for r in results]
    if len(ret) > 0:
        return ret

    logging.getLogger('root.features').warning(
        f"{uri} had a valid {attribute} value, but I was not able to associate a city to it."
    )
    return None
def get_disambiguating_uri(uri):
    """Given a uri, return the list of uris it disambiguates to.

    Arguments:
        uri {str} -- page term, inserted verbatim into the SPARQL query

    Returns:
        list -- one ``<...>``-wrapped uri per row returned for
            ``dbo:wikiPageDisambiguates``; empty when the query returns no rows
    """
    # The space before the predicate is required: without it the subject
    # term is glued to "dbo:wikiPageDisambiguates" in the query text.
    q = "SELECT ?s WHERE{" + uri + " dbo:wikiPageDisambiguates ?s . }"
    results = query_sparql(q)
    return [f"<{r['s']['value']}>" for r in results]
def get_redirecting_uri(uri):
    """Follow a single ``dbo:wikiPageRedirects`` link from a uri.

    Arguments:
        uri {str} -- page term, inserted verbatim into the SPARQL query

    Returns:
        str -- the redirect target wrapped in ``<>`` when the query returns
            a row, otherwise ``uri`` itself, unchanged

    Raises:
        AssertionError -- the query returned more than one row
    """
    rows = query_sparql("SELECT ?s WHERE{" + uri + " dbo:wikiPageRedirects ?s . }")
    if len(rows) == 0:
        # Nothing to follow: the input is handed back as is.
        return uri

    assert len(rows) == 1, \
        f"The uri {uri} has more that two redirecting pages, behaviour not expected!"
    target = rows[0]['s']['value']
    return f"<{target}>"
def identify_country_uri_attribute(uri, attribute):
    """Retrieve the dbpedia pages of the countries associated with a dbpedia
    page and an attribute contained in that page.

    The attribute can represent a general location. Starting from its values,
    redirect/disambiguation links and the location hierarchy
    (country, isPartOf, state, region, archipelago) are followed, and every
    page reached is kept only if it is a ``dbo:Country`` or a US state.
    eg: United Kingdom, Italy, Ireland ..

    Arguments:
        uri {str} -- page term, inserted verbatim into the SPARQL queries
        attribute {str} -- predicate, inserted verbatim into the SPARQL queries

    Returns:
        list -- uris of the country pages, each wrapped in ``<>``, without
            duplicates and never containing the United States page;
            ``None`` when the first query returns no rows, or when none of
            the candidates qualifies (the latter case also logs a warning)
    """
    # Adjacent literals keep the query free of the stray backslashes and
    # source-layout whitespace that in-string line continuations introduce.
    query = (
        "select distinct ?f where { "
        f"{uri} {attribute} ?s . "
        "?s (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)* ?r . "
        "?r (dbo:country|dbo:isPartOf|dbo:state|dbo:region|dbo:archipelago)* ?u . "
        "?u (dbo:wikiPageRedirects|dbo:wikiPageDisambiguates)* ?f . "
        # These two filters drop countries of the past, which are caught
        # every now and then.
        "filter not exists { ?f dbo:dissolutionYear ?w } . "
        "filter not exists { ?f dct:subject dbc:Imperialism } . "
        "filter not exists { ?f dbo:wikiPageDisambiguates ?w } . "
        "filter not exists { ?f dbo:wikiPageRedirects ?w } . "
        "}"
    )
    results = query_sparql(query)
    if len(results) == 0:
        return None

    candidates = [
        f"<{r['f']['value']}>" for r in results if r['f']['type'] == 'uri'
    ]
    ret = []
    seen = set()  # every candidate is checked with at most one ASK query
    for page in candidates:
        # USA not an interesting country, too common
        if page == '<http://dbpedia.org/resource/United_States>' or page in seen:
            continue
        seen.add(page)
        q = (
            "ask { "
            "{ " + page + " a dbo:Country } "
            "UNION "
            "{ " + page + " dct:subject dbc:States_of_the_United_States } "
            "}"
        )
        if ask_sparql(q):
            ret.append(page)

    if len(ret) > 0:
        return ret

    logging.getLogger('root.features').warning(
        f"{uri} had a valid {attribute} value, but I was not able to associate a country to it.")
    return None
def identify_date_attribute(uri, attribute):
    """Extract a date from the value of an attribute of a dbpedia page.

    Only the first row returned by the query is considered.

    Arguments:
        uri {str} -- page term, inserted verbatim into the SPARQL query
        attribute {str} -- predicate, inserted verbatim into the SPARQL query

    Returns:
        the result of ``pd.to_datetime`` on the first value found; ``None``
        when the query returns no rows, or when the conversion raises (in
        which case a warning is logged)
    """
    rows = query_sparql(
        'SELECT DISTINCT ?s WHERE{' + uri + ' ' + attribute + ' ?s . }')
    if len(rows) == 0:
        return None

    raw_value = rows[0]['s']['value']
    try:
        parsed = pd.to_datetime(raw_value)
    except Exception:
        # Best effort: an unparsable value is reported, not propagated.
        logging.getLogger('root.features').warning(
            f"{uri} had a valid {attribute} value, but I was not able to extract a date from it."
        )
        return None
    else:
        return parsed
def get_abstract_uri(uri):
    """Fetch the English ``dbo:abstract`` of a resource.

    Arguments:
        uri {str} -- resource term, inserted verbatim into the SPARQL query

    Returns:
        the ``value`` field of the ``?a`` binding of the first row returned

    Note:
        The first row is indexed unconditionally, so an empty result makes
        the lookup fail instead of returning a default.
    """
    query = ('select ?a where {' + uri
             + ' dbo:abstract ?a . FILTER(langMatches(lang(?a),"en")) }')
    rows = query_sparql(query)
    first_row = rows[0]
    return first_row['a']['value']
def search_label_space(label, narrowing_space_query='', selecting_results_query=''):
    """Searches in the whole dbpedia for the uris whose label matches the
    variable label.

    The preprocessed label is split on spaces and every token must appear
    (``bif:contains`` with ``and``) both in the English label of the starting
    page ``?s`` and in the English label of the page ``?f`` reached from it
    through redirect/disambiguation links.

    TODO: Beyoncé could not be found as label string, even though this is the
    label name of the actual page of the singer. Substrings of it cannot be
    found either (Beyon or Beyo), as if the page couldn't be seen from sparql.
    The same happens with the pages of Aminé, José González, Jack Ü, Zhané,
    Björk. However, the dbpedia page is accessible. Apparently, we have
    problems with accented characters.

    This method checks that the uris found are neither disambiguation nor
    redirection pages.

    Arguments:
        label {string} -- label to search for
        narrowing_space_query {string} -- query fragment that constrains
            variable ?s. When empty, ?s ranges over every labelled page
        selecting_results_query {string} -- query fragment that poses
            conditions on variable ?f, selecting the results that are
            returned by this method

    Returns:
        list -- uris found, each wrapped in ``<>``; empty when the
            preprocessed label is empty or contains no token

    Raises:
        AssertionError -- one of the uris found still redirects or
            disambiguates
    """
    preprocessed_label = _preprocess_label(label)
    if not preprocessed_label:
        return []

    # Consecutive spaces would otherwise yield empty '' search terms.
    tokens = [token for token in preprocessed_label.split(' ') if token]
    if not tokens:
        return []

    # The same full-text condition is applied to ?label and to ?l.
    contains_expr = '"' + " and ".join(f"'{token}'" for token in tokens) + '"'

    q = (
        "SELECT DISTINCT ?f { " + narrowing_space_query
        + ' ?s rdfs:label ?label . FILTER(lang(?label)="en") .'
        + " ?label bif:contains " + contains_expr + " . "
        + " ?s (dbo:wikiPageRedirects | dbo:wikiPageDisambiguates)* ?f . "
        + ' ?f rdfs:label ?l . FILTER(lang(?l)="en") .'
        + " ?l bif:contains " + contains_expr + " . "
        + selecting_results_query
        + " filter not exists {"
        + " ?f dbo:wikiPageRedirects|dbo:wikiPageDisambiguates ?dis"
        + " }"
        + " }"
    )

    results = query_sparql(q)
    uris_found = [f"<{row['f']['value']}>" for row in results]

    # check, the uris found should not redirect or disambiguate
    for uri in uris_found:
        assert not ask_sparql(
            'ask { {' + uri + ' dbo:wikiPageRedirects ?w } UNION {' + uri +
            ' dbo:wikiPageDisambiguates ?w } }'
        ), "The result is not expect to be neither a disambiguation nor a redirection page, something is incoherent"
    return uris_found