#Standard-library and third-party imports used below. The remaining names
#(DBpediaEndpoint, WikidataEndpoint, Lookup, DBpediaLookup, DBpediaOntology,
#SchemaOrgOntology, KG, URI_KG, getFilteredResources, getFilteredTypes,
#is_empty, SPARQL_END_POINT) are assumed to come from this repository's own modules.
import csv
import time
from os.path import join

import pandas as pd


class Endpoint(object):
    '''
    This class aims at identifying errors in the DBpedia ENDPOINT when retrieving
    positive/negative samples for candidate classes.
    '''

    '''
    def queryTripleByClass(top_k, c):
        triples = list()
        s = sparql.Service(SPARQL_END_POINT, "utf-8", "GET")
        statement = 'select distinct str(?s), str(?p), str(?o), str(?l) where {?s ?p ?o. ?o rdf:type <%s>. ' \
                    '?o rdfs:label ?l. FILTER( langMatches(lang(?l), "en"))} ORDER BY RAND() limit %d' % (c, top_k)
        result = s.query(statement)
        for row in result.fetchone():
            triples.append([row[0], row[1], row[2], row[3]])
        return triples
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
        self.lookup = Lookup()

    def __analyseEntityPredicateStrategy(self, ent, cls_uri):
        '''
        Analyses correctness of cls_uri as type of ent using the predicate-types strategy
        '''
        #Name-mangled access: __getTypesPredicateStrategy is private to Lookup,
        #so it cannot be reached as self.lookup.__getTypesPredicateStrategy from here
        predicate_types = self.lookup._Lookup__getTypesPredicateStrategy(ent)

        if len(predicate_types) == 0:
            return None
        if cls_uri in predicate_types:
            return True
        return False

    def __analyseEntityLookupStrategy(self, ent, cls_uri):
        '''
        Analyses correctness of cls_uri as type of ent using look-up types
        '''
        #Note that if there are no look-up types, we return the SPARQL types as they are.
        ##If there are look-up types, the SPARQL types must be compatible with them.
        clean_lookup_types = self.lookup.getTypesForEntity(ent, KG.DBpedia)

        if len(clean_lookup_types) == 0:
            return None
        if cls_uri in clean_lookup_types:
            return True
        return False

    def __analyseEntityWikidataStrategy(self, ent, cls_uri, wikidata_classes):
        '''
        Analyses correctness of cls_uri as type of ent using Wikidata
        '''
        #b. Get equivalent wikidata entity (if any)
        same_entities = self.dbpedia_ep.getSameEntities(ent)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity

        ##If there are no equivalent entities we then go for the look-up strategy
        if len(wikidata_entities) == 0:
            return self.__analyseEntityLookupStrategy(ent, cls_uri)

        for wk_ent in wikidata_entities:
            #c. Check if the wikidata type from (a) is within the types of the equivalent entity from (b)
            wk_ent_types = self.wikidata_ep.getAllTypesForEntity(wk_ent)  #we consider supertypes to extend compatibility
            time.sleep(0.01)  #to avoid the limit of calls

            intersect = wk_ent_types.intersection(wikidata_classes)
            if len(intersect) > 0:
                return True

        return False

    def getEntitiesForDBPediaClass(self, cls_uri, limit=1000):
        '''
        It currently expects a class URI from DBpedia
        '''
        ##We query a subset of entities for sampling
        clean_db_entities = dict()
        offset = 0

        #To guarantee the required number of (clean) entities for the class
        while len(clean_db_entities) < limit:
            #We extract more than required, as many of them will be noisy
            #db_entities = self.dbpedia_ep.getEntitiesForType(cls_uri, offset*limit*5, limit*5)
            db_entities = self.dbpedia_ep.getEntitiesLabelsForType(
                cls_uri, offset * limit * 5, limit * 5)

            #For the wikidata strategy:
            #a. Get equivalent class from wikidata (if any)
            db_eq_cls = self.dbpedia_ep.getEquivalentClasses(cls_uri)
            wikidata_classes = getFilteredTypes(db_eq_cls, KG.Wikidata)  ##typically one class

            #Filter counters for this batch of entities
            filtered_look_up = 0
            filtered_wikidata = 0
            filtered_predicates = 0

            for ent in db_entities:
                if len(clean_db_entities) >= limit:
                    print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up,
                                              filtered_predicates, filtered_wikidata))
                    return clean_db_entities

                results_look_up = self.__analyseEntityLookupStrategy(ent, cls_uri)

                if results_look_up is None:
                    results_predicates = self.__analyseEntityPredicateStrategy(ent, cls_uri)

                    if results_predicates is None:
                        #wikidata strategy (it is very costly)
                        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
                            clean_db_entities[ent] = db_entities[ent]
                        else:
                            filtered_wikidata += 1
                    elif results_predicates:  #passed the predicates strategy
                        clean_db_entities[ent] = db_entities[ent]
                    else:
                        filtered_predicates += 1
                elif results_look_up:  #passed the look-up strategy
                    clean_db_entities[ent] = db_entities[ent]
                else:
                    filtered_look_up += 1

            #OLD STRATEGY: too slow
            #if len(wikidata_classes) == 0:  ##No wikidata class, then look-up strategy
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityLookupStrategy(ent, cls_uri):
            #            clean_db_entities.add(ent)
            #else:
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
            #            clean_db_entities.add(ent)

            offset += 1

            #Limit of iterations
            if offset > 5:
                print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up,
                                          filtered_predicates, filtered_wikidata))
                return clean_db_entities

        print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up,
                                  filtered_predicates, filtered_wikidata))
        return clean_db_entities
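
#A minimal usage sketch, not part of the original pipeline. It assumes the
#project-local endpoint and look-up classes are importable and that the public
#DBpedia/Wikidata endpoints are reachable; the class URI and limit are
#illustrative placeholders.
def _demo_endpoint_sampling():
    endpoint = Endpoint()
    #Sample up to 100 entities typed with dbo:Scientist whose type survives the
    #look-up, predicate and wikidata filters; the result is assumed to be a
    #dict from entity URI to label, as returned by getEntitiesLabelsForType
    entities = endpoint.getEntitiesForDBPediaClass(
        "http://dbpedia.org/ontology/Scientist", limit=100)
    for uri, label in entities.items():
        print(uri, label)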
def extensionWithWikiRedirects(file_gt, folder_tables, file_out_gt,
                               file_out_redirects_gt, file_out_gt_target, max_rows):
    f_out = open(file_out_gt, "a+")
    f_out_redirects = open(file_out_redirects_gt, "a+")
    f_out_target = open(file_out_gt_target, "a+")

    n_rows = 0
    panda_errors = 0

    dbpedia_ep = DBpediaEndpoint()

    table_id = ""
    dict_entities = dict()

    #READ CURRENT CACHE
    #Init dict_entities with the current state of file_out_redirects_gt
    with open(file_out_redirects_gt) as csv_file_redirections:
        csv_reader = csv.reader(csv_file_redirections, delimiter=',',
                                quotechar='"', escapechar="\\")
        #"1","0","1","http://dbpedia.org/resource/Uno_Von_Troil http://dbpedia.org/resource/Uno_von_Troil"
        for row in csv_reader:
            entity_list = row[3].split(" ")
            for e in entity_list:
                if e not in dict_entities:
                    dict_entities[e] = set(entity_list)

    with open(file_gt) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")

        for row in csv_reader:
            #file, col, row, URI
            #note that in Oktie's GT it is given as file, row, col
            #1,1,0,http://dbpedia.org/resource/Uno_von_Troil
            if len(row) < 4:
                continue

            entity_uri = row[3].replace("\"", "%22")

            #To deal with cases from "http://sws.geonames.org/"
            same_as_resources = set()
            if not entity_uri.startswith("http://dbpedia.org/resource/"):
                same_as_resources = getFilteredResources(
                    dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia)

                if len(same_as_resources) == 0:
                    print("No dbpedia entity for: %s, %s, %s, %s"
                          % (row[0], row[1], row[2], entity_uri))
                else:
                    #We keep only one of the same_as dbpedia resources
                    for r in same_as_resources:
                        entity_uri = r
                    ##We will consider the other same-as resources later
                    same_as_resources.remove(entity_uri)

            entity_uri = row[3].replace("\"", "%22")

            #if int(row[0]) < 1000:  #Jiaoyan starts from table file 1,000
            #if int(row[0]) >= 1000:  #ernesto
            #if int(row[0]) >= 3:  #ernesto
            #if int(row[0]) < 587 or int(row[0]) >= 1000:
            #    continue

            if not table_id == row[0]:
                #Change of table: we close and reopen the output files to keep a
                #better storage of intermediate points
                f_out.close()
                f_out_redirects.close()
                f_out_target.close()

                table_id = row[0]
                print(table_id)

                f_out = open(file_out_gt, "a+")
                f_out_redirects = open(file_out_redirects_gt, "a+")
                f_out_target = open(file_out_gt_target, "a+")

            col_id = row[2]  #Reversed according to the input
            row_id = row[1]

            csv_file_name = table_id + ".csv"

            try:
                #Try to open with pandas; if it fails, discard the file
                pd.read_csv(join(folder_tables, csv_file_name), sep=',',
                            quotechar='"', escapechar="\\")
            except Exception:
                panda_errors += 1
                continue

            entities = set()

            ##Keep an index to avoid unnecessary queries
            if entity_uri in dict_entities:
                entities.update(dict_entities[entity_uri])
            else:
                new_entities = set()

                ##Consider redirects:
                entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                entities.update(same_as_resources)  #in case there was more than one

                #two iterations
                for e in entities:
                    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))

                entities.add(entity_uri)
                entities.update(new_entities)

                dict_entities[entity_uri] = set()
                dict_entities[entity_uri].update(entities)

            #Output: table id, column id, row id and DBpedia entity
            #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
            line_str = '\"%s\",\"%s\",\"%s\"' % (table_id, col_id, row_id)

            f_out_target.write(line_str + '\n')
            f_out.write(line_str + ',\"%s\"\n' % entity_uri)

            line_str += ',\"' + " ".join(entities) + '\"'
            f_out_redirects.write(line_str + '\n')

            ##Number of rows
            if n_rows > max_rows:  #200000
                break
            n_rows += 1

    print("Panda errors: %d" % panda_errors)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
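
#A minimal invocation sketch, not part of the original pipeline. All file and
#folder paths below are hypothetical placeholders; the function appends to the
#three output files, so they act as a resumable cache across runs.
def _demo_extension_with_wiki_redirects():
    extensionWithWikiRedirects(
        file_gt="gt/cea_gt.csv",                            #input GT: table, row, col, URI
        folder_tables="tables/",                            #folder with <table_id>.csv files
        file_out_gt="out/cea_gt_extended.csv",              #GT with one dbpedia URI per cell
        file_out_redirects_gt="out/cea_gt_redirects.csv",   #GT extended with redirect entities
        file_out_gt_target="out/cea_targets.csv",           #target cells only
        max_rows=200000)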
class Lookup(object):
    '''
    This class aims at providing look-up access to the KG with minimal errors.
    It can also, optionally, combine several KGs instead of only one.
    '''

    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)

        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

    def getTypesForEntity(self, uri_entity, kg=KG.DBpedia):
        if kg == KG.DBpedia:
            types = set()
            types_redirects = set()

            #Original entity
            types.update(self.getTypesForDBPediaEntity(uri_entity))

            #Redirects, if any
            #See dbo:wikiPageRedirects -> similar to same_as inside dbpedia
            redirects = self.dbpedia_ep.getWikiPageRedirect(uri_entity)
            for uri_redirect in redirects:  #Typically only one
                types_redirects.update(self.getTypesForDBPediaEntity(uri_redirect))

            if len(types) == 0:
                #We use the types of the redirects
                types.update(types_redirects)
            else:
                #types of redirects can be dirty
                for t in types_redirects:
                    if self.__checkCompatibilityTypes(t, types):
                        types.add(t)

            #Commented because it was slow for a large dataset
            #if is_empty(types) or (len(types) == 1 and "Agent" in list(types)[0]):
            #    #Wikidata strategy to complement an empty endpoint and look-up, or only type "Agent"
            #    types.update(self.__getTypesWikidataStrategy(uri_entity))

            return types

        #TBC
        elif kg == KG.Wikidata:
            pass
        elif kg == KG.Google:
            pass

        return set()

    def __getTypesLookupStrategy(self, uri_entity):
        kg = KG.DBpedia

        label = uri_entity
        if uri_entity.startswith(URI_KG.dbpedia_uri_resource):
            label = uri_entity.replace(URI_KG.dbpedia_uri_resource, '')

        ##We call our method to get look-up types for the URI; only SPARQL endpoint
        ##types may contain errors. It also includes the wikidata strategy inside.
        entities = self.getKGEntities(label, 10, uri_entity)  #filter by uri_entity

        ##In case of no match in the look-up
        if is_empty(entities):
            types = set()

            #DBpedia endpoint strategy
            types_endpoint = getFilteredTypes(
                self.dbpedia_ep.getAllTypesForEntity(uri_entity), KG.DBpedia)

            #Predicates strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(uri_entity)

            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #types.update(types_domain_range)

                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            types.add(t)

                #If some endpoint types are compatible, we also trust the ones
                #coming from domains/ranges
                if len(types) > 0:
                    types.update(types_domain_range)

            #If still empty we use the endpoint types
            if len(types) == 0:
                types.update(types_endpoint)

            return types

        ##Should be only one element from the look-up
        for entity in entities:
            return entity.getTypes(kg)

    def __getTypesPredicateStrategy(self, uri_entity):
        '''
        Exploits the domain and range types of the predicates in triples
        with uri_entity as subject or object
        '''
        types = set()

        #Top-2
        types.update(
            getFilteredTypes(
                self.dbpedia_ep.getTopTypesUsingPredicatesForObject(uri_entity, 2),
                KG.DBpedia))

        #We use the top one here, as there are fewer properties associated to an entity,
        #and we only need one wrong type to be in the top-k.
        #Top-1
        #Error-prone, as many properties are not properly used.
        #Only used if the current range types are compatible.
        types_domain = getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForSubject(uri_entity, 1),
            KG.DBpedia)

        #if len(types) == 0:
        #    types.update(types_domain)
        #else:
        if len(types) > 0:
            for t in types_domain:
                if self.__checkCompatibilityTypes(t, types):
                    types.add(t)

        #TODO: Alternative strategies: use the intersection of types_range and
        #types_domain if non-empty. May increase recall (remove min input/output
        #edges in queries) and precision. If empty, then use as now.

        return types

    def __getTypesWikidataStrategy(self, uri_entity):
        print("\tUsing wikidata strategy for " + uri_entity)

        #Gets the equivalent wikidata entities
        same_entities = self.dbpedia_ep.getSameEntities(uri_entity)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity

        wk_ent_types = set()
        dp_types = set()
        dp_types_all = set()

        if len(wikidata_entities) == 0:
            return wk_ent_types

        for wk_ent in wikidata_entities:
            #Get types for the wikidata entities;
            #we consider all supertypes to extend compatibility
            wk_ent_types.update(self.wikidata_ep.getAllTypesForEntity(wk_ent))

        #Problematic concept: Wikimedia disambiguation page
        if URI_KG.wikimedia_disambiguation_concept in wk_ent_types:
            wk_ent_types.clear()

        for t in wk_ent_types:
            #Get the equivalent dbpedia types
            dp_types.update(
                getFilteredTypes(self.wikidata_ep.getEquivalentClasses(t), KG.DBpedia))

        #Get superclasses
        for t in dp_types:
            dp_types_all.update(self.dbpedia_ep.getAllSuperClasses(t))

        return getFilteredTypes(dp_types_all, KG.DBpedia)

    def getTypesForDBPediaEntity(self, uri_entity):
        #Types from the DBpedia endpoint may be dirty, so we use two strategies:
        #wikidata and look-up types.
        #TODO: check compatibility among strategies?
        #The look-up strategy also includes the wikidata strategy
        types = self.__getTypesLookupStrategy(uri_entity)
        return types
        ##Additional strategy: check the equivalent entity from wikidata, get its
        ##classes from wikidata, and get their equivalents in the dbpedia endpoint

    def getKGEntities(self, cell, limit=5, filter=''):
        '''
        Given the text of a cell, extracts entity objects. Note that an entity
        contains an id, a label, a description, a set of types from dbpedia,
        wikidata and schema.org, and the source (dbpedia, wikidata or google).
        '''
        #Strategy:
        #0. Incremental repair: start with something simple
        #1. Identify cases where the dbpedia look-up and endpoint do not agree
        #2. We have the dbpedia (may require a fix...) and schema.org taxonomies
        #   to identify conflicting branches
        #3. Use alignment to identify same entities and have more evidence about types
        #   3a. Lexical
        #   3b. Based on embeddings
        #4. If no mapping among classes is provided (there seem to be available
        #   mappings), we may use alignment as well (lexical and embedding)
        query = cell

        #Get KG entities from DBpedia, Wikidata and Google KG.
        #One could also return the most accurate 5 types combining the 3 KGs
        #(limit 20 of each of them and then retrieve the top-k).
        dbpedia = DBpediaLookup()
        dbpedia_entities = dbpedia.getKGEntities(query, limit, filter)

        #We complement with types from the endpoint and check whether they are
        #correct/compatible
        for entity in dbpedia_entities:
            self.__analyseEntityTypes(entity)

        return dbpedia_entities

        #Next steps: find equivalent entities from wikidata (using both the
        #wikidata and dbpedia endpoints), then their types, and then try to find
        #conflicting types (it could even be by voting)
        '''
        kg = GoogleKGLookup()
        wikidata = WikidataAPI()
        '''

    def __analyseEntityTypes(self, entity):
        #Filter by type?
        types_endpoint = getFilteredTypes(
            self.dbpedia_ep.getAllTypesForEntity(entity.getId()), KG.DBpedia)

        if len(entity.getTypes()) > 0:
            for t in types_endpoint:
                if t not in entity.getTypes():
                    ##Evaluate compatibility with the look-up types (same branch).
                    ##We use DBpedia for now.
                    if self.__checkCompatibilityTypes(t, entity.getTypes(KG.DBpedia)):
                        entity.addType(t)
        else:
            #No types from the look-up.
            #The wikidata strategy is not great for compatibility, as we would
            #need to better explore the returned types:
            #types_wk_strategy = self.__getTypesWikidataStrategy(entity.getId())

            #We use the range-domain-predicate strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(entity.getId())

            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #entity.addTypes(types_domain_range)

                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            entity.addType(t)

            #DANGEROUS, as domain and range types contain many errors:
            #if no compatible type, just use the ones coming from domains/ranges
            #if len(entity.getTypes()) > 0:
            #    entity.addTypes(types_domain_range)

            #If still empty we add the endpoint types
            if len(entity.getTypes()) == 0:
                entity.addTypes(types_endpoint)

            ##Last resort: if we have gathered some types, merge in the
            ##(possibly noisy) domain/range types as well
            if len(entity.getTypes()) > 0:
                entity.addTypes(types_domain_range)

            #We could further complement with the wikidata strategy
            #entity.addTypes(types_wk_strategy)

    '''
    We check if the source type (endpoint) is among the descendants or ancestors
    of at least one of the target types (look-up)
    '''
    def __checkCompatibilityTypes(self, cls_source_uri, target_types):
        for cls_target_uri in target_types:
            if self.__isCompatibleType(cls_source_uri, cls_target_uri):
                return True
        return False

    '''
    We check if the source type is among the descendants or ancestors of the target type
    '''
    def __isCompatibleType(self, cls_source_uri, cls_target_uri):
        cls_source = self.dbpedia_onto.getClassByURI(cls_source_uri)
        cls_target = self.dbpedia_onto.getClassByURI(cls_target_uri)

        ##TODO: we rely on DBpedia only for now
        if cls_source is None or cls_target is None:
            return False

        ancestors = self.dbpedia_onto.getAncestorsURIs(cls_target)
        descendants = self.dbpedia_onto.getDescendantURIs(cls_target)

        if cls_source_uri in ancestors or cls_source_uri in descendants:
            return True
        return False
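
#A minimal usage sketch, not part of the original pipeline. It assumes the
#DBpedia look-up service and SPARQL endpoint are reachable; the entity URI and
#cell text are illustrative placeholders.
def _demo_lookup_types():
    lookup = Lookup()

    #Clean DBpedia types for a given entity URI
    types = lookup.getTypesForEntity("http://dbpedia.org/resource/Oslo", KG.DBpedia)
    print(types)

    #Entity candidates (with consolidated types) for a raw cell value
    for entity in lookup.getKGEntities("Oslo", limit=5):
        print(entity.getId(), entity.getTypes(KG.DBpedia))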