    def __init__(self):
        '''
        Constructor
        '''
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
        self.lookup = Lookup()
def apply_endpoint(entity):
    """Find the types of a KG entity via the DBpedia endpoint, falling back to the
    types already attached to the entity object when the endpoint returns none."""
    ep = DBpediaEndpoint()
    ent2 = entity.getIdstr()  # query using the entity id/URI string
    types = ep.getTypesForEntity(ent2)  # limit to 5
    if len(types) == 0:
        # back up: use the types stored in the entity object (from the ontology)
        types = entity.getTypes()
    return types
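# Usage sketch (editorial addition, not part of the original module): drives
# apply_endpoint from a look-up result. The module path kg.lookup and the exact
# shape of the returned entity objects are assumptions based on the rest of this
# repository (they expose getIdstr()/getTypes()).
if __name__ == '__main__':
    from kg.lookup import DBpediaLookup  # assumed module path
    candidates = DBpediaLookup().getKGEntities("Ana Popović", 1, '')
    for cand in candidates:
        print(apply_endpoint(cand))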
    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()
        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()
    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
def apply_endpoint_list(entity_list):
    """Find the types for each entity in a list, keeping exactly one entry per input entity."""
    types_list = []
    ep = DBpediaEndpoint()
    for entity in entity_list:
        # entity is either the placeholder ['N/A'] or an object such as:
        # < id: http://dbpedia.org/resource/Ana_Popović, label: Ana Popović,
        #   description: None, types: set(), source: DBpedia >
        if entity != ['N/A']:
            ent2 = entity.getIdstr()
            types = ep.getTypesForEntity(ent2)  # limit to 5
            if len(types) == 0:
                # back up: use the types stored in the entity object (from the ontology)
                types = entity.getTypes()
        else:
            types = []
        types_list.append(types)  # append once, so output stays aligned with entity_list
    return types_list
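# Usage sketch (editorial addition): batch counterpart of the example above.
# Assumes the look-up returns a plain list of entity objects; unmatched cells
# are passed through as the ['N/A'] placeholder used upstream.
if __name__ == '__main__':
    from kg.lookup import DBpediaLookup  # assumed module path
    candidates = DBpediaLookup().getKGEntities("Ana Popović", 3, '')
    print(apply_endpoint_list(candidates + [['N/A']]))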
def tablesToChallengeFormat(folder_gt, folder_tables, file_out_gt, file_out_redirects_gt, file_out_gt_target, max_tables):
    #csv_file_names = [f for f in listdir(folder_gt) if isfile(join(folder_gt, f))]
    csv_file_names = []
    csv_file_names.append("2014_Tour_of_Turkey#6.csv")
    f_out = open(file_out_gt, "w+")
    f_out_redirects = open(file_out_redirects_gt, "w+")
    f_out_target = open(file_out_gt_target, "w+")
    n_tables = 0
    wrong_entities = 0
    panda_errors = 0
    dbpedia_ep = DBpediaEndpoint()
    for csv_file_name in csv_file_names:
        with open(join(folder_gt, csv_file_name)) as csv_file:
            try:
                #Try to open with pandas. If it fails, discard the file.
                pd.read_csv(join(folder_tables, csv_file_name), sep=',', quotechar='"', escapechar="\\")
            except:
                panda_errors += 1
                continue
            table_id = csv_file_name.replace(".csv", "")
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                #URI, text, row number
                #http://dbpedia.org/resource/Ahmet_%C3%96rken, Ahmet A\u0096rken, 1
                if len(row) < 3:
                    continue
                entity_uri = row[0]
                entity_mention = row[1]
                row_id = row[2]
                column_id = getColumnEntityMention(join(folder_tables, csv_file_name), entity_mention)
                entities = set()
                new_entities = set()
                ##Consider redirects:
                #entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                #entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                #for e in entities:
                #    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                #    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))
                entities.add(entity_uri)
                #entities.update(new_entities)
                if column_id >= 0:
                    #Output: table id, column id, row id and DBpedia entity
                    #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
                    line_str = '"%s","%s","%s"' % (table_id, column_id, row_id)
                    f_out_target.write(line_str + '\n')
                    f_out.write(line_str + ',"%s"\n' % entity_uri)
                    line_str += ',"' + " ".join(entities) + '"'
                    f_out_redirects.write(line_str + '\n')
                    #TODO
                    #Read with pandas: https://www.datacamp.com/community/tutorials/pandas-read-csv
                    #There are some errors with "\"
                    #writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
                    #writer.writerow()
                else:
                    wrong_entities += 1
        ##Small dataset with only approx. 20k tables out of >400k
        if n_tables > max_tables:  #200000
            break
        n_tables += 1
    print("Panda errors: %d" % panda_errors)
    print("Wrong entities: %d" % wrong_entities)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
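# Usage sketch (editorial addition): the folder and file names below are
# hypothetical placeholders. The function writes three aligned GT files
# (targets, entities, entities plus redirects) and stops after max_tables.
if __name__ == '__main__':
    tablesToChallengeFormat(
        "gt/",                       # folder with per-table GT csv files
        "tables/",                   # folder with the raw table csv files
        "out/cea_gt.csv",            # table id, column id, row id, entity
        "out/cea_gt_redirects.csv",  # same, plus space-separated redirect URIs
        "out/cea_targets.csv",       # target cells only
        max_tables=20000)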
def extensionWithWikiRedirects(file_gt, folder_tables, file_out_gt, file_out_redirects_gt, file_out_gt_target, max_rows):
    f_out = open(file_out_gt, "a+")
    f_out_redirects = open(file_out_redirects_gt, "a+")
    f_out_target = open(file_out_gt_target, "a+")
    n_rows = 0
    panda_errors = 0
    dbpedia_ep = DBpediaEndpoint()
    table_id = ""
    dict_entities = dict()
    #READ CURRENT CACHE:
    #init dict_entities with the current state of file_out_redirects_gt
    with open(file_out_redirects_gt) as csv_file_redirections:
        csv_reader = csv.reader(csv_file_redirections, delimiter=',', quotechar='"', escapechar="\\")
        #"1","0","1","http://dbpedia.org/resource/Uno_Von_Troil http://dbpedia.org/resource/Uno_von_Troil"
        for row in csv_reader:
            entity_list = row[3].split(" ")
            for e in entity_list:
                if e not in dict_entities:
                    dict_entities[e] = set(entity_list)
    with open(file_gt) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
        for row in csv_reader:
            #file, col, row, URI
            #note that in Oktie's GT it is given as file, row, col
            #1,1,0,http://dbpedia.org/resource/Uno_von_Troil
            if len(row) < 4:
                continue
            #entity_uri = row[3]
            entity_uri = row[3].replace("\"", "%22")
            #To handle cases from "http://sws.geonames.org/"
            same_as_resources = set()
            if not entity_uri.startswith("http://dbpedia.org/resource/"):
                same_as_resources = getFilteredResources(dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia)
                if len(same_as_resources) == 0:
                    print("No dbpedia entity for: %s, %s, %s, %s" % (row[0], row[1], row[2], entity_uri))
                else:
                    #We keep only one of the same_as dbpedia resources
                    for r in same_as_resources:
                        entity_uri = r
                        #break
                    ##We will consider the other same_as resources later
                    same_as_resources.remove(entity_uri)
            entity_uri = row[3].replace("\"", "%22")
            #if int(row[0]) < 1000:  #Jiaoyan starts from table file 1,000
            #if int(row[0]) >= 1000:  #ernesto
            #if int(row[0]) >= 3:  #ernesto
            #if int(row[0]) < 587 or int(row[0]) >= 1000:
            #    continue
            if not table_id == row[0]:
                #Change of table: close and reopen the output files to keep a better storage of intermediate points
                f_out.close()
                f_out_redirects.close()
                f_out_target.close()
                table_id = row[0]
                print(table_id)
                f_out = open(file_out_gt, "a+")
                f_out_redirects = open(file_out_redirects_gt, "a+")
                f_out_target = open(file_out_gt_target, "a+")
            col_id = row[2]  #Reversed with respect to the input
            row_id = row[1]
            csv_file_name = table_id + ".csv"
            try:
                #Try to open with pandas. If it fails, discard the file.
                pd.read_csv(join(folder_tables, csv_file_name), sep=',', quotechar='"', escapechar="\\")
            except:
                panda_errors += 1
                continue
            entities = set()
            ##Keep an index/cache to avoid unnecessary queries
            if entity_uri in dict_entities:
                entities.update(dict_entities[entity_uri])
            else:
                new_entities = set()
                ##Consider redirects:
                entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                entities.update(same_as_resources)  #in case there was more than one
                #two iterations
                for e in entities:
                    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))
                entities.add(entity_uri)
                entities.update(new_entities)
                dict_entities[entity_uri] = set()
                dict_entities[entity_uri].update(entities)
            #Output: table id, column id, row id and DBpedia entity
            #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
            line_str = '"%s","%s","%s"' % (table_id, col_id, row_id)
            f_out_target.write(line_str + '\n')
            f_out.write(line_str + ',"%s"\n' % entity_uri)
            line_str += ',"' + " ".join(entities) + '"'
            f_out_redirects.write(line_str + '\n')
            ##Number of rows
            if n_rows > max_rows:  #200000
                break
            n_rows += 1
    print("Panda errors: %d" % panda_errors)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
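# Usage sketch (editorial addition): file names are hypothetical placeholders.
# Note that the redirects file is read first as a cache, so it should already
# exist (it may be empty) before the function appends to it.
if __name__ == '__main__':
    extensionWithWikiRedirects(
        "gt/cea_gt_original.csv",    # GT as file, row, col, URI
        "tables/",                   # folder with the raw table csv files
        "out/cea_gt.csv",
        "out/cea_gt_redirects.csv",
        "out/cea_targets.csv",
        max_rows=200000)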
from ontology.onto_access import DBpediaOntology

# unpickle
# load = unpickle('training_vectors/final_original_training_vectors')  # when we have training data from the task to eval
load = unpickle('training_vectors/final_original_training_vectors_minus_tests')  # created own testing data by splitting train
df_positive = pd.DataFrame(load)
df_positive['polarity'] = "1"

'''create more positive samples
do this by:
- getting a different but similar entity using the SPARQL endpoint
'''
onto_access = DBpediaOntology()
onto_access.loadOntology(True)
ep = DBpediaEndpoint()


def get_alt_entities(entity_types):
    """For each list of types, take the finest type and fetch similar entities of that type."""
    lis = []
    for ls in entity_types:
        enty = get_last(ls)  # only keep the finest type
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100)  # slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
        except:
            pass
    return lis
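# Usage sketch (editorial addition): the type list below is illustrative and
# assumes get_last() returns the last (finest) class URI in each list, which is
# then used to sample up to 10 entities of that type from the endpoint.
if __name__ == '__main__':
    sample_types = [["http://dbpedia.org/ontology/Agent",
                     "http://dbpedia.org/ontology/Person"]]
    print(get_alt_entities(sample_types))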
class DBPediaExtractor(object):
    '''
    classdocs
    '''
    #Steps:
    # Read entities from GT, include redirections and add same_as axioms
    # Query for types
    # Query for 1 level of relationships, filter those not in the DBpedia ontology

    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()
        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()

    def isValidURI(self, str_uri):
        #use term._is_valid_unicode(str_uri)
        return term._is_valid_uri(str_uri) and self.isascii(str_uri)

    def isascii(self, string_original):
        string = self.strip_accents(string_original)  #to ignore accents
        return len(string_original) == len(string.encode())

    def strip_accents(self, text):
        text = unicodedata.normalize('NFD', text)\
            .encode('ascii', 'ignore')\
            .decode("utf-8")
        return str(text)

    #Precomputed
    def setUpLocalDBPediaGraph(self, file_ttl):
        self.localrdfgraph = Graph()
        self.localrdfgraph.parse(source=file_ttl, format='turtle')

    #To be computed
    def setUpRDFGraph(self):
        self.rdfgraph = Graph()
        #self.rdfgraph.bind(TabularToRDF.NAMESPACE_PREFIX, TabularToRDF.BASE_URI)
        self.rdfgraph.bind("foaf", "http://xmlns.com/foaf/0.1/")
        self.rdfgraph.bind("dbp", "http://dbpedia.org/property/")
        self.rdfgraph.bind("dbr", "http://dbpedia.org/resource/")
        self.rdfgraph.bind("dbo", "http://dbpedia.org/ontology/")
        self.rdfgraph.bind("owl", "http://www.w3.org/2002/07/owl#")

    def saveRDFGrah(self, rdf_file_ouput):
        #output with the same table name as ttl
        self.rdfgraph.serialize(str(rdf_file_ouput), format='turtle')  #xml, turtle
        wrong_file_name = ""
        try:
            #serialization may truncate the output file name at '?' or '#'
            if "?" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("?")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
            elif "#" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("#")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
        except:
            print(wrong_file_name, rdf_file_ouput)

    def getTargetEntitiesCEA(self, cea_file):
        #Target entities per table
        self.targetEntities = dict()
        with open(cea_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 4:
                    continue
                uris = row[3].split(" ")
                #entities per table
                key = row[0]  #+ "-" + row[1] + "-" + row[2]
                if key not in self.targetEntities:
                    self.targetEntities[key] = set()
                self.targetEntities[key].update(uris)

    def getEntitiesAndCreateInstancesTable(self, table_name):
        if table_name in self.targetEntities:
            for ent in self.targetEntities[table_name]:
                if self.isValidURI(ent):
                    self.entities.add(ent)
                    e_uri = URIRef(ent)
                    self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))

    def getEntitiesAndCreateInstances(self, cea_file):
        with open(cea_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 4:
                    continue
                uris = row[3].split(" ")
                for i in range(len(uris)):
                    self.entities.add(uris[i])
                i = 1
                while i < len(uris):
                    if self.isValidURI(uris[0]) and self.isValidURI(uris[i]):
                        e_uri1 = URIRef(uris[0])
                        e_uri2 = URIRef(uris[i])
                        self.rdfgraph.add((e_uri1, URIRef(OWL.SAMEAS), e_uri2))
                    else:
                        pass  #print("Not valid URI?", uris[0], uris[i])
                    i += 1
        for ent in self.entities:
            if self.isValidURI(ent):
                e_uri = URIRef(ent)
                self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
            else:
                pass  #print("Not valid URI:", ent)
        print("Number of entities: " + str(len(self.entities)))

    def getTargetColumns(self, cea_gt_file):
        self.target_column = dict()
        #An alternative is to automatically identify the left-most column with an entity mention.
        #In this particular case we know the target.
        with open(cea_gt_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if row[0] not in self.target_column or int(self.target_column[row[0]]) > int(row[1]):
                    self.target_column[row[0]] = row[1]

    def getEntitiesLookup(self, folder_cea_tables, cea_gt_file):
        #Look-up call for each cell in the target column.
        #Dictionary/cache to avoid repeated look-ups.
        visited_values = set()
        #Get target columns
        self.getTargetColumns(cea_gt_file)
        csv_file_names = [f for f in listdir(folder_cea_tables) if isfile(join(folder_cea_tables, f))]
        i = 0
        n = len(csv_file_names)
        t = [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 3000, 4000, 5000, 6000,
             7000, 8000, 9000, 10000, 11000, 12000, 13000]
        for csv_file in csv_file_names:
            i += 1
            if i in t:
                print("Getting look up entities for table %s of %s (%s)." % (i, n, datetime.datetime.now().time()))
            table_name = csv_file.replace(".csv", "")
            with open(join(folder_cea_tables, csv_file)) as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
                if table_name in self.target_column:
                    target_column = self.target_column[table_name]
                else:
                    #End
                    continue
                for row in csv_reader:
                    if len(row) <= int(target_column):
                        continue
                    if row[int(target_column)] not in visited_values:
                        ##To avoid repetition
                        visited_values.add(row[int(target_column)])
                        #Look up top-3
                        dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                        for ent in dbpedia_entities:
                            if self.isValidURI(ent.getId()):
                                self.entities.add(ent.getId())  ##Add to entities to extract neighbours
                                e_uri = URIRef(ent.getId())
                                self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
                                for cls_type in ent.getTypes(KG.DBpedia):
                                    self.rdfgraph.add((e_uri, RDF.type, URIRef(cls_type)))
                            else:
                                pass  #print("Not valid URI:", ent.getId())
        print("Number of extended entities with look-up: " + str(len(self.entities)))

    def getEntitiesLookupForTable(self, csv_file):
        #Look-up call for each cell in the target column.
        #Dictionary/cache to avoid repeated look-ups.
        visited_values = set()
        table_name = csv_file.replace(".csv", "")
        with open(join(folder_cea_tables, csv_file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            if table_name in self.target_column:
                target_column = self.target_column[table_name]
            else:
                #End
                return
            for row in csv_reader:
                if len(row) <= int(target_column):
                    return
                if row[int(target_column)] not in visited_values:
                    ##To avoid repetition
                    visited_values.add(row[int(target_column)])
                    #Look up top-3
                    dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                    for ent in dbpedia_entities:
                        if self.isValidURI(ent.getId()):
                            self.entities.add(ent.getId())  ##Add to entities to extract neighbours
                            e_uri = URIRef(ent.getId())
                            self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
                            for cls_type in ent.getTypes(KG.DBpedia):
                                self.rdfgraph.add((e_uri, RDF.type, URIRef(cls_type)))
                        else:
                            pass  #print("Not valid URI:", ent.getId())

    def getTypes(self, cta_file):
        with open(cta_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 3:
                    continue
                self.types.add(row[2])
        print("Number of types: " + str(len(self.types)))

    def getAssertionsForInstances(self, use_local_graph):
        #Avoid some properties (see entity.py).
        #Differentiate between object and data properties? Probably only necessary to decide between literal and URI objects.
        #Problem if the range of the property is not string; it will probably not match very well in any case.
        #Solution: remove domains and ranges in DBpedia ontology properties.
        #Filter by DBpedia resources and types, e.g. ignore URIs from wikidata and types from other taxonomies.
        n = 0
        l = [1, 5, 100, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 40000,
             50000, 60000, 70000, 80000, 90000, 100000, 200000, 300000, 400000,
             500000, 600000, 700000, 800000, 900000]
        for ent in self.entities:
            n += 1
            if not self.isValidURI(ent):
                continue  #print("Not valid URI:", ent)
            e_uri = URIRef(ent)
            #if n in l:
            #    print("Extracting neighbourhood for " + str(n) + ": " + ent + " (" + str(datetime.datetime.now().time()) + ")")
            if use_local_graph:
                dict_results = self.getLocalTriplesForSubject(ent, 100)
            else:
                dict_results = self.dbp_endpoint.getTriplesForSubject(ent, 100)
            for prop in dict_results:
                #if prop.startswith("http://dbpedia.org/"):
                #There are other interesting properties: rdfs:label, rdf:type, foaf:name, etc.
                if not self.isValidURI(prop):
                    continue  #print("Not valid URI:", prop)
                p_uri = URIRef(prop)
                isObjectProperty = self.identifyTypeOfProperty(prop)
                for obj in dict_results[prop]:
                    if obj.startswith("http") and isObjectProperty:
                        #Triple to resource
                        if obj.startswith("http://dbpedia.org/resource/"):
                            if self.isValidURI(obj):
                                o_uri = URIRef(obj)
                                self.rdfgraph.add((e_uri, p_uri, o_uri))
                            else:
                                pass  #print("Not valid URI:", obj)
                    elif not isObjectProperty:
                        #Triple to literal
                        self.rdfgraph.add((e_uri, p_uri, Literal(obj)))
                    else:
                        pass  #print("Wrong object '%s' for property '%s' (isObjectProperty=%s)" % (obj, prop, isObjectProperty))

    def getLocalTriplesForSubject(self, ent, limit):
        query_str = "SELECT DISTINCT ?p ?o WHERE { <" + ent + "> ?p ?o } limit " + str(limit)
        query_object = prepareQuery(query_str)  #, initNs={CMR_QA.NAMESPACE_PREFIX: CMR_QA.BASE_URI})
        results = self.localrdfgraph.query(query_object)
        assertions = dict()
        for result in results:
            prop = str(result[0])
            obj = str(result[1])
            if prop not in assertions:
                assertions[prop] = set()
            assertions[prop].add(obj)
        return assertions

    def identifyTypeOfProperty(self, prop):
        if prop in self.propertyType:
            if self.propertyType[prop]:
                self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
            else:
                self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
            return self.propertyType[prop]
        #Get statistics from the endpoint
        values = self.dbp_endpoint.getSomeValuesForPredicate(prop)
        n_values = len(values)
        n_uris = 0
        for v in values:
            if v.startswith("http"):
                n_uris += 1
        isObjectProperty = (n_uris > (n_values / 2))
        if isObjectProperty:
            self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
            self.propertyType[prop] = True
        else:
            self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
            self.propertyType[prop] = False
        return isObjectProperty

    def getInstancesForTypes(self):
        #Use basic method
        additional_entities = set()
        for cls in self.types:
            #print("Extracting members for: " + cls)
            additional_entities = self.dbp_endpoint.getEntitiesForType(cls, 0, 100)
            #We also want to extract their neighbourhood
            self.entities.update(additional_entities)
            for ent in additional_entities:
                if self.isValidURI(ent):
                    e_uri = URIRef(ent)
                    if cls.startswith("http://dbpedia.org/"):
                        self.rdfgraph.add((e_uri, RDF.type, URIRef(cls)))
                else:
                    pass  #print("Not valid URI:", ent)
        print("Number of extended entities with types: " + str(len(self.entities)))

    #Using a pre-extracted ttl/cache
    def localPropertyTypeExtractor(self):
        query_str = "SELECT DISTINCT ?p WHERE { ?s ?p ?o . } "
        query_object = prepareQuery(query_str)  #, initNs={CMR_QA.NAMESPACE_PREFIX: CMR_QA.BASE_URI})
        predicates = self.localrdfgraph.query(query_object)
        print("Using %s local predicates" % (len(predicates)))
        for p in predicates:
            prop = str(p[0])
            if not prop.startswith("http://dbpedia.org/"):
                #We ignore other kinds of properties and focus on DBpedia ones.
                #Others will be treated as annotations (rdfs:label, foaf:name) or specially (rdf:type).
                continue
            query_str = "SELECT ?value WHERE { ?s <" + prop + "> ?value . } limit 100"
            query_object = prepareQuery(query_str)
            values = self.localrdfgraph.query(query_object)
            n_values = len(values)
            n_uris = 0
            for v in values:
                if str(v[0]).startswith("http"):
                    n_uris += 1
            if n_values == 1:
                isObjectProperty = (n_uris == n_values)
            else:
                isObjectProperty = (n_uris > (n_values / 2))
            if isObjectProperty:
                #self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
                self.propertyType[prop] = True
            else:
                #self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
                self.propertyType[prop] = False
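# Usage sketch (editorial addition): a minimal driver for DBPediaExtractor.
# The CEA/CTA file paths are placeholders in the challenge format produced by
# the GT-conversion functions above.
if __name__ == '__main__':
    extractor = DBPediaExtractor()
    extractor.setUpRDFGraph()
    extractor.getEntitiesAndCreateInstances("out/cea_gt_redirects.csv")
    extractor.getTypes("out/cta_gt.csv")
    extractor.getInstancesForTypes()
    extractor.getAssertionsForInstances(use_local_graph=False)
    extractor.saveRDFGrah("out/dbpedia_fragment.ttl")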
    def __init__(self):
        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()
class JSONUtilities(object):

    def __init__(self):
        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()

    def validateEntityToClasses(self, path, file_in, file_out):
        with open(path + file_in) as f:
            data = json.load(f)
        data_new = dict()
        no_types = 0
        empty_ref = 0
        missing_cases = 0
        wrong_cases = 0
        empty_cases = 0
        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'w')
        tmp_f2 = open(path + file_out.replace('.json', '') + '_issues.csv', 'w')
        for entity in data:
            types_tocheck = set(data[entity])
            types_ref = self.smartlookup.getTypesForEntity(entity, KG.DBpedia)
            if is_empty(types_ref):
                if is_empty(types_tocheck):
                    #Some issues with disambiguation pages
                    no_types += 1
                else:
                    ##Solved! Some URIs are redirects...
                    empty_ref += 1
                    #We use the original types
                    data_new[entity] = data[entity]
                    tmp_f.write('%s,%s\n' % (entity, ",".join(types_tocheck)))
                continue
            #New set of corrected types
            data_new[entity] = list(types_ref)  #json expects a list
            tmp_f.write('%s,%s\n' % (entity, ",".join(types_ref)))
            #Statistics
            missing = types_ref.difference(types_tocheck)
            wrong = types_tocheck.difference(types_ref)
            if len(missing) > 0 or len(wrong) > 0:
                print("Issues with: " + entity)
                if len(missing) > 0:
                    print("\tMissing types: ", missing)
                    missing_cases += 1
                    if len(types_tocheck) == 0:
                        empty_cases += 1
                if len(wrong) > 0:
                    print("\tWrong types", wrong)
                    wrong_cases += 1
                tmp_f2.write("Entity,%s.\nMissing,%s\nWrong:%s\n" % (entity, ",".join(missing), ",".join(wrong)))
        #We save the new types
        self.dumpJsonFile(data_new, path + file_out)
        tmp_f2.write("Cases with wrong types: %s\n" % (str(wrong_cases)))
        tmp_f2.write("Cases with missing types: %s\n" % (str(missing_cases)))
        tmp_f2.write("Cases with empty types: %s\n" % (str(empty_cases)))
        tmp_f2.write("Cases with empty new types: %s\n" % (str(empty_ref)))
        tmp_f2.write("Cases with no types at all: %s\n" % (str(no_types)))
        tmp_f.close()
        tmp_f2.close()
        print("Cases with wrong types: " + str(wrong_cases))
        print("Cases with missing types: " + str(missing_cases))
        print("Cases with empty types: " + str(empty_cases))
        print("Cases with empty new types: " + str(empty_ref))
        print("Cases with no types at all: " + str(no_types))

    def createTriplesForClasses(self, path, class_file_r, class_file_s, file_out):
        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'a+')
        #Read candidate classes
        classes = set()
        e_classes = json.load(open(path + class_file_r))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)
        e_classes = json.load(open(path + class_file_s))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)
        #Play with different numbers depending on the cost:
        #for each class extract 50-100-200 entities.
        #Tests:
        #entities = self.smartendpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/BaseballTeam", 100)
        #for e, label in entities.items():
        #    print(e, list(label)[0])
        #Dict to convert to json; reuse the cache file if it exists
        cache_file = path + file_out
        class_triples = json.load(open(cache_file)) if os.path.exists(cache_file) else dict()
        print("Class triples initial size", str(len(class_triples)))
        for c_uri in classes:
            print(c_uri)
            if c_uri in class_triples:  #already analysed/cached
                print("\tAlready cached!")
                continue
            i = time.time()
            tmp_f.write('%s\n' % (c_uri))
            #Dictionary entity -> label
            entities = self.smartendpoint.getEntitiesForDBPediaClass(c_uri, 500)
            #For each of the above entities (?o) extract triples ?s ?p ?o, together with the label of ?o.
            #Extract 10-50 triples per entity; predicates we aim to discard are filtered later.
            triples = list()
            for object_uri in entities:
                #label
                label = list(entities[object_uri])[0]
                #Triples for the object entity
                subjects_predicates = self.dbpedia_ep.getTriplesForObject(object_uri, 50)
                for subject in subjects_predicates:
                    for predicate in subjects_predicates[subject]:
                        triple = [subject, predicate, object_uri, label]
                        triples.append(triple)
                        tmp_f.write('%s\n' % (",".join(triple)))
            #end for entities
            print("\tTriples", len(triples))
            class_triples[c_uri] = triples
            #We dump after each class, so that if it breaks we can continue from there
            self.dumpJsonFile(class_triples, path + file_out)
            e = time.time()
            print("Time:", e - i)
        #end for classes
        #We save the new triples
        tmp_f.close()
        print(len(class_triples), path + file_out)
        self.dumpJsonFile(class_triples, path + file_out)

    #TBC
    def validateClassTriples(self, file):
        with open(file) as f:
            data = json.load(f)
        predicate_count = dict()
        n_triples = 0
        empty_entries = 0
        for entity in data:
            subjects = set()
            predicates = set()
            objects = set()
            print(entity, len(data[entity]))
            if len(data[entity]) == 0:
                empty_entries += 1
            n_triples += len(data[entity])
            n_triples_class = 0
            for triple in data[entity]:
                if triple[1] in URI_KG.avoid_predicates:
                    continue
                if not triple[1].startswith(URI_KG.dbpedia_uri) and not triple[1].startswith(URI_KG.dbpedia_uri_property):
                    continue
                n_triples_class += 1
                subjects.add(triple[0])
                if triple[1] not in predicate_count:
                    predicate_count[triple[1]] = 0
                predicate_count[triple[1]] += 1
                predicates.add(triple[1])
                objects.add(triple[2])
            print("\t Different triples, subjects, predicates, objects: %s, %s, %s, %s"
                  % (str(n_triples_class), str(len(subjects)), str(len(predicates)), str(len(objects))))
        print("Empty entries", empty_entries)
        predicate_count_sorted = OrderedDict(sorted(predicate_count.items(), key=lambda x: x[1]))
        #for k, v in predicate_count_sorted.items():
        #    print(k, v)
        print(len(data), n_triples)

    def dumpJsonFile(self, data_json, file):
        with open(file, "w") as write_file:
            json.dump(data_json, write_file)
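# Usage sketch (editorial addition): directory and file names are placeholders;
# the input JSON files are expected to map entity URIs to lists of candidate
# class URIs, as assumed by the methods above.
if __name__ == '__main__':
    ju = JSONUtilities()
    ju.validateEntityToClasses("data/", "entity_classes.json", "entity_classes_clean.json")
    ju.createTriplesForClasses("data/", "classes_r.json", "classes_s.json", "class_triples.json")
    ju.validateClassTriples("data/class_triples.json")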
class Endpoint(object):
    '''
    This class aims at identifying errors in the DBpedia ENDPOINT when retrieving
    samples for training: positive/negative samples for candidate classes.
    '''

    '''
    def queryTripleByClass(top_k, c):
        triples = list()
        s = sparql.Service(SPARQL_END_POINT, "utf-8", "GET")
        statement = 'select distinct str(?s), str(?p), str(?o), str(?l) where {?s ?p ?o. ?o rdf:type <%s>. ' \
                    '?o rdfs:label ?l. FILTER( langMatches(lang(?l), "en"))} ORDER BY RAND() limit %d' % (c, top_k)
        result = s.query(statement)
        for row in result.fetchone():
            triples.append([row[0], row[1], row[2], row[3]])
        return triples
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
        self.lookup = Lookup()

    def __analyseEntityPredicateStrategy(self, ent, cls_uri):
        '''
        Analyses the correctness of cls_uri as a type of ent using the predicate-types strategy.
        '''
        #Note: the predicate strategy lives in a private (name-mangled) method of Lookup,
        #so it has to be accessed through its mangled name from this class.
        predicate_types = self.lookup._Lookup__getTypesPredicateStrategy(ent)
        if len(predicate_types) == 0:
            return None
        if cls_uri in predicate_types:
            return True
        return False

    def __analyseEntityLooukStrategy(self, ent, cls_uri):
        '''
        Analyses the correctness of cls_uri as a type of ent using look-up types.
        '''
        #Note that if there are no look-up types, we return None and the SPARQL types are kept as they are.
        ##If there are look-up types, the SPARQL types must be compatible with them.
        clean_lookup_types = self.lookup.getTypesForEntity(ent, KG.DBpedia)
        if len(clean_lookup_types) == 0:
            return None
        if cls_uri in clean_lookup_types:
            return True
        return False

    def __analyseEntityWikidataStrategy(self, ent, cls_uri, wikidata_classes):
        '''
        Analyses the correctness of cls_uri as a type of ent using Wikidata.
        '''
        #b. Get the equivalent wikidata entity (if any)
        same_entities = self.dbpedia_ep.getSameEntities(ent)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity
        ##If there are no equivalent entities we fall back to the look-up strategy
        if len(wikidata_entities) == 0:
            return self.__analyseEntityLooukStrategy(ent, cls_uri)
        for wk_ent in wikidata_entities:
            #c. Check if the wikidata class from (a) is within the types of the equivalent entity from (b)
            wk_ent_types = self.wikidata_ep.getAllTypesForEntity(wk_ent)  #we consider supertypes to extend compatibility
            time.sleep(0.01)  #to avoid the endpoint's call limit
            intersect = wk_ent_types.intersection(wikidata_classes)
            if len(intersect) > 0:
                return True
        return False

    def getEntitiesForDBPediaClass(self, cls_uri, limit=1000):
        '''
        It currently expects a class URI from DBpedia.
        '''
        ##We query a subset of entities for sampling
        clean_db_entities = dict()
        offset = 0
        filtered_look_up = 0
        filtered_wikidata = 0
        filtered_predicates = 0
        #To guarantee the required number of (clean) entities for the class
        while len(clean_db_entities) < limit:
            #We extract more than required as many of them will be noisy
            #db_entities = self.dbpedia_ep.getEntitiesForType(cls_uri, offset*limit*5, limit*5)
            db_entities = self.dbpedia_ep.getEntitiesLabelsForType(cls_uri, offset * limit * 5, limit * 5)
            #For the wikidata strategy:
            #a. Get the equivalent class from wikidata (if any)
            db_eq_cls = self.dbpedia_ep.getEquivalentClasses(cls_uri)
            wikidata_classes = getFilteredTypes(db_eq_cls, KG.Wikidata)  ##typically one class
            for ent in db_entities:
                if len(clean_db_entities) >= limit:
                    print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
                    return clean_db_entities
                results_look_up = self.__analyseEntityLooukStrategy(ent, cls_uri)
                if results_look_up is None:
                    results_predicates = self.__analyseEntityPredicateStrategy(ent, cls_uri)
                    if results_predicates is None:
                        #wikidata strategy (it is very costly)
                        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
                            clean_db_entities[ent] = db_entities[ent]
                        else:
                            #print("Entity filtered by wikidata", ent)
                            filtered_wikidata += 1
                    elif results_predicates:  #passed the predicates strategy
                        clean_db_entities[ent] = db_entities[ent]
                    else:
                        #print("Entity filtered by predicates", ent)
                        filtered_predicates += 1
                elif results_look_up:  #passed the look-up strategy
                    clean_db_entities[ent] = db_entities[ent]
                else:
                    #print("Entity filtered by look-up", ent)
                    filtered_look_up += 1
            #OLD STRATEGY: too slow
            #if len(wikidata_classes) == 0:  ## No wikidata class, then look-up strategy
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityLooukStrategy(ent, cls_uri):
            #            clean_db_entities.add(ent)
            #else:
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
            #            clean_db_entities.add(ent)
            offset += 1
            #Limit of iterations
            if offset > 5:
                print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
                return clean_db_entities
        print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
        return clean_db_entities
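# Usage sketch (editorial addition): retrieves up to 100 "clean" dbo:Country
# entities; entities whose endpoint type cannot be confirmed by the look-up,
# predicate or wikidata strategies are filtered out. The returned dictionary
# maps entity URIs to their labels.
if __name__ == '__main__':
    endpoint = Endpoint()
    countries = endpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/Country", 100)
    for uri, labels in countries.items():
        print(uri, list(labels)[0])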
class Lookup(object):
    '''
    This class aims at providing look-up access to the KG with minimal errors.
    It can also optionally combine evidence from several KGs, not only one.
    '''

    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

    def getTypesForEntity(self, uri_entity, kg=KG.DBpedia):
        if kg == KG.DBpedia:
            types = set()
            types_redirects = set()
            #Original entity
            types.update(self.getTypesForDBPediaEntity(uri_entity))
            #Redirects, if any.
            #See dbo:wikiPageRedirects -> similar to same_as inside dbpedia
            redirects = self.dbpedia_ep.getWikiPageRedirect(uri_entity)
            for uri_redirect in redirects:  #Typically only one
                types_redirects.update(self.getTypesForDBPediaEntity(uri_redirect))
            if len(types) == 0:
                #We use the types of the redirects
                types.update(types_redirects)
            else:
                #types of redirects can be dirty
                for t in types_redirects:
                    if self.__checkCompatibilityTypes(t, types):
                        types.add(t)
            #Commented because it was slow for a large dataset:
            #if is_empty(types) or (len(types) == 1 and "Agent" in list(types)[0]):
            #    #Wikidata strategy to complement if the endpoint and look-up are empty, or only the type "Agent" is returned
            #    types.update(self.__getTypesWikidataStrategy(uri_entity))
            return types
        #TBC
        elif kg == KG.Wikidata:
            pass
        elif kg == KG.Google:
            pass
        return set()

    def __getTypesLookupStrategy(self, uri_entity):
        kg = KG.DBpedia
        label = uri_entity
        if uri_entity.startswith(URI_KG.dbpedia_uri_resource):
            label = uri_entity.replace(URI_KG.dbpedia_uri_resource, '')
        ##We call our method to get look-up types for the URI; only SPARQL endpoint types may contain errors.
        ##It also includes the wikidata strategy inside.
        entities = self.getKGEntities(label, 10, uri_entity)  #filter by uri_entity
        ##In case of no match in the look-up
        if is_empty(entities):
            types = set()
            #DBpedia endpoint strategy
            types_endpoint = getFilteredTypes(self.dbpedia_ep.getAllTypesForEntity(uri_entity), KG.DBpedia)
            #Predicates strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(uri_entity)
            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #types.update(types_domain_range)
                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            types.add(t)
                #If there are compatible types we also keep the ones coming from domains/ranges
                if len(types) > 0:
                    types.update(types_domain_range)
            #If still empty we use the endpoint types
            if len(types) == 0:
                types.update(types_endpoint)
            return types
        else:
            ##Should be only one element from the look-up
            for entity in entities:
                return entity.getTypes(kg)

    def __getTypesPredicateStrategy(self, uri_entity):
        '''
        Exploits the domain and range types of the predicates in triples with uri_entity as subject or object.
        '''
        types = set()
        #Top-2 range types (uri_entity as object)
        types.update(getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForObject(uri_entity, 2), KG.DBpedia))
        #We use only the top-1 here, as there are fewer properties associated to an entity as subject,
        #and we only need one wrong type to end up in the top-k.
        #Error-prone, as many properties are not properly used.
        #Only used if compatible with the current range types.
        types_domain = getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForSubject(uri_entity, 1), KG.DBpedia)
        #if len(types) == 0:
        #    types.update(types_domain)
        #else:
        if len(types) > 0:
            for t in types_domain:
                if self.__checkCompatibilityTypes(t, types):
                    types.add(t)
        #TODO: Alternative strategy: use the intersection of range and domain types when non-empty.
        #May increase recall (remove min input/output edges in queries) and precision.
        #If empty, then use as now.
        return types

    def __getTypesWikidataStrategy(self, uri_entity):
        print("\tUsing wikidata strategy for " + uri_entity)
        #Gets equivalent wikidata entities
        same_entities = self.dbpedia_ep.getSameEntities(uri_entity)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity
        wk_ent_types = set()
        dp_types = set()
        dp_types_all = set()
        if len(wikidata_entities) == 0:
            return wk_ent_types
        for wk_ent in wikidata_entities:
            #Get types for the wikidata entities
            wk_ent_types.update(self.wikidata_ep.getAllTypesForEntity(wk_ent))  #we consider all supertypes to extend compatibility
        #Problematic concept: Wikimedia disambiguation page
        if URI_KG.wikimedia_disambiguation_concept in wk_ent_types:
            wk_ent_types.clear()
        for t in wk_ent_types:
            #Get equivalent dbpedia types
            dp_types.update(getFilteredTypes(self.wikidata_ep.getEquivalentClasses(t), KG.DBpedia))
        #Get superclasses
        for t in dp_types:
            dp_types_all.update(self.dbpedia_ep.getAllSuperClasses(t))
        return getFilteredTypes(dp_types_all, KG.DBpedia)

    def getTypesForDBPediaEntity(self, uri_entity):
        #Types from the DBpedia endpoint may be dirty, so we use 2 strategies: wikidata and look-up types.
        #TODO: check compatibility among strategies?
        #The look-up strategy also includes the wikidata strategy.
        types = self.__getTypesLookupStrategy(uri_entity)
        return types
        ##Additional strategy:
        #check the equivalent entity from wikidata, get classes from wikidata, get the equivalent ones in the dbpedia endpoint.

    def getKGEntities(self, cell, limit=5, filter=''):
        '''
        Given the text of a cell, extracts entity objects.
        Note that an entity contains an id, a label, a description, a set of types
        from dbpedia, wikidata and schema.org, and the source (dbpedia, wikidata or google).
        '''
        #Strategy:
        #0. Incremental repair: start with something simple
        #1. Identify cases where dbpedia look-up and endpoint do not agree
        #2. We have the dbpedia (may require a fix...) and schema.org taxonomies to identify conflicting branches
        #3. Use alignment to identify same entities and have more evidence about types
        #   3a. Lexical
        #   3b. Based on embeddings
        #4. If no mapping among classes is provided (there seem to be available mappings), we may use alignment as well (lexical and embedding)
        query = cell
        #Get KG entities from DBpedia, Wikidata and the Google KG.
        #One could also return the most accurate 5 types combining the 3 KGs
        #(limit 20 of each of them and then retrieve the top-k).
        dbpedia = DBpediaLookup()
        dbpedia_entities = dbpedia.getKGEntities(query, limit, filter)
        #We complement with types from the endpoint and check if they are correct/compatible
        for entity in dbpedia_entities:
            self.__analyseEntityTypes(entity)
        return dbpedia_entities
        #Next steps:
        #find equivalent entities from wikidata (using both wikidata and dbpedia endpoints),
        #then their types, and then try to find conflicting types (it could even be by voting).
        '''
        kg = GoogleKGLookup()
        wikidata = WikidataAPI()
        '''

    def __analyseEntityTypes(self, entity):
        #Filter by type?
        types_endpoint = getFilteredTypes(
            self.dbpedia_ep.getAllTypesForEntity(entity.getId()), KG.DBpedia)
        if len(entity.getTypes()) > 0:
            for t in types_endpoint:
                if t not in entity.getTypes():
                    ##Evaluate compatibility with the look-up types (same branch).
                    ##We use DBpedia for now.
                    if self.__checkCompatibilityTypes(t, entity.getTypes(KG.DBpedia)):
                        entity.addType(t)
        else:
            #No types from the look-up.
            #The wikidata strategy is not great for compatibility, as the returned types would need further exploration:
            #types_wk_strategy = self.__getTypesWikidataStrategy(entity.getId())
            #We use the range-domain-predicate strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(entity.getId())
            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #entity.addTypes(types_domain_range)
                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            entity.addType(t)
            #DANGEROUS, as domain and range types contain many errors:
            #if no compatible type, we could just use the ones coming from domains/ranges.
            #if len(entity.getTypes()) > 0:
            #    entity.addTypes(types_domain_range)
            #If still empty we use the endpoint types
            if len(entity.getTypes()) == 0:
                entity.addTypes(types_endpoint)
            ##Last resort: if there are compatible types, also add the domain/range ones
            if len(entity.getTypes()) > 0:
                entity.addTypes(types_domain_range)
            #We complement with the wikidata strategy
            #entity.addTypes(types_wk_strategy)

    def __checkCompatibilityTypes(self, cls_source_uri, target_types):
        '''
        We check if the source type (endpoint) is among the descendants or ancestors
        of at least one of the target types (look-up).
        '''
        for cls_target_uri in target_types:
            if self.__isCompatibleType(cls_source_uri, cls_target_uri):
                return True
        return False

    def __isCompatibleType(self, cls_source_uri, cls_target_uri):
        '''
        We check if the source type is among the descendants or ancestors of the target type.
        '''
        cls_source = self.dbpedia_onto.getClassByURI(cls_source_uri)
        cls_target = self.dbpedia_onto.getClassByURI(cls_target_uri)
        ##TODO We rely on DBpedia only for now
        if cls_source is None or cls_target is None:
            return False
        ancestors = self.dbpedia_onto.getAncestorsURIs(cls_target)
        descendants = self.dbpedia_onto.getDescendantURIs(cls_target)
        if cls_source_uri in ancestors or cls_source_uri in descendants:
            return True
        return False
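# Usage sketch (editorial addition): combines look-up and endpoint evidence to
# return entities with (hopefully) clean DBpedia types; the query string and
# entity URI are illustrative.
if __name__ == '__main__':
    lookup = Lookup()
    for entity in lookup.getKGEntities("Chicago Bulls", limit=3):
        print(entity.getId(), entity.getTypes(KG.DBpedia))
    print(lookup.getTypesForEntity("http://dbpedia.org/resource/Chicago_Bulls", KG.DBpedia))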
'''
author: Eleanor Bill @eljne
create vectors for additional training data - +ve - CONTINUED
takes about four to five hours w/ 10 samples per question - 180,000
'''
from kg.EB_classes import pickl, unpickle, nouns_list, noun_phrases_list
import pandas as pd
from kg.endpoints import DBpediaEndpoint

pos = unpickle('training_vectors/11_train_new_positive_samples')
new_positive_samples = pd.DataFrame(pos)
print('unpickled')
ep = DBpediaEndpoint()
# print(new_positive_samples.head)


def get_nouns(entity):
    """Collect nouns from the English labels of an entity."""
    labels = ep.getEnglishLabelsForEntity(entity)
    nouns = nouns_list(labels)
    print('.')
    return nouns


def get_nps(entity):
    """Collect noun phrases from the English labels of an entity."""
    labels = ep.getEnglishLabelsForEntity(entity)
    nps = noun_phrases_list(labels)
    print('..')
    return nps


def apply_endpoint_alt(entity):
    # find types directly from the entity URI
    types = ep.getTypesForEntity(entity)  # limit to 5
    return types
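# Usage sketch (editorial addition): the URI is illustrative;
# getEnglishLabelsForEntity and getTypesForEntity are the endpoint calls
# already used by the helpers above.
if __name__ == '__main__':
    uri = "http://dbpedia.org/resource/Ana_Popović"
    print(get_nouns(uri))
    print(apply_endpoint_alt(uri))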