    def __init__(self):
        '''
        Constructor
        '''
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
        self.lookup = Lookup()
def apply_endpoint(entity):
    """Find the types of a KG entity via the DBpedia endpoint, falling back to the
    types already attached to the entity object when the endpoint returns none."""
    ep = DBpediaEndpoint()
    ent2 = entity.getIdstr()  # query using the entity id/URI string
    types = ep.getTypesForEntity(ent2)  # limit to 5
    if len(types) == 0:
        # back up: use the types stored in the entity object (from the ontology)
        types = entity.getTypes()
    return types
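# Usage sketch (editorial addition, not part of the original module): drives
# apply_endpoint from a look-up result. The module path kg.lookup and the exact
# shape of the returned entity objects are assumptions based on the rest of this
# repository (they expose getIdstr()/getTypes()).
if __name__ == '__main__':
    from kg.lookup import DBpediaLookup  # assumed module path
    candidates = DBpediaLookup().getKGEntities("Ana Popović", 1, '')
    for cand in candidates:
        print(apply_endpoint(cand))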
    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()
        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()
    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
def apply_endpoint_list(entity_list):
    """Find the types for each entity in a list, keeping exactly one entry per input entity."""
    types_list = []
    ep = DBpediaEndpoint()
    for entity in entity_list:
        # entity is either the placeholder ['N/A'] or an object such as:
        # < id: http://dbpedia.org/resource/Ana_Popović, label: Ana Popović,
        #   description: None, types: set(), source: DBpedia >
        if entity != ['N/A']:
            ent2 = entity.getIdstr()
            types = ep.getTypesForEntity(ent2)  # limit to 5
            if len(types) == 0:
                # back up: use the types stored in the entity object (from the ontology)
                types = entity.getTypes()
        else:
            types = []
        types_list.append(types)  # append once, so output stays aligned with entity_list
    return types_list
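# Usage sketch (editorial addition): batch counterpart of the example above.
# Assumes the look-up returns a plain list of entity objects; unmatched cells
# are passed through as the ['N/A'] placeholder used upstream.
if __name__ == '__main__':
    from kg.lookup import DBpediaLookup  # assumed module path
    candidates = DBpediaLookup().getKGEntities("Ana Popović", 3, '')
    print(apply_endpoint_list(candidates + [['N/A']]))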
def tablesToChallengeFormat(folder_gt, folder_tables, file_out_gt, file_out_redirects_gt, file_out_gt_target, max_tables):
    #csv_file_names = [f for f in listdir(folder_gt) if isfile(join(folder_gt, f))]
    csv_file_names = []
    csv_file_names.append("2014_Tour_of_Turkey#6.csv")
    f_out = open(file_out_gt, "w+")
    f_out_redirects = open(file_out_redirects_gt, "w+")
    f_out_target = open(file_out_gt_target, "w+")
    n_tables = 0
    wrong_entities = 0
    panda_errors = 0
    dbpedia_ep = DBpediaEndpoint()
    for csv_file_name in csv_file_names:
        with open(join(folder_gt, csv_file_name)) as csv_file:
            try:
                #Try to open with pandas. If it fails, discard the file.
                pd.read_csv(join(folder_tables, csv_file_name), sep=',', quotechar='"', escapechar="\\")
            except:
                panda_errors += 1
                continue
            table_id = csv_file_name.replace(".csv", "")
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                #URI, text, row number
                #http://dbpedia.org/resource/Ahmet_%C3%96rken, Ahmet A\u0096rken, 1
                if len(row) < 3:
                    continue
                entity_uri = row[0]
                entity_mention = row[1]
                row_id = row[2]
                column_id = getColumnEntityMention(join(folder_tables, csv_file_name), entity_mention)
                entities = set()
                new_entities = set()
                ##Consider redirects:
                #entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                #entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                #for e in entities:
                #    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                #    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))
                entities.add(entity_uri)
                #entities.update(new_entities)
                if column_id >= 0:
                    #Output: table id, column id, row id and DBpedia entity
                    #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
                    line_str = '"%s","%s","%s"' % (table_id, column_id, row_id)
                    f_out_target.write(line_str + '\n')
                    f_out.write(line_str + ',"%s"\n' % entity_uri)
                    line_str += ',"' + " ".join(entities) + '"'
                    f_out_redirects.write(line_str + '\n')
                    #TODO
                    #Read with pandas: https://www.datacamp.com/community/tutorials/pandas-read-csv
                    #There are some errors with "\"
                    #writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
                    #writer.writerow()
                else:
                    wrong_entities += 1
        ##Small dataset with only approx. 20k tables out of >400k
        if n_tables > max_tables:  #200000
            break
        n_tables += 1
    print("Panda errors: %d" % panda_errors)
    print("Wrong entities: %d" % wrong_entities)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
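# Usage sketch (editorial addition): the folder and file names below are
# hypothetical placeholders. The function writes three aligned GT files
# (targets, entities, entities plus redirects) and stops after max_tables.
if __name__ == '__main__':
    tablesToChallengeFormat(
        "gt/",                       # folder with per-table GT csv files
        "tables/",                   # folder with the raw table csv files
        "out/cea_gt.csv",            # table id, column id, row id, entity
        "out/cea_gt_redirects.csv",  # same, plus space-separated redirect URIs
        "out/cea_targets.csv",       # target cells only
        max_tables=20000)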
def extensionWithWikiRedirects(file_gt, folder_tables, file_out_gt, file_out_redirects_gt, file_out_gt_target, max_rows):
    f_out = open(file_out_gt, "a+")
    f_out_redirects = open(file_out_redirects_gt, "a+")
    f_out_target = open(file_out_gt_target, "a+")
    n_rows = 0
    panda_errors = 0
    dbpedia_ep = DBpediaEndpoint()
    table_id = ""
    dict_entities = dict()
    #READ CURRENT CACHE:
    #init dict_entities with the current state of file_out_redirects_gt
    with open(file_out_redirects_gt) as csv_file_redirections:
        csv_reader = csv.reader(csv_file_redirections, delimiter=',', quotechar='"', escapechar="\\")
        #"1","0","1","http://dbpedia.org/resource/Uno_Von_Troil http://dbpedia.org/resource/Uno_von_Troil"
        for row in csv_reader:
            entity_list = row[3].split(" ")
            for e in entity_list:
                if e not in dict_entities:
                    dict_entities[e] = set(entity_list)
    with open(file_gt) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
        for row in csv_reader:
            #file, col, row, URI
            #note that in Oktie's GT it is given as file, row, col
            #1,1,0,http://dbpedia.org/resource/Uno_von_Troil
            if len(row) < 4:
                continue
            #entity_uri = row[3]
            entity_uri = row[3].replace("\"", "%22")
            #To handle cases from "http://sws.geonames.org/"
            same_as_resources = set()
            if not entity_uri.startswith("http://dbpedia.org/resource/"):
                same_as_resources = getFilteredResources(dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia)
                if len(same_as_resources) == 0:
                    print("No dbpedia entity for: %s, %s, %s, %s" % (row[0], row[1], row[2], entity_uri))
                else:
                    #We keep only one of the same_as dbpedia resources
                    for r in same_as_resources:
                        entity_uri = r
                        #break
                    ##We will consider the other same_as resources later
                    same_as_resources.remove(entity_uri)
            entity_uri = row[3].replace("\"", "%22")
            #if int(row[0]) < 1000:  #Jiaoyan starts from table file 1,000
            #if int(row[0]) >= 1000:  #ernesto
            #if int(row[0]) >= 3:  #ernesto
            #if int(row[0]) < 587 or int(row[0]) >= 1000:
            #    continue
            if not table_id == row[0]:
                #Change of table: close and reopen the output files to keep a better storage of intermediate points
                f_out.close()
                f_out_redirects.close()
                f_out_target.close()
                table_id = row[0]
                print(table_id)
                f_out = open(file_out_gt, "a+")
                f_out_redirects = open(file_out_redirects_gt, "a+")
                f_out_target = open(file_out_gt_target, "a+")
            col_id = row[2]  #Reversed with respect to the input
            row_id = row[1]
            csv_file_name = table_id + ".csv"
            try:
                #Try to open with pandas. If it fails, discard the file.
                pd.read_csv(join(folder_tables, csv_file_name), sep=',', quotechar='"', escapechar="\\")
            except:
                panda_errors += 1
                continue
            entities = set()
            ##Keep an index/cache to avoid unnecessary queries
            if entity_uri in dict_entities:
                entities.update(dict_entities[entity_uri])
            else:
                new_entities = set()
                ##Consider redirects:
                entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                entities.update(same_as_resources)  #in case there was more than one
                #two iterations
                for e in entities:
                    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))
                entities.add(entity_uri)
                entities.update(new_entities)
                dict_entities[entity_uri] = set()
                dict_entities[entity_uri].update(entities)
            #Output: table id, column id, row id and DBpedia entity
            #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
            line_str = '"%s","%s","%s"' % (table_id, col_id, row_id)
            f_out_target.write(line_str + '\n')
            f_out.write(line_str + ',"%s"\n' % entity_uri)
            line_str += ',"' + " ".join(entities) + '"'
            f_out_redirects.write(line_str + '\n')
            ##Number of rows
            if n_rows > max_rows:  #200000
                break
            n_rows += 1
    print("Panda errors: %d" % panda_errors)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
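# Usage sketch (editorial addition): file names are hypothetical placeholders.
# Note that the redirects file is read first as a cache, so it should already
# exist (it may be empty) before the function appends to it.
if __name__ == '__main__':
    extensionWithWikiRedirects(
        "gt/cea_gt_original.csv",    # GT as file, row, col, URI
        "tables/",                   # folder with the raw table csv files
        "out/cea_gt.csv",
        "out/cea_gt_redirects.csv",
        "out/cea_targets.csv",
        max_rows=200000)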
from ontology.onto_access import DBpediaOntology

# unpickle
# load = unpickle('training_vectors/final_original_training_vectors')  # when we have training data from the task to eval
load = unpickle('training_vectors/final_original_training_vectors_minus_tests')  # created own testing data by splitting train
df_positive = pd.DataFrame(load)
df_positive['polarity'] = "1"

'''create more positive samples
do this by:
- getting a different but similar entity using the SPARQL endpoint
'''
onto_access = DBpediaOntology()
onto_access.loadOntology(True)
ep = DBpediaEndpoint()


def get_alt_entities(entity_types):
    """For each list of types, take the finest type and fetch similar entities of that type."""
    lis = []
    for ls in entity_types:
        enty = get_last(ls)  # only keep the finest type
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100)  # slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
        except:
            pass
    return lis
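# Usage sketch (editorial addition): the type list below is illustrative and
# assumes get_last() returns the last (finest) class URI in each list, which is
# then used to sample up to 10 entities of that type from the endpoint.
if __name__ == '__main__':
    sample_types = [["http://dbpedia.org/ontology/Agent",
                     "http://dbpedia.org/ontology/Person"]]
    print(get_alt_entities(sample_types))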
class DBPediaExtractor(object):
    '''
    classdocs
    '''
    #Steps:
    # Read entities from GT, include redirections and add same_as axioms
    # Query for types
    # Query for 1 level of relationships, filter those not in the DBpedia ontology

    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()
        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()

    def isValidURI(self, str_uri):
        #use term._is_valid_unicode(str_uri)
        return term._is_valid_uri(str_uri) and self.isascii(str_uri)

    def isascii(self, string_original):
        string = self.strip_accents(string_original)  #to ignore accents
        return len(string_original) == len(string.encode())

    def strip_accents(self, text):
        text = unicodedata.normalize('NFD', text)\
            .encode('ascii', 'ignore')\
            .decode("utf-8")
        return str(text)

    #Precomputed
    def setUpLocalDBPediaGraph(self, file_ttl):
        self.localrdfgraph = Graph()
        self.localrdfgraph.parse(source=file_ttl, format='turtle')

    #To be computed
    def setUpRDFGraph(self):
        self.rdfgraph = Graph()
        #self.rdfgraph.bind(TabularToRDF.NAMESPACE_PREFIX, TabularToRDF.BASE_URI)
        self.rdfgraph.bind("foaf", "http://xmlns.com/foaf/0.1/")
        self.rdfgraph.bind("dbp", "http://dbpedia.org/property/")
        self.rdfgraph.bind("dbr", "http://dbpedia.org/resource/")
        self.rdfgraph.bind("dbo", "http://dbpedia.org/ontology/")
        self.rdfgraph.bind("owl", "http://www.w3.org/2002/07/owl#")

    def saveRDFGrah(self, rdf_file_ouput):
        #output with the same table name as ttl
        self.rdfgraph.serialize(str(rdf_file_ouput), format='turtle')  #xml, turtle
        wrong_file_name = ""
        try:
            #serialization may truncate the output file name at '?' or '#'
            if "?" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("?")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
            elif "#" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("#")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
        except:
            print(wrong_file_name, rdf_file_ouput)

    def getTargetEntitiesCEA(self, cea_file):
        #Target entities per table
        self.targetEntities = dict()
        with open(cea_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 4:
                    continue
                uris = row[3].split(" ")
                #entities per table
                key = row[0]  #+ "-" + row[1] + "-" + row[2]
                if key not in self.targetEntities:
                    self.targetEntities[key] = set()
                self.targetEntities[key].update(uris)

    def getEntitiesAndCreateInstancesTable(self, table_name):
        if table_name in self.targetEntities:
            for ent in self.targetEntities[table_name]:
                if self.isValidURI(ent):
                    self.entities.add(ent)
                    e_uri = URIRef(ent)
                    self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))

    def getEntitiesAndCreateInstances(self, cea_file):
        with open(cea_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 4:
                    continue
                uris = row[3].split(" ")
                for i in range(len(uris)):
                    self.entities.add(uris[i])
                i = 1
                while i < len(uris):
                    if self.isValidURI(uris[0]) and self.isValidURI(uris[i]):
                        e_uri1 = URIRef(uris[0])
                        e_uri2 = URIRef(uris[i])
                        self.rdfgraph.add((e_uri1, URIRef(OWL.SAMEAS), e_uri2))
                    else:
                        pass  #print("Not valid URI?", uris[0], uris[i])
                    i += 1
        for ent in self.entities:
            if self.isValidURI(ent):
                e_uri = URIRef(ent)
                self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
            else:
                pass  #print("Not valid URI:", ent)
        print("Number of entities: " + str(len(self.entities)))

    def getTargetColumns(self, cea_gt_file):
        self.target_column = dict()
        #An alternative is to automatically identify the left-most column with an entity mention.
        #In this particular case we know the target.
        with open(cea_gt_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if row[0] not in self.target_column or int(self.target_column[row[0]]) > int(row[1]):
                    self.target_column[row[0]] = row[1]

    def getEntitiesLookup(self, folder_cea_tables, cea_gt_file):
        #Look-up call for each cell in the target column.
        #Dictionary/cache to avoid repeated look-ups.
        visited_values = set()
        #Get target columns
        self.getTargetColumns(cea_gt_file)
        csv_file_names = [f for f in listdir(folder_cea_tables) if isfile(join(folder_cea_tables, f))]
        i = 0
        n = len(csv_file_names)
        t = [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 3000, 4000, 5000, 6000,
             7000, 8000, 9000, 10000, 11000, 12000, 13000]
        for csv_file in csv_file_names:
            i += 1
            if i in t:
                print("Getting look up entities for table %s of %s (%s)." % (i, n, datetime.datetime.now().time()))
            table_name = csv_file.replace(".csv", "")
            with open(join(folder_cea_tables, csv_file)) as csv_file:
                csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
                if table_name in self.target_column:
                    target_column = self.target_column[table_name]
                else:
                    #End
                    continue
                for row in csv_reader:
                    if len(row) <= int(target_column):
                        continue
                    if row[int(target_column)] not in visited_values:
                        ##To avoid repetition
                        visited_values.add(row[int(target_column)])
                        #Look up top-3
                        dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                        for ent in dbpedia_entities:
                            if self.isValidURI(ent.getId()):
                                self.entities.add(ent.getId())  ##Add to entities to extract neighbours
                                e_uri = URIRef(ent.getId())
                                self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
                                for cls_type in ent.getTypes(KG.DBpedia):
                                    self.rdfgraph.add((e_uri, RDF.type, URIRef(cls_type)))
                            else:
                                pass  #print("Not valid URI:", ent.getId())
        print("Number of extended entities with look-up: " + str(len(self.entities)))

    def getEntitiesLookupForTable(self, csv_file):
        #Look-up call for each cell in the target column.
        #Dictionary/cache to avoid repeated look-ups.
        visited_values = set()
        table_name = csv_file.replace(".csv", "")
        with open(join(folder_cea_tables, csv_file)) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            if table_name in self.target_column:
                target_column = self.target_column[table_name]
            else:
                #End
                return
            for row in csv_reader:
                if len(row) <= int(target_column):
                    return
                if row[int(target_column)] not in visited_values:
                    ##To avoid repetition
                    visited_values.add(row[int(target_column)])
                    #Look up top-3
                    dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                    for ent in dbpedia_entities:
                        if self.isValidURI(ent.getId()):
                            self.entities.add(ent.getId())  ##Add to entities to extract neighbours
                            e_uri = URIRef(ent.getId())
                            self.rdfgraph.add((e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)))
                            for cls_type in ent.getTypes(KG.DBpedia):
                                self.rdfgraph.add((e_uri, RDF.type, URIRef(cls_type)))
                        else:
                            pass  #print("Not valid URI:", ent.getId())

    def getTypes(self, cta_file):
        with open(cta_file) as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            for row in csv_reader:
                if len(row) < 3:
                    continue
                self.types.add(row[2])
        print("Number of types: " + str(len(self.types)))

    def getAssertionsForInstances(self, use_local_graph):
        #Avoid some properties (see entity.py).
        #Differentiate between object and data properties? Probably only necessary to decide between literal and URI objects.
        #Problem if the range of the property is not string; it will probably not match very well in any case.
        #Solution: remove domains and ranges in DBpedia ontology properties.
        #Filter by DBpedia resources and types, e.g. ignore URIs from wikidata and types from other taxonomies.
        n = 0
        l = [1, 5, 100, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 40000,
             50000, 60000, 70000, 80000, 90000, 100000, 200000, 300000, 400000,
             500000, 600000, 700000, 800000, 900000]
        for ent in self.entities:
            n += 1
            if not self.isValidURI(ent):
                continue  #print("Not valid URI:", ent)
            e_uri = URIRef(ent)
            #if n in l:
            #    print("Extracting neighbourhood for " + str(n) + ": " + ent + " (" + str(datetime.datetime.now().time()) + ")")
            if use_local_graph:
                dict_results = self.getLocalTriplesForSubject(ent, 100)
            else:
                dict_results = self.dbp_endpoint.getTriplesForSubject(ent, 100)
            for prop in dict_results:
                #if prop.startswith("http://dbpedia.org/"):
                #There are other interesting properties: rdfs:label, rdf:type, foaf:name, etc.
                if not self.isValidURI(prop):
                    continue  #print("Not valid URI:", prop)
                p_uri = URIRef(prop)
                isObjectProperty = self.identifyTypeOfProperty(prop)
                for obj in dict_results[prop]:
                    if obj.startswith("http") and isObjectProperty:
                        #Triple to resource
                        if obj.startswith("http://dbpedia.org/resource/"):
                            if self.isValidURI(obj):
                                o_uri = URIRef(obj)
                                self.rdfgraph.add((e_uri, p_uri, o_uri))
                            else:
                                pass  #print("Not valid URI:", obj)
                    elif not isObjectProperty:
                        #Triple to literal
                        self.rdfgraph.add((e_uri, p_uri, Literal(obj)))
                    else:
                        pass  #print("Wrong object '%s' for property '%s' (isObjectProperty=%s)" % (obj, prop, isObjectProperty))

    def getLocalTriplesForSubject(self, ent, limit):
        query_str = "SELECT DISTINCT ?p ?o WHERE { <" + ent + "> ?p ?o } limit " + str(limit)
        query_object = prepareQuery(query_str)  #, initNs={CMR_QA.NAMESPACE_PREFIX: CMR_QA.BASE_URI})
        results = self.localrdfgraph.query(query_object)
        assertions = dict()
        for result in results:
            prop = str(result[0])
            obj = str(result[1])
            if prop not in assertions:
                assertions[prop] = set()
            assertions[prop].add(obj)
        return assertions

    def identifyTypeOfProperty(self, prop):
        if prop in self.propertyType:
            if self.propertyType[prop]:
                self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
            else:
                self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
            return self.propertyType[prop]
        #Get statistics from the endpoint
        values = self.dbp_endpoint.getSomeValuesForPredicate(prop)
        n_values = len(values)
        n_uris = 0
        for v in values:
            if v.startswith("http"):
                n_uris += 1
        isObjectProperty = (n_uris > (n_values / 2))
        if isObjectProperty:
            self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
            self.propertyType[prop] = True
        else:
            self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
            self.propertyType[prop] = False
        return isObjectProperty

    def getInstancesForTypes(self):
        #Use basic method
        additional_entities = set()
        for cls in self.types:
            #print("Extracting members for: " + cls)
            additional_entities = self.dbp_endpoint.getEntitiesForType(cls, 0, 100)
            #We also want to extract their neighbourhood
            self.entities.update(additional_entities)
            for ent in additional_entities:
                if self.isValidURI(ent):
                    e_uri = URIRef(ent)
                    if cls.startswith("http://dbpedia.org/"):
                        self.rdfgraph.add((e_uri, RDF.type, URIRef(cls)))
                else:
                    pass  #print("Not valid URI:", ent)
        print("Number of extended entities with types: " + str(len(self.entities)))

    #Using a pre-extracted ttl/cache
    def localPropertyTypeExtractor(self):
        query_str = "SELECT DISTINCT ?p WHERE { ?s ?p ?o . } "
        query_object = prepareQuery(query_str)  #, initNs={CMR_QA.NAMESPACE_PREFIX: CMR_QA.BASE_URI})
        predicates = self.localrdfgraph.query(query_object)
        print("Using %s local predicates" % (len(predicates)))
        for p in predicates:
            prop = str(p[0])
            if not prop.startswith("http://dbpedia.org/"):
                #We ignore other kinds of properties and focus on DBpedia ones.
                #Others will be treated as annotations (rdfs:label, foaf:name) or specially (rdf:type).
                continue
            query_str = "SELECT ?value WHERE { ?s <" + prop + "> ?value . } limit 100"
            query_object = prepareQuery(query_str)
            values = self.localrdfgraph.query(query_object)
            n_values = len(values)
            n_uris = 0
            for v in values:
                if str(v[0]).startswith("http"):
                    n_uris += 1
            if n_values == 1:
                isObjectProperty = (n_uris == n_values)
            else:
                isObjectProperty = (n_uris > (n_values / 2))
            if isObjectProperty:
                #self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)))
                self.propertyType[prop] = True
            else:
                #self.rdfgraph.add((URIRef(prop), RDF.type, URIRef(OWL.OWLDATAPROPERTY)))
                self.propertyType[prop] = False
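# Usage sketch (editorial addition): a minimal driver for DBPediaExtractor.
# The CEA/CTA file paths are placeholders in the challenge format produced by
# the GT-conversion functions above.
if __name__ == '__main__':
    extractor = DBPediaExtractor()
    extractor.setUpRDFGraph()
    extractor.getEntitiesAndCreateInstances("out/cea_gt_redirects.csv")
    extractor.getTypes("out/cta_gt.csv")
    extractor.getInstancesForTypes()
    extractor.getAssertionsForInstances(use_local_graph=False)
    extractor.saveRDFGrah("out/dbpedia_fragment.ttl")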
    def __init__(self):
        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()
class JSONUtilities(object):

    def __init__(self):
        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()

    def validateEntityToClasses(self, path, file_in, file_out):
        with open(path + file_in) as f:
            data = json.load(f)
        data_new = dict()
        no_types = 0
        empty_ref = 0
        missing_cases = 0
        wrong_cases = 0
        empty_cases = 0
        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'w')
        tmp_f2 = open(path + file_out.replace('.json', '') + '_issues.csv', 'w')
        for entity in data:
            types_tocheck = set(data[entity])
            types_ref = self.smartlookup.getTypesForEntity(entity, KG.DBpedia)
            if is_empty(types_ref):
                if is_empty(types_tocheck):
                    #Some issues with disambiguation pages
                    no_types += 1
                else:
                    ##Solved! Some URIs are redirects...
                    empty_ref += 1
                    #We use the original types
                    data_new[entity] = data[entity]
                    tmp_f.write('%s,%s\n' % (entity, ",".join(types_tocheck)))
                continue
            #New set of corrected types
            data_new[entity] = list(types_ref)  #json expects a list
            tmp_f.write('%s,%s\n' % (entity, ",".join(types_ref)))
            #Statistics
            missing = types_ref.difference(types_tocheck)
            wrong = types_tocheck.difference(types_ref)
            if len(missing) > 0 or len(wrong) > 0:
                print("Issues with: " + entity)
                if len(missing) > 0:
                    print("\tMissing types: ", missing)
                    missing_cases += 1
                    if len(types_tocheck) == 0:
                        empty_cases += 1
                if len(wrong) > 0:
                    print("\tWrong types", wrong)
                    wrong_cases += 1
                tmp_f2.write("Entity,%s.\nMissing,%s\nWrong:%s\n" % (entity, ",".join(missing), ",".join(wrong)))
        #We save the new types
        self.dumpJsonFile(data_new, path + file_out)
        tmp_f2.write("Cases with wrong types: %s\n" % (str(wrong_cases)))
        tmp_f2.write("Cases with missing types: %s\n" % (str(missing_cases)))
        tmp_f2.write("Cases with empty types: %s\n" % (str(empty_cases)))
        tmp_f2.write("Cases with empty new types: %s\n" % (str(empty_ref)))
        tmp_f2.write("Cases with no types at all: %s\n" % (str(no_types)))
        tmp_f.close()
        tmp_f2.close()
        print("Cases with wrong types: " + str(wrong_cases))
        print("Cases with missing types: " + str(missing_cases))
        print("Cases with empty types: " + str(empty_cases))
        print("Cases with empty new types: " + str(empty_ref))
        print("Cases with no types at all: " + str(no_types))

    def createTriplesForClasses(self, path, class_file_r, class_file_s, file_out):
        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'a+')
        #Read candidate classes
        classes = set()
        e_classes = json.load(open(path + class_file_r))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)
        e_classes = json.load(open(path + class_file_s))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)
        #Play with different numbers depending on the cost:
        #for each class extract 50-100-200 entities.
        #Tests:
        #entities = self.smartendpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/BaseballTeam", 100)
        #for e, label in entities.items():
        #    print(e, list(label)[0])
        #Dict to convert to json; reuse the cache file if it exists
        cache_file = path + file_out
        class_triples = json.load(open(cache_file)) if os.path.exists(cache_file) else dict()
        print("Class triples initial size", str(len(class_triples)))
        for c_uri in classes:
            print(c_uri)
            if c_uri in class_triples:  #already analysed/cached
                print("\tAlready cached!")
                continue
            i = time.time()
            tmp_f.write('%s\n' % (c_uri))
            #Dictionary entity -> label
            entities = self.smartendpoint.getEntitiesForDBPediaClass(c_uri, 500)
            #For each of the above entities (?o) extract triples ?s ?p ?o, together with the label of ?o.
            #Extract 10-50 triples per entity; predicates we aim to discard are filtered later.
            triples = list()
            for object_uri in entities:
                #label
                label = list(entities[object_uri])[0]
                #Triples for the object entity
                subjects_predicates = self.dbpedia_ep.getTriplesForObject(object_uri, 50)
                for subject in subjects_predicates:
                    for predicate in subjects_predicates[subject]:
                        triple = [subject, predicate, object_uri, label]
                        triples.append(triple)
                        tmp_f.write('%s\n' % (",".join(triple)))
            #end for entities
            print("\tTriples", len(triples))
            class_triples[c_uri] = triples
            #We dump after each class, so that if it breaks we can continue from there
            self.dumpJsonFile(class_triples, path + file_out)
            e = time.time()
            print("Time:", e - i)
        #end for classes
        #We save the new triples
        tmp_f.close()
        print(len(class_triples), path + file_out)
        self.dumpJsonFile(class_triples, path + file_out)

    #TBC
    def validateClassTriples(self, file):
        with open(file) as f:
            data = json.load(f)
        predicate_count = dict()
        n_triples = 0
        empty_entries = 0
        for entity in data:
            subjects = set()
            predicates = set()
            objects = set()
            print(entity, len(data[entity]))
            if len(data[entity]) == 0:
                empty_entries += 1
            n_triples += len(data[entity])
            n_triples_class = 0
            for triple in data[entity]:
                if triple[1] in URI_KG.avoid_predicates:
                    continue
                if not triple[1].startswith(URI_KG.dbpedia_uri) and not triple[1].startswith(URI_KG.dbpedia_uri_property):
                    continue
                n_triples_class += 1
                subjects.add(triple[0])
                if triple[1] not in predicate_count:
                    predicate_count[triple[1]] = 0
                predicate_count[triple[1]] += 1
                predicates.add(triple[1])
                objects.add(triple[2])
            print("\t Different triples, subjects, predicates, objects: %s, %s, %s, %s"
                  % (str(n_triples_class), str(len(subjects)), str(len(predicates)), str(len(objects))))
        print("Empty entries", empty_entries)
        predicate_count_sorted = OrderedDict(sorted(predicate_count.items(), key=lambda x: x[1]))
        #for k, v in predicate_count_sorted.items():
        #    print(k, v)
        print(len(data), n_triples)

    def dumpJsonFile(self, data_json, file):
        with open(file, "w") as write_file:
            json.dump(data_json, write_file)
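# Usage sketch (editorial addition): directory and file names are placeholders;
# the input JSON files are expected to map entity URIs to lists of candidate
# class URIs, as assumed by the methods above.
if __name__ == '__main__':
    ju = JSONUtilities()
    ju.validateEntityToClasses("data/", "entity_classes.json", "entity_classes_clean.json")
    ju.createTriplesForClasses("data/", "classes_r.json", "classes_s.json", "class_triples.json")
    ju.validateClassTriples("data/class_triples.json")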
class Endpoint(object):
    '''
    This class aims at identifying errors in the DBpedia ENDPOINT when retrieving
    samples for training: positive/negative samples for candidate classes.
    '''

    '''
    def queryTripleByClass(top_k, c):
        triples = list()
        s = sparql.Service(SPARQL_END_POINT, "utf-8", "GET")
        statement = 'select distinct str(?s), str(?p), str(?o), str(?l) where {?s ?p ?o. ?o rdf:type <%s>. ' \
                    '?o rdfs:label ?l. FILTER( langMatches(lang(?l), "en"))} ORDER BY RAND() limit %d' % (c, top_k)
        result = s.query(statement)
        for row in result.fetchone():
            triples.append([row[0], row[1], row[2], row[3]])
        return triples
    '''

    def __init__(self):
        '''
        Constructor
        '''
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
        self.lookup = Lookup()

    def __analyseEntityPredicateStrategy(self, ent, cls_uri):
        '''
        Analyses the correctness of cls_uri as a type of ent using the predicate-types strategy.
        '''
        #Note: the predicate strategy lives in a private (name-mangled) method of Lookup,
        #so it has to be accessed through its mangled name from this class.
        predicate_types = self.lookup._Lookup__getTypesPredicateStrategy(ent)
        if len(predicate_types) == 0:
            return None
        if cls_uri in predicate_types:
            return True
        return False

    def __analyseEntityLooukStrategy(self, ent, cls_uri):
        '''
        Analyses the correctness of cls_uri as a type of ent using look-up types.
        '''
        #Note that if there are no look-up types, we return None and the SPARQL types are kept as they are.
        ##If there are look-up types, the SPARQL types must be compatible with them.
        clean_lookup_types = self.lookup.getTypesForEntity(ent, KG.DBpedia)
        if len(clean_lookup_types) == 0:
            return None
        if cls_uri in clean_lookup_types:
            return True
        return False

    def __analyseEntityWikidataStrategy(self, ent, cls_uri, wikidata_classes):
        '''
        Analyses the correctness of cls_uri as a type of ent using Wikidata.
        '''
        #b. Get the equivalent wikidata entity (if any)
        same_entities = self.dbpedia_ep.getSameEntities(ent)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity
        ##If there are no equivalent entities we fall back to the look-up strategy
        if len(wikidata_entities) == 0:
            return self.__analyseEntityLooukStrategy(ent, cls_uri)
        for wk_ent in wikidata_entities:
            #c. Check if the wikidata class from (a) is within the types of the equivalent entity from (b)
            wk_ent_types = self.wikidata_ep.getAllTypesForEntity(wk_ent)  #we consider supertypes to extend compatibility
            time.sleep(0.01)  #to avoid the endpoint's call limit
            intersect = wk_ent_types.intersection(wikidata_classes)
            if len(intersect) > 0:
                return True
        return False

    def getEntitiesForDBPediaClass(self, cls_uri, limit=1000):
        '''
        It currently expects a class URI from DBpedia.
        '''
        ##We query a subset of entities for sampling
        clean_db_entities = dict()
        offset = 0
        filtered_look_up = 0
        filtered_wikidata = 0
        filtered_predicates = 0
        #To guarantee the required number of (clean) entities for the class
        while len(clean_db_entities) < limit:
            #We extract more than required as many of them will be noisy
            #db_entities = self.dbpedia_ep.getEntitiesForType(cls_uri, offset*limit*5, limit*5)
            db_entities = self.dbpedia_ep.getEntitiesLabelsForType(cls_uri, offset * limit * 5, limit * 5)
            #For the wikidata strategy:
            #a. Get the equivalent class from wikidata (if any)
            db_eq_cls = self.dbpedia_ep.getEquivalentClasses(cls_uri)
            wikidata_classes = getFilteredTypes(db_eq_cls, KG.Wikidata)  ##typically one class
            for ent in db_entities:
                if len(clean_db_entities) >= limit:
                    print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
                    return clean_db_entities
                results_look_up = self.__analyseEntityLooukStrategy(ent, cls_uri)
                if results_look_up is None:
                    results_predicates = self.__analyseEntityPredicateStrategy(ent, cls_uri)
                    if results_predicates is None:
                        #wikidata strategy (it is very costly)
                        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
                            clean_db_entities[ent] = db_entities[ent]
                        else:
                            #print("Entity filtered by wikidata", ent)
                            filtered_wikidata += 1
                    elif results_predicates:  #passed the predicates strategy
                        clean_db_entities[ent] = db_entities[ent]
                    else:
                        #print("Entity filtered by predicates", ent)
                        filtered_predicates += 1
                elif results_look_up:  #passed the look-up strategy
                    clean_db_entities[ent] = db_entities[ent]
                else:
                    #print("Entity filtered by look-up", ent)
                    filtered_look_up += 1
            #OLD STRATEGY: too slow
            #if len(wikidata_classes) == 0:  ## No wikidata class, then look-up strategy
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityLooukStrategy(ent, cls_uri):
            #            clean_db_entities.add(ent)
            #else:
            #    for ent in db_entities:
            #        if len(clean_db_entities) >= limit:
            #            return clean_db_entities
            #        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
            #            clean_db_entities.add(ent)
            offset += 1
            #Limit of iterations
            if offset > 5:
                print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
                return clean_db_entities
        print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up, filtered_predicates, filtered_wikidata))
        return clean_db_entities
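# Usage sketch (editorial addition): retrieves up to 100 "clean" dbo:Country
# entities; entities whose endpoint type cannot be confirmed by the look-up,
# predicate or wikidata strategies are filtered out. The returned dictionary
# maps entity URIs to their labels.
if __name__ == '__main__':
    endpoint = Endpoint()
    countries = endpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/Country", 100)
    for uri, labels in countries.items():
        print(uri, list(labels)[0])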
class Lookup(object):
    '''
    This class aims at providing look-up access to the KG with minimal errors.
    It can also optionally combine evidence from several KGs, not only one.
    '''

    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph
        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)
        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

    def getTypesForEntity(self, uri_entity, kg=KG.DBpedia):
        if kg == KG.DBpedia:
            types = set()
            types_redirects = set()
            #Original entity
            types.update(self.getTypesForDBPediaEntity(uri_entity))
            #Redirects, if any.
            #See dbo:wikiPageRedirects -> similar to same_as inside dbpedia
            redirects = self.dbpedia_ep.getWikiPageRedirect(uri_entity)
            for uri_redirect in redirects:  #Typically only one
                types_redirects.update(self.getTypesForDBPediaEntity(uri_redirect))
            if len(types) == 0:
                #We use the types of the redirects
                types.update(types_redirects)
            else:
                #types of redirects can be dirty
                for t in types_redirects:
                    if self.__checkCompatibilityTypes(t, types):
                        types.add(t)
            #Commented because it was slow for a large dataset:
            #if is_empty(types) or (len(types) == 1 and "Agent" in list(types)[0]):
            #    #Wikidata strategy to complement if the endpoint and look-up are empty, or only the type "Agent" is returned
            #    types.update(self.__getTypesWikidataStrategy(uri_entity))
            return types
        #TBC
        elif kg == KG.Wikidata:
            pass
        elif kg == KG.Google:
            pass
        return set()

    def __getTypesLookupStrategy(self, uri_entity):
        kg = KG.DBpedia
        label = uri_entity
        if uri_entity.startswith(URI_KG.dbpedia_uri_resource):
            label = uri_entity.replace(URI_KG.dbpedia_uri_resource, '')
        ##We call our method to get look-up types for the URI; only SPARQL endpoint types may contain errors.
        ##It also includes the wikidata strategy inside.
        entities = self.getKGEntities(label, 10, uri_entity)  #filter by uri_entity
        ##In case of no match in the look-up
        if is_empty(entities):
            types = set()
            #DBpedia endpoint strategy
            types_endpoint = getFilteredTypes(self.dbpedia_ep.getAllTypesForEntity(uri_entity), KG.DBpedia)
            #Predicates strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(uri_entity)
            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #types.update(types_domain_range)
                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            types.add(t)
                #If there are compatible types we also keep the ones coming from domains/ranges
                if len(types) > 0:
                    types.update(types_domain_range)
            #If still empty we use the endpoint types
            if len(types) == 0:
                types.update(types_endpoint)
            return types
        else:
            ##Should be only one element from the look-up
            for entity in entities:
                return entity.getTypes(kg)

    def __getTypesPredicateStrategy(self, uri_entity):
        '''
        Exploits the domain and range types of the predicates in triples with uri_entity as subject or object.
        '''
        types = set()
        #Top-2 range types (uri_entity as object)
        types.update(getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForObject(uri_entity, 2), KG.DBpedia))
        #We use only the top-1 here, as there are fewer properties associated to an entity as subject,
        #and we only need one wrong type to end up in the top-k.
        #Error-prone, as many properties are not properly used.
        #Only used if compatible with the current range types.
        types_domain = getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForSubject(uri_entity, 1), KG.DBpedia)
        #if len(types) == 0:
        #    types.update(types_domain)
        #else:
        if len(types) > 0:
            for t in types_domain:
                if self.__checkCompatibilityTypes(t, types):
                    types.add(t)
        #TODO: Alternative strategy: use the intersection of range and domain types when non-empty.
        #May increase recall (remove min input/output edges in queries) and precision.
        #If empty, then use as now.
        return types

    def __getTypesWikidataStrategy(self, uri_entity):
        print("\tUsing wikidata strategy for " + uri_entity)
        #Gets equivalent wikidata entities
        same_entities = self.dbpedia_ep.getSameEntities(uri_entity)
        wikidata_entities = getFilteredResources(same_entities, KG.Wikidata)  ##typically one entity
        wk_ent_types = set()
        dp_types = set()
        dp_types_all = set()
        if len(wikidata_entities) == 0:
            return wk_ent_types
        for wk_ent in wikidata_entities:
            #Get types for the wikidata entities
            wk_ent_types.update(self.wikidata_ep.getAllTypesForEntity(wk_ent))  #we consider all supertypes to extend compatibility
        #Problematic concept: Wikimedia disambiguation page
        if URI_KG.wikimedia_disambiguation_concept in wk_ent_types:
            wk_ent_types.clear()
        for t in wk_ent_types:
            #Get equivalent dbpedia types
            dp_types.update(getFilteredTypes(self.wikidata_ep.getEquivalentClasses(t), KG.DBpedia))
        #Get superclasses
        for t in dp_types:
            dp_types_all.update(self.dbpedia_ep.getAllSuperClasses(t))
        return getFilteredTypes(dp_types_all, KG.DBpedia)

    def getTypesForDBPediaEntity(self, uri_entity):
        #Types from the DBpedia endpoint may be dirty, so we use 2 strategies: wikidata and look-up types.
        #TODO: check compatibility among strategies?
        #The look-up strategy also includes the wikidata strategy.
        types = self.__getTypesLookupStrategy(uri_entity)
        return types
        ##Additional strategy:
        #check the equivalent entity from wikidata, get classes from wikidata, get the equivalent ones in the dbpedia endpoint.

    def getKGEntities(self, cell, limit=5, filter=''):
        '''
        Given the text of a cell, extracts entity objects.
        Note that an entity contains an id, a label, a description, a set of types
        from dbpedia, wikidata and schema.org, and the source (dbpedia, wikidata or google).
        '''
        #Strategy:
        #0. Incremental repair: start with something simple
        #1. Identify cases where dbpedia look-up and endpoint do not agree
        #2. We have the dbpedia (may require a fix...) and schema.org taxonomies to identify conflicting branches
        #3. Use alignment to identify same entities and have more evidence about types
        #   3a. Lexical
        #   3b. Based on embeddings
        #4. If no mapping among classes is provided (there seem to be available mappings), we may use alignment as well (lexical and embedding)
        query = cell
        #Get KG entities from DBpedia, Wikidata and the Google KG.
        #One could also return the most accurate 5 types combining the 3 KGs
        #(limit 20 of each of them and then retrieve the top-k).
        dbpedia = DBpediaLookup()
        dbpedia_entities = dbpedia.getKGEntities(query, limit, filter)
        #We complement with types from the endpoint and check if they are correct/compatible
        for entity in dbpedia_entities:
            self.__analyseEntityTypes(entity)
        return dbpedia_entities
        #Next steps:
        #find equivalent entities from wikidata (using both wikidata and dbpedia endpoints),
        #then their types, and then try to find conflicting types (it could even be by voting).
        '''
        kg = GoogleKGLookup()
        wikidata = WikidataAPI()
        '''

    def __analyseEntityTypes(self, entity):
        #Filter by type?
        types_endpoint = getFilteredTypes(
            self.dbpedia_ep.getAllTypesForEntity(entity.getId()), KG.DBpedia)
        if len(entity.getTypes()) > 0:
            for t in types_endpoint:
                if t not in entity.getTypes():
                    ##Evaluate compatibility with the look-up types (same branch).
                    ##We use DBpedia for now.
                    if self.__checkCompatibilityTypes(t, entity.getTypes(KG.DBpedia)):
                        entity.addType(t)
        else:
            #No types from the look-up.
            #The wikidata strategy is not great for compatibility, as the returned types would need further exploration:
            #types_wk_strategy = self.__getTypesWikidataStrategy(entity.getId())
            #We use the range-domain-predicate strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(entity.getId())
            if len(types_domain_range) > 0:
                #They can be noisy, so do not add them yet
                #entity.addTypes(types_domain_range)
                ##Check compatibility of types_endpoint
                for t in types_endpoint:
                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(t, types_domain_range):
                            entity.addType(t)
            #DANGEROUS, as domain and range types contain many errors:
            #if no compatible type, we could just use the ones coming from domains/ranges.
            #if len(entity.getTypes()) > 0:
            #    entity.addTypes(types_domain_range)
            #If still empty we use the endpoint types
            if len(entity.getTypes()) == 0:
                entity.addTypes(types_endpoint)
            ##Last resort: if there are compatible types, also add the domain/range ones
            if len(entity.getTypes()) > 0:
                entity.addTypes(types_domain_range)
            #We complement with the wikidata strategy
            #entity.addTypes(types_wk_strategy)

    def __checkCompatibilityTypes(self, cls_source_uri, target_types):
        '''
        We check if the source type (endpoint) is among the descendants or ancestors
        of at least one of the target types (look-up).
        '''
        for cls_target_uri in target_types:
            if self.__isCompatibleType(cls_source_uri, cls_target_uri):
                return True
        return False

    def __isCompatibleType(self, cls_source_uri, cls_target_uri):
        '''
        We check if the source type is among the descendants or ancestors of the target type.
        '''
        cls_source = self.dbpedia_onto.getClassByURI(cls_source_uri)
        cls_target = self.dbpedia_onto.getClassByURI(cls_target_uri)
        ##TODO We rely on DBpedia only for now
        if cls_source is None or cls_target is None:
            return False
        ancestors = self.dbpedia_onto.getAncestorsURIs(cls_target)
        descendants = self.dbpedia_onto.getDescendantURIs(cls_target)
        if cls_source_uri in ancestors or cls_source_uri in descendants:
            return True
        return False
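# Usage sketch (editorial addition): combines look-up and endpoint evidence to
# return entities with (hopefully) clean DBpedia types; the query string and
# entity URI are illustrative.
if __name__ == '__main__':
    lookup = Lookup()
    for entity in lookup.getKGEntities("Chicago Bulls", limit=3):
        print(entity.getId(), entity.getTypes(KG.DBpedia))
    print(lookup.getTypesForEntity("http://dbpedia.org/resource/Chicago_Bulls", KG.DBpedia))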
'''
author: Eleanor Bill @eljne
create vectors for additional training data - +ve - CONTINUED
takes about four to five hours w/ 10 samples per question - 180,000
'''
from kg.EB_classes import pickl, unpickle, nouns_list, noun_phrases_list
import pandas as pd
from kg.endpoints import DBpediaEndpoint

pos = unpickle('training_vectors/11_train_new_positive_samples')
new_positive_samples = pd.DataFrame(pos)
print('unpickled')
ep = DBpediaEndpoint()
# print(new_positive_samples.head)


def get_nouns(entity):
    """Collect nouns from the English labels of an entity."""
    labels = ep.getEnglishLabelsForEntity(entity)
    nouns = nouns_list(labels)
    print('.')
    return nouns


def get_nps(entity):
    """Collect noun phrases from the English labels of an entity."""
    labels = ep.getEnglishLabelsForEntity(entity)
    nps = noun_phrases_list(labels)
    print('..')
    return nps


def apply_endpoint_alt(entity):
    # find types directly from the entity URI
    types = ep.getTypesForEntity(entity)  # limit to 5
    return types
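# Usage sketch (editorial addition): the URI is illustrative;
# getEnglishLabelsForEntity and getTypesForEntity are the endpoint calls
# already used by the helpers above.
if __name__ == '__main__':
    uri = "http://dbpedia.org/resource/Ana_Popović"
    print(get_nouns(uri))
    print(apply_endpoint_alt(uri))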