Example 1
    def __init__(self):
        '''
        Constructor
        '''

        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

        self.lookup = Lookup()
Example 2
def apply_endpoint(entity):  # find types for a single entity
    # print(entity)
    ep = DBpediaEndpoint()
    ent2 = entity.getIdstr()  # entity id/URI string
    types = ep.getTypesForEntity(ent2)  # endpoint types (limited to 5)
    # print('types using endpoint id', types)
    if len(types) == 0:  # endpoint returned nothing: back up with the entity's own (ontology) types
        types = entity.getTypes()
        # print('types using entity', types, '\n')
    return types
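
A minimal usage sketch, assuming the project's DBpediaLookup is importable as below and that its candidate objects expose getIdstr()/getTypes() as used above; the import path and query string are illustrative:

# Hypothetical usage sketch; import path and query are assumptions.
from kg.lookup import DBpediaLookup

candidates = DBpediaLookup().getKGEntities("Ana Popović", 1, '')  # top-1 look-up candidate
if candidates:
    print(apply_endpoint(candidates[0]))  # e.g. a collection of dbo:... type URIs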
Example 3
    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()

        self.dbp_endpoint = DBpediaEndpoint()
        self.lookup = DBpediaLookup()
Example 4
    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph

        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)

        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()
Example 5
def apply_endpoint_list(entity_list):  # find types
    types_list = []
    ep = DBpediaEndpoint()  # using ID/int
    for entity in entity_list:
        print(entity)
        # ['N/A']
        # < id: http://dbpedia.org/resource/Ana_Popović, label: Ana Popović, description: None, types: set(), source: DBpedia >
        if entity != ['N/A']:
            ent2 = entity.getIdstr()
            types = ep.getTypesForEntity(ent2)  # limit to 5
            # print('types using endpoint id', types)
            if len(types) == 0:  # endpoint returned nothing: back up with the entity's own types
                types = entity.getTypes()  # ont
                # print('types using entity', types, '\n')
            types_list.append(types)  # append once per entity, keeping alignment with entity_list
        else:
            types_list.append([])
    return types_list
Example 6
def tablesToChallengeFormat(folder_gt, folder_tables, file_out_gt,
                            file_out_redirects_gt, file_out_gt_target,
                            max_tables):

    #csv_file_names = [f for f in listdir(folder_gt) if isfile(join(folder_gt, f))]

    csv_file_names = []
    csv_file_names.append("2014_Tour_of_Turkey#6.csv")

    f_out = open(file_out_gt, "w+")
    f_out_redirects = open(file_out_redirects_gt, "w+")
    f_out_target = open(file_out_gt_target, "w+")

    n_tables = 0
    wrong_entities = 0
    panda_errors = 0

    dbpedia_ep = DBpediaEndpoint()

    for csv_file_name in csv_file_names:

        #print(csv_file_name)

        with open(join(folder_gt, csv_file_name)) as csv_file:

            try:
                #Try to open with pandas. If error, then discard file
                pd.read_csv(join(folder_tables, csv_file_name),
                            sep=',',
                            quotechar='"',
                            escapechar="\\")
                #df = csv.reader(csv_file)
            except Exception:
                panda_errors += 1
                continue

            table_id = csv_file_name.replace(".csv", "")

            csv_reader = csv.reader(csv_file,
                                    delimiter=',',
                                    quotechar='"',
                                    escapechar="\\")

            for row in csv_reader:

                #URI, text, row number
                #http://dbpedia.org/resource/Ahmet_%C3%96rken, Ahmet A\u0096rken, 1
                if len(row) < 3:
                    continue

                entity_uri = row[0]
                row_id = row[2]

                entity_mention = row[1]

                column_id = getColumnEntityMention(
                    join(folder_tables, csv_file_name), entity_mention)

                entities = set()
                new_entities = set()

                ##Consider redirects:
                #entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                #entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))

                #for e in entities:
                #    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                #    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))

                entities.add(entity_uri)
                #entities.update(new_entities)

                if column_id >= 0:
                    #Output
                    #table id,column id, row id and DBPedia entity
                    #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
                    line_str = '\"%s\",\"%s\",\"%s\"' % (table_id, column_id,
                                                         row_id)
                    f_out_target.write(line_str + '\n')

                    #f_out.write('\"%s\",\"%s\",\"%s\",\"%s\"\n' % (table_id, column_id, row_id, entity_uri))

                    f_out.write(line_str + ',\"%s\"\n' % entity_uri)

                    #for ent in entities:
                    #    line_str += ',\"'+ ent + '\"'
                    line_str += ',\"' + " ".join(entities) + '\"'

                    f_out_redirects.write(line_str + '\n')

                    #TODO
                    #Read with pandas: https://www.datacamp.com/community/tutorials/pandas-read-csv
                    #There are some errors with "\"
                    #writer = csv.writer(f, quoting=csv.QUOTE_NONNUMERIC)
                    #writer.writerow()

                    #print('\"%s\",\"%s\",\"%s\",\"%s\"' % (table_id, column_id, row_id, entity_uri))
                else:
                    wrong_entities += 1

        ##Small dataset with only approx. 20k tables out of >400k
        if n_tables > max_tables:  #200000
            break
        n_tables += 1

    print("Panda errors: %d" % panda_errors)
    print("Wrong entities: %d" % wrong_entities)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
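
A hypothetical invocation of the function above; the folder paths, output file names, and table budget are placeholders:

# Hypothetical driver; all paths are placeholders.
if __name__ == "__main__":
    tablesToChallengeFormat(folder_gt="/data/wikitables/gt",
                            folder_tables="/data/wikitables/tables",
                            file_out_gt="cea_gt.csv",
                            file_out_redirects_gt="cea_gt_redirects.csv",
                            file_out_gt_target="cea_targets.csv",
                            max_tables=20000)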
Example 7
def extensionWithWikiRedirects(file_gt, folder_tables, file_out_gt,
                               file_out_redirects_gt, file_out_gt_target,
                               max_rows):
    #print(csv_file_name)

    f_out = open(file_out_gt, "a+")
    f_out_redirects = open(file_out_redirects_gt, "a+")
    f_out_target = open(file_out_gt_target, "a+")

    n_rows = 0
    panda_errors = 0

    dbpedia_ep = DBpediaEndpoint()

    table_id = ""

    dict_entities = dict()

    #READ CURRENT CACHE
    #init dict_entities with current state of file_out_redirects_gt
    with open(file_out_redirects_gt) as csv_file_redirections:

        csv_reader = csv.reader(csv_file_redirections,
                                delimiter=',',
                                quotechar='"',
                                escapechar="\\")

        #"1","0","1","http://dbpedia.org/resource/Uno_Von_Troil http://dbpedia.org/resource/Uno_von_Troil"
        for row in csv_reader:

            entity_list = row[3].split(" ")

            for e in entity_list:

                if e not in dict_entities:
                    dict_entities[e] = set(entity_list)

    with open(file_gt) as csv_file:

        csv_reader = csv.reader(csv_file,
                                delimiter=',',
                                quotechar='"',
                                escapechar="\\")

        for row in csv_reader:

            #file, col, row, URI
            #note that in Oktie's GT the order is file, row, col
            #1,1,0,http://dbpedia.org/resource/Uno_von_Troil

            if len(row) < 4:
                continue

            #entity_uri = row[3]
            entity_uri = row[3].replace("\"", "%22")

            #To avoid cases from "http://sws.geonames.org/"
            #if entity_uri.startswith("http://sws.geonames.org/"):
            same_as_resources = set()
            if not entity_uri.startswith("http://dbpedia.org/resource/"):
                #print(getFilteredResources(dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia))
                same_as_resources = getFilteredResources(
                    dbpedia_ep.getSameEntities(entity_uri), KG.DBpedia)
                #print(row[0])
                #print(row[1])
                #print(row[2])
                #print(entity_uri)

                if len(same_as_resources) == 0:
                    print("No dbpedia entity for: %s, %s, %s, %s" %
                          (row[0], row[1], row[2], entity_uri))
                else:
                    #We keep only one of the same_as dbpedia resources
                    for r in same_as_resources:
                        entity_uri = r

                    ##We will consider other same as later
                    same_as_resources.remove(entity_uri)

                #break

            entity_uri = row[3].replace("\"", "%22")

            #if int(row[0])<1000: #Jiaoyan starts from table file 1,000
            #if int(row[0])>=1000: #ernesto
            #if int(row[0])>=3: #ernesto
            #if int(row[0])<587 or int(row[0])>=1000:
            #    continue

            if not table_id == row[0]:

                #Change of table: close and reopen the output files so intermediate results are persisted
                f_out.close()
                f_out_redirects.close()
                f_out_target.close()

                table_id = row[0]
                print(table_id)

                f_out = open(file_out_gt, "a+")
                f_out_redirects = open(file_out_redirects_gt, "a+")
                f_out_target = open(file_out_gt_target, "a+")

            col_id = row[2]  #Reverse according to input
            row_id = row[1]

            csv_file_name = table_id + ".csv"

            try:
                #Try to open with pandas. If error, then discard file
                pd.read_csv(join(folder_tables, csv_file_name),
                            sep=',',
                            quotechar='"',
                            escapechar="\\")
                #df = csv.reader(csv_file)
            except Exception:
                panda_errors += 1
                continue

            entities = set()

            ##Keep an index (cache) to avoid unnecessary queries
            if entity_uri in dict_entities:
                entities.update(dict_entities[entity_uri])
            else:
                #entities=set()
                new_entities = set()

                ##Consider redirects:
                entities.update(dbpedia_ep.getWikiPageRedirect(entity_uri))
                entities.update(dbpedia_ep.getWikiPageRedirectFrom(entity_uri))
                entities.update(
                    same_as_resources)  #in case there were more than one

                #two iterations
                for e in entities:
                    new_entities.update(dbpedia_ep.getWikiPageRedirect(e))
                    new_entities.update(dbpedia_ep.getWikiPageRedirectFrom(e))

                entities.add(entity_uri)
                entities.update(new_entities)

                dict_entities[entity_uri] = set()
                dict_entities[entity_uri].update(entities)

            #Output
            #table id, column id, row id and DBPedia entity
            #9206866_1_8114610355671172497,0,121,http://dbpedia.org/resource/Norway
            line_str = '\"%s\",\"%s\",\"%s\"' % (table_id, col_id, row_id)
            f_out_target.write(line_str + '\n')

            #f_out.write('\"%s\",\"%s\",\"%s\",\"%s\"\n' % (table_id, column_id, row_id, entity_uri))

            f_out.write(line_str + ',\"%s\"\n' % entity_uri)

            #for ent in entities:
            #    line_str += ',\"'+ ent + '\"'
            line_str += ',\"' + " ".join(entities) + '\"'

            f_out_redirects.write(line_str + '\n')

            ##Number of rows
            if n_rows > max_rows:  #200000
                break
            n_rows += 1

    print("Panda errors: %d" % panda_errors)
    f_out.close()
    f_out_redirects.close()
    f_out_target.close()
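
The redirect expansion used above can be sketched in isolation; the entity URI is merely illustrative:

# Minimal sketch of the redirect expansion performed above (illustrative URI).
ep = DBpediaEndpoint()
uri = "http://dbpedia.org/resource/Uno_von_Troil"
expanded = {uri}
expanded.update(ep.getWikiPageRedirect(uri))      # URIs this page redirects to
expanded.update(ep.getWikiPageRedirectFrom(uri))  # URIs redirecting to this page
print(" ".join(expanded))  # space-separated, as written to the *_redirects ground truth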
Example 8
from ontology.onto_access import DBpediaOntology

# unpickle
# load = unpickle('training_vectors/final_original_training_vectors') # when we have training data from task to eval
load = unpickle('training_vectors/final_original_training_vectors_minus_tests')  # created own testing data from splitting train
df_positive = pd.DataFrame(load)
df_positive['polarity'] = "1"
'''create more positive samples
do this by:
- getting a different but similar entity using SPARQLEndpoint
'''

onto_access = DBpediaOntology()
onto_access.loadOntology(True)
ep = DBpediaEndpoint()  # getEntitiesForType


def get_alt_entities(entity_types):
    lis = []
    for ls in entity_types:
        # print('ls', ls)
        enty = get_last(ls)  # only get finest entity
        # print('entity:', enty)
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100) - slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
            # print('similar entity', simty)
        except Exception:
            pass
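
For reference, a minimal sketch of the endpoint call used inside get_alt_entities; the class URI is illustrative and the ep endpoint created above is reused:

# Minimal sketch of the sampling call used above (illustrative class URI).
sample = ep.getEntitiesForType("http://dbpedia.org/ontology/Band", 0, 10)  # offset 0, limit 10
print(sample)  # up to 10 entity URIs typed with dbo:Band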
Example 9
class DBPediaExtractor(object):
    '''
    classdocs
    '''

    #Steps
    # Read entities from GT, include redirections and add same_as axioms
    # Query for types
    # Query for 1 level of relationships, filter those non in dbpedia ontology

    def __init__(self):
        '''
        Constructor
        '''
        #Set up
        #self.setUpRDFGraph()
        self.entities = set()
        self.types = set()
        #prop: True -> isObjectProperty
        self.propertyType = dict()
        
        
        self.dbp_endpoint = DBpediaEndpoint()        
        self.lookup = DBpediaLookup()

    def isValidURI(self, str_uri):
        
        #use term._is_valid_unicode(str_uri)
        
        return term._is_valid_uri(str_uri) and self.isascii(str_uri)
    
    
    def isascii(self, string_original):
        
        string = self.strip_accents(string_original)  #to ignore accents 
            
        return len(string_original) == len(string.encode())
    
    
    
    def strip_accents(self, text):
            
        text = unicodedata.normalize('NFD', text)\
               .encode('ascii', 'ignore')\
               .decode("utf-8")
    
        return str(text)    
    
    
    #Precomputed
    def setUpLocalDBPediaGraph(self, file_ttl):
        self.localrdfgraph = Graph()
        self.localrdfgraph.parse(source=file_ttl, format='turtle')
        
    
    
    #To be computed
    def setUpRDFGraph(self):
        self.rdfgraph = Graph()
        #self.rdfgraph.bind(TabularToRDF.NAMESPACE_PREFIX, TabularToRDF.BASE_URI)
        self.rdfgraph.bind("foaf", "http://xmlns.com/foaf/0.1/")
        self.rdfgraph.bind("dbp", "http://dbpedia.org/property/")
        self.rdfgraph.bind("dbr", "http://dbpedia.org/resource/")
        self.rdfgraph.bind("dbo", "http://dbpedia.org/ontology/")
        self.rdfgraph.bind("owl", "http://www.w3.org/2002/07/owl#")
        
        
    
    
    def saveRDFGrah(self, rdf_file_ouput):
        #output same table name as ttl
        self.rdfgraph.serialize(str(rdf_file_ouput), format='turtle')#xml,turtle
        
        
        wrong_file_name = ""
        
        try:
        
            if "?" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("?")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
            elif "#" in rdf_file_ouput:
                wrong_file_name = rdf_file_ouput.split("#")[0]
                os.rename(wrong_file_name, rdf_file_ouput)
            
                
            #print(wrong_file_name)
        except Exception:
            print(wrong_file_name, rdf_file_ouput)



    def getTargetEntitiesCEA(self, cea_file):
        
        #Target entities per table
        self.targetEntities = dict()
        
        with open(cea_file) as csv_file:
            
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            
            for row in csv_reader:
                
                if len(row)<4:
                    continue
                
                
                uris = row[3].split(" ")
                
                #entities per table
                key = row[0] #+ "-"+ row[1] + "-"+ row[2]
                
                if key not in self.targetEntities:  
                    self.targetEntities[key]=set()
                    
                self.targetEntities[key].update(uris)

    def getEntitiesAndCreateInstancesTable(self, table_name):
        
        if table_name in self.targetEntities:
        
            for ent in self.targetEntities[table_name]:
                
                if self.isValidURI(ent):
                    self.entities.add(ent)
                    e_uri = URIRef(ent)                    
                    self.rdfgraph.add( (e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)) )
                #else:
                #    pass

    def getEntitiesAndCreateInstances(self, cea_file):
        
        with open(cea_file) as csv_file:
            
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            
            for row in csv_reader:
                
                if len(row)<4:
                    continue
                
                
                uris = row[3].split(" ")
                
                for i in range(len(uris)):
                    self.entities.add(uris[i])
                    
                i=1
                while i<len(uris):
                    
                    if self.isValidURI(uris[0]) and self.isValidURI(uris[i]):
                        e_uri1 = URIRef(uris[0])
                        e_uri2 = URIRef(uris[i])
                        self.rdfgraph.add( (e_uri1, URIRef(OWL.SAMEAS), e_uri2) )
                    else:
                        pass
                        #print("Not valid URI?", uris[0], uris[i])
                    i+=1
                    
                    
                    
            for ent in self.entities:
                if self.isValidURI(ent):
                    e_uri = URIRef(ent)
                    self.rdfgraph.add( (e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)) )
                else:
                    pass
                    #print("Not valid URI:", ent)
                
            print("Number of entities: " + str(len(self.entities)))
    
    
    
    
    def getTargetColumns(self, cea_gt_file):
        
        self.target_column = dict()
               
        #An alternative is to automatically identify the left most column with an entity mention.
        #In this particular case we know the target
        with open(cea_gt_file) as csv_file:
            
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
                        
            for row in csv_reader:
                
                if row[0] not in self.target_column or int(self.target_column[row[0]]) > int(row[1]):
                    self.target_column[row[0]] = row[1]

    def getEntitiesLookup(self, folder_cea_tables, cea_gt_file):
                
        #Lookup call for each cell in target column
        
        
        #Dictionary or cache to avoid repeated look-up
        visited_values = set() 
        
        
        #Get Target column
        self.getTargetColumns(cea_gt_file)
        
        csv_file_names = [f for f in listdir(folder_cea_tables) if isfile(join(folder_cea_tables, f))]
        
        i=0
        n=len(csv_file_names)
        t=[1, 5, 10, 50, 100, 250, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000]

        for csv_file in csv_file_names:
            
            i+=1
            
            if i in t:
                print("Getting look up entities for table %s of %s (%s)." % (i, n, datetime.datetime.now().time()))
            
            table_name = csv_file.replace(".csv", "")
            
            with open(join(folder_cea_tables, csv_file)) as csv_file:
            
                csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
                
                if table_name in self.target_column:
                    target_column = self.target_column[table_name]
                else: #End
                    continue
                    
                for row in csv_reader:
                    if len(row)<=int(target_column): 
                        continue
                    
                    if row[int(target_column)] not in visited_values:
                        
                        ##To avoid repetition
                        visited_values.add(row[int(target_column)])
                               
                        #Lookup top-3
                        dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                        for ent in dbpedia_entities:
                            
                            if self.isValidURI(ent.getId()):
                                self.entities.add(ent.getId()) ##Add to entities to extract neighbours
                                
                                e_uri = URIRef(ent.getId())
                                self.rdfgraph.add( (e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)) )
                                
                                for cls_type in ent.getTypes(KG.DBpedia):
                                    self.rdfgraph.add( (e_uri, RDF.type, URIRef(cls_type)) )
                                
                            else:
                                #print("Not valid URI:", ent.getId())
                                pass

        print("Number of extended entities with look-up: " + str(len(self.entities)))

    def getEntitiesLookupForTable(self, csv_file):
                
        #Lookup call for each cell in target column
        
        
        #Dictionary or cache to avoid repeated look-up
        visited_values = set() 
            
        table_name = csv_file.replace(".csv", "")
            
        with open(join(folder_cea_tables, csv_file)) as csv_file:
            
                csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
                
                if table_name in self.target_column:
                    target_column = self.target_column[table_name]
                else: #End
                    return
                    
                for row in csv_reader:
                    if len(row)<=int(target_column): 
                        return
                    
                    if row[int(target_column)] not in visited_values:
                        
                        ##To avoid repetition
                        visited_values.add(row[int(target_column)])
                               
                        #Lookup top-3
                        dbpedia_entities = self.lookup.getKGEntities(row[int(target_column)], 3, '')
                        for ent in dbpedia_entities:
                            
                            if self.isValidURI(ent.getId()):
                                self.entities.add(ent.getId()) ##Add to entities to extract neighbours
                                
                                e_uri = URIRef(ent.getId())
                                self.rdfgraph.add( (e_uri, RDF.type, URIRef(OWL.NAMEDINDIVIDUAL)) )
                                
                                for cls_type in ent.getTypes(KG.DBpedia):
                                    self.rdfgraph.add( (e_uri, RDF.type, URIRef(cls_type)) )
                                
                            else:
                                #print("Not valid URI:", ent.getId())
                                pass

    def getTypes(self, cta_file):
        with open(cta_file) as csv_file:
            
            csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"', escapechar="\\")
            
            for row in csv_reader:
                
                if len(row)<3:
                    continue
                
                
                self.types.add(row[2])
                
            print("Number of types: " + str(len(self.types)))



    def getAssertionsForInstances(self, use_local_graph):
        
        #avoid some properties (see entity.py)

        #Differentiate between object and data properties? probably only necessary if literal or URI
        
        #Problem if range of property is not string. It will probably not match very well in any case.
        #Solution: remove domains and ranges in dbpedia ontology properties
        #Filter by dbpedia resources and types, eg: ignore URis from wikidata and types from other taxonomies.
        
        n=0
        
        l=[1, 5, 100, 1000, 2000, 3000, 4000, 5000, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000]
        
        
        for ent in self.entities:
            
            n+=1
            
            if self.isValidURI(ent):
            
                e_uri = URIRef(ent)
                
                #if n in l:
                #    print("Extracting neighbourhood for " + str(n) + ": " + ent + " (" + str(datetime.datetime.now().time()) + ")")
                
                
                if use_local_graph:
                    dict_results = self.getLocalTriplesForSubject(ent, 100)
                else:
                    dict_results = self.dbp_endpoint.getTriplesForSubject(ent, 100)
                
                
                
                for prop in dict_results:
                    
                    #if prop.startswith("http://dbpedia.org/"): #There are other interesting properties: rdfs:label, rdf:type, foaf:name, etc.
                        
                        if self.isValidURI(prop):
                        
                            p_uri = URIRef(prop)
                            
                            isObjectProperty = self.identifyTypeOfProperty(prop)
                            
                            
                            for obj in dict_results[prop]:
                                
                                #Triple to resource
                                if obj.startswith("http") and isObjectProperty:
                                    
                                    if obj.startswith("http://dbpedia.org/resource/"):
                                    
                                        if self.isValidURI(obj):                                
                                            o_uri = URIRef(obj)                                
                                            self.rdfgraph.add( (e_uri, p_uri, o_uri) )
                                        else:
                                            #print("Not valid URI:", obj)
                                            pass
                                
                                elif not isObjectProperty: #Triple to Literal                            
                                    self.rdfgraph.add( (e_uri, p_uri, Literal(obj)) )
                                else:
                                    #print("Wrong object '%s' for property '%s' (isObjectProperty=%s)" % (obj, prop, isObjectProperty) )
                                    pass
                        else:
                            #print("Not valid URI:", prop)
                            pass
            else:
                pass
                #print("Not valid URI:", ent)
                    
                    
    def getLocalTriplesForSubject(self, ent, limit):
        
        query_str = "SELECT DISTINCT ?p ?o WHERE { <" + ent + "> ?p ?o  } limit " + str(limit)
        
        query_object = prepareQuery(query_str)#, initNs={CMR_QA.NAMESPACE_PREFIX : CMR_QA.BASE_URI})
            
        results = self.localrdfgraph.query(query_object)
        
        assertions = dict()
        
        
        for result in results:
            #print(str(result[0]), str(result[1]))
            prop = str(result[0])
            obj = str(result[1])
            if prop not in assertions:
                assertions[prop]=set()
            assertions[prop].add(obj)
        
        #print(assertions)
        
        return assertions
        
            
    def identifyTypeOfProperty(self, prop):
        
        if prop in self.propertyType:
            if self.propertyType[prop]:
                self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)) )
            else:
                self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLDATAPROPERTY)) )  
            
            return self.propertyType[prop]
        
        #Get statistics from endpoint        
        values = self.dbp_endpoint.getSomeValuesForPredicate(prop)
        
        n_values = len(values)
        n_uris = 0
        
        for v in values:
            if v.startswith("http"):
                n_uris+=1
        
        isObjectProperty =  (n_uris > (n_values/2))
        
        if isObjectProperty:
            self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)) )
            self.propertyType[prop]=True                                                                    
        else:
            self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLDATAPROPERTY)) )
            self.propertyType[prop]=False
            
        
        return isObjectProperty
        
        
    
    
    def getInstancesForTypes(self):
        #Use basic method
        
        additional_entities = set()
        
        for cls in self.types:
            
            #print("Extracting members for: " + cls)
            additional_entities = self.dbp_endpoint.getEntitiesForType(cls, 0, 100)
            
            #We also want to extract neighbourhood
            self.entities.update(additional_entities)
            
            for ent in additional_entities:
                
                if self.isValidURI(ent):
                    e_uri = URIRef(ent)
                    if cls.startswith("http://dbpedia.org/"):
                        self.rdfgraph.add( (e_uri, RDF.type, URIRef(cls)) )
                else:
                    #print("Not valid URI:", ent)
                    pass
        
        
        print("Number of extended entities with types: " + str(len(self.entities)))
                  


    #Using pre-extracted ttl/cache
    def localPropertyTypeExtractor(self):
    
            
        query_str = "SELECT DISTINCT ?p  WHERE { ?s ?p ?o . } "
        
        query_object = prepareQuery(query_str)#, initNs={CMR_QA.NAMESPACE_PREFIX : CMR_QA.BASE_URI})
        
        predicates = self.localrdfgraph.query(query_object)
        
        print("Using %s local predicates" % (len(predicates)))
        
        for p in predicates:
         
            #print(p)
            #continue
            prop = str(p[0])
            
            #print(prop)
            
            if not prop.startswith("http://dbpedia.org/"):  
                #we ignore other types of properties and focus on dbpedia ones.
                #Others will be treated as annotations (rdfs:label, foaf:name) or handled specially (rdf:type)
                continue
            
        
            query_str = "SELECT ?value WHERE { ?s <" + prop + "> ?value . } limit 100"
            
            #print(query_str)
            #continue
            #print("lalala")
            
            query_object = prepareQuery(query_str)#, initNs={CMR_QA.NAMESPACE_PREFIX : CMR_QA.BASE_URI})
            
            values = self.localrdfgraph.query(query_object)
            
            n_values = len(values)
            n_uris = 0
            
            for v in values:
                #print(v[0])
                if str(v[0]).startswith("http"):
                    n_uris+=1
            
            
            if n_values==1:
                isObjectProperty = (n_uris == n_values)
            else:   
                isObjectProperty = (n_uris > (n_values/2))
            
            #print("New: " + prop)
            if isObjectProperty:                
                #self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLOBJECTPROPERTY)) )
                self.propertyType[prop]=True                                                                    
            else:
                #self.rdfgraph.add( (URIRef(prop),  RDF.type, URIRef(OWL.OWLDATAPROPERTY)) )
                self.propertyType[prop]=False
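
A hypothetical driver following the "Steps" comment at the top of DBPediaExtractor; the CEA/CTA ground-truth files and output path are placeholders:

# Hypothetical driver; file names are placeholders.
extractor = DBPediaExtractor()
extractor.setUpRDFGraph()
extractor.getEntitiesAndCreateInstances("cea_gt_redirects.csv")  # entities + owl:sameAs axioms
extractor.getTypes("cta_gt.csv")                                 # candidate classes
extractor.getInstancesForTypes()                                 # additional members per class
extractor.getAssertionsForInstances(use_local_graph=False)       # one level of endpoint neighbourhood
extractor.saveRDFGrah("dbpedia_fragment.ttl")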
Example 10
    def __init__(self):

        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()
Example 11
class JSONUtilities(object):
    def __init__(self):

        self.smartlookup = Lookup()
        self.smartendpoint = Endpoint()
        self.dbpedia_ep = DBpediaEndpoint()

    def validateEntityToClasses(self, path, file_in, file_out):

        with open(path + file_in) as f:
            data = json.load(f)

        data_new = dict()

        no_types = 0
        empty_ref = 0
        missing_cases = 0
        wrong_cases = 0
        empty_cases = 0

        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'w')
        tmp_f2 = open(path + file_out.replace('.json', '') + '_issues.csv',
                      'w')

        for entity in data:

            types_tocheck = set(data[entity])
            types_ref = self.smartlookup.getTypesForEntity(entity, KG.DBpedia)

            if is_empty(types_ref):

                if is_empty(types_tocheck):
                    #Some issues with disambiguation pages
                    no_types += 1
                else:
                    ##Solved!
                    empty_ref += 1  #Some uris are redirects...

                #We use the original types
                data_new[entity] = data[entity]

                tmp_f.write('%s,%s\n' % (entity, ",".join(types_tocheck)))

                continue

            #New set of corrected types
            data_new[entity] = list(types_ref)  #json expects a list

            tmp_f.write('%s,%s\n' % (entity, ",".join(types_ref)))

            #print("Checking", entity, len(types_ref), len(types_tocheck))
            #print("Checking %s: %s vs %s" % (entity, types_ref, types_tocheck))

            #Statistics
            missing = types_ref.difference(types_tocheck)
            wrong = types_tocheck.difference(types_ref)

            if len(missing) > 0 or len(wrong) > 0:
                print("Issues with: " + entity)
                if len(missing) > 0:
                    print("\tMissing types: ", missing)
                    missing_cases += 1
                    if len(types_tocheck) == 0:
                        empty_cases += 1

                if len(wrong) > 0:
                    print("\tWrong types", wrong)
                    wrong_cases += 1

                tmp_f2.write("Entity,%s.\nMising,%s\nWrong:%s\n" %
                             (entity, ",".join(missing), ",".join(wrong)))

        #We save the new types
        self.dumpJsonFile(data_new, path + file_out)

        tmp_f2.write("Cases with wrong types: %s\n" % (str(wrong_cases)))
        tmp_f2.write("Cases with missing types: %s\n" % (str(missing_cases)))
        tmp_f2.write("Cases with empty types : %s\n" % (str(empty_cases)))
        tmp_f2.write("Cases with empty new types: %s\n" % (str(empty_ref)))
        tmp_f2.write("Cases with no types at all: %s\n" % (str(no_types)))

        tmp_f.close()
        tmp_f2.close()

        print("Cases with wrong types: " + str(wrong_cases))
        print("Cases with missing types: " + str(missing_cases))
        print("Cases with empty types: " + str(empty_cases))
        print("Cases with empty new types: " + str(empty_ref))
        print("Cases with no types at all: " + str(no_types))

    def createTriplesForClasses(self, path, class_file_r, class_file_s,
                                file_out):

        tmp_f = open(path + file_out.replace('.json', '') + '.csv', 'a+')

        #Read candidate classes
        classes = set()
        e_classes = json.load(open(path + class_file_r))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)

        #print(len(classes))

        e_classes = json.load(open(path + class_file_s))
        for c_list in e_classes.values():
            for c in c_list:
                classes.add(c)

        #print(len(classes))

        #Play with different numbers depending on the cost....
        #For each class extract 50-100-200 entities

        #Tests
        #entities = self.smartendpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/BaseballTeam", 100)
        #for e, label in entities.items():
        #    print(e, list(label)[0])
        #classes = set()

        #Dict to convert to json
        #class_triples = dict()
        cache_file = path + file_out
        class_triples = json.load(
            open(cache_file)) if os.path.exists(cache_file) else dict()

        print("Class triples initial size", str(len(class_triples)))

        for c_uri in classes:

            print(c_uri)

            if c_uri in class_triples:  #already analysed/cached
                print("\tAlready cached!")
                continue

            #if len(class_triples)>5:
            #    break

            i = time.time()

            tmp_f.write('%s\n' % (c_uri))

            #Dictionary entity-label
            entities = self.smartendpoint.getEntitiesForDBPediaClass(
                c_uri, 500)

            #For each above entity (?o) extract triples ?s ?p ?o, together with the label of ?o
            #Extract 10-50 triples per entity; filter out the predicates (?p) we aim to discard (see the top ones)

            triples = list()

            for object_uri in entities:
                '''
                '''
                #label
                label = list(entities[object_uri])[0]

                #Triples for object entity
                subjects_predicates = self.dbpedia_ep.getTriplesForObject(
                    object_uri, 50)

                for subject in subjects_predicates:
                    for predicate in subjects_predicates[subject]:

                        triple = [subject, predicate, object_uri, label]
                        triples.append(triple)

                        tmp_f.write('%s\n' % (",".join(triple)))

            #end for entities

            print("\tTriples", len(triples))
            class_triples[c_uri] = triples

            #We dump after each class, so that if the process breaks we can continue from there
            self.dumpJsonFile(class_triples, path + file_out)

            e = time.time()

            print("Time:", e - i)

        #end for classes

        #We save the new triples
        tmp_f.close()
        print(len(class_triples), path + file_out)
        self.dumpJsonFile(class_triples, path + file_out)

    #TBC
    def validateClassTriples(self, file):

        with open(file) as f:
            data = json.load(f)

            predicate_count = dict()

            n_triples = 0

            empty_entries = 0

            for entity in data:
                subjects = set()
                predicates = set()
                objects = set()

                print(entity, len(data[entity]))

                if len(data[entity]) == 0:
                    empty_entries += 1

                n_triples += len(data[entity])

                n_triples_class = 0

                for triple in data[entity]:

                    if triple[1] in URI_KG.avoid_predicates:
                        continue

                    if not triple[1].startswith(
                            URI_KG.dbpedia_uri) and not triple[1].startswith(
                                URI_KG.dbpedia_uri_property):
                        continue

                    n_triples_class += 1

                    subjects.add(triple[0])

                    if triple[1] not in predicate_count:
                        predicate_count[triple[1]] = 0

                    predicate_count[triple[1]] += 1

                    predicates.add(triple[1])
                    objects.add(triple[2])
                    #print("\t",data[entity][0])

                print(
                    "\t Different Triples, Subjects, predicates, objects: %s, %s, %s, %s"
                    % (str(n_triples_class), str(len(subjects)),
                       str(len(predicates)), str(len(objects))))

            print("Empty entries", empty_entries)

            predicate_count_sorted = OrderedDict(
                sorted(predicate_count.items(), key=lambda x: x[1]))

            #for k, v in predicate_count_sorted.items():
            #    print(k, v)

            print(len(data), n_triples)

    def dumpJsonFile(self, data_json, file):
        with open(file, "w") as write_file:
            json.dump(data_json, write_file)
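
A hypothetical usage of JSONUtilities; the path and file names are placeholders:

# Hypothetical usage; path and file names are placeholders.
ju = JSONUtilities()
ju.validateEntityToClasses("/data/semtab/", "entity_classes.json", "entity_classes_clean.json")
ju.createTriplesForClasses("/data/semtab/", "classes_round1.json", "classes_round2.json", "class_triples.json")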
Example 12
class Endpoint(object):
    '''
    This class aims at identifying errors in the DBpedia endpoint when retrieving samples for training:
    positive/negative samples for candidate classes
    '''
    '''
    def queryTripleByClass(top_k, c):
    triples = list()
    s = sparql.Service(SPARQL_END_POINT, "utf-8", "GET")
    statement = 'select distinct str(?s), str(?p), str(?o), str(?l) where {?s ?p ?o. ?o rdf:type <%s>. ' \
                '?o rdfs:label ?l. FILTER( langMatches(lang(?l), "en"))} ORDER BY RAND() limit %d' % (c, top_k)
    result = s.query(statement)
    for row in result.fetchone():
        triples.append([row[0], row[1], row[2], row[3]])
    return triples
    '''
    def __init__(self):
        '''
        Constructor
        '''

        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

        self.lookup = Lookup()

    def __analyseEntityPredicateStrategy(self, ent, cls_uri):
        '''
        Analyses correctness of cls_uri as type of ent using Predicate types strategy
        '''

        predicate_types = self.lookup.__getTypesPredicateStrategy(ent)

        if len(predicate_types) == 0:
            return None

        if cls_uri in predicate_types:
            return True

        return False

    def __analyseEntityLooukStrategy(self, ent, cls_uri):
        '''
        Analyses correctness of cls_uri as type of ent using Look-up types
        '''

        #Note that if there are no look-up types, the sparql types are kept as they are
        ##If there are look-up types, the sparql types must be compatible with them
        clean_lookup_types = self.lookup.getTypesForEntity(ent, KG.DBpedia)

        if len(clean_lookup_types) == 0:
            return None

        if cls_uri in clean_lookup_types:
            return True

        return False

    def __analyseEntityWikidataStrategy(self, ent, cls_uri, wikidata_classes):
        '''
        Analyses correctness of cls_uri as type of ent using wikidata
        '''

        #b. Get equivalent wikidata entity (if any)
        same_entities = self.dbpedia_ep.getSameEntities(ent)

        wikidata_entities = getFilteredResources(
            same_entities, KG.Wikidata)  ##typically one entity

        ##If no equivalent entities we then go for the lookup strategy
        if len(wikidata_entities) == 0:
            return self.__analyseEntityLooukStrategy(ent, cls_uri)

        #print(wikidata_entities)
        for wk_ent in wikidata_entities:
            #c. Check if wikidata type from (a) is within types of equivalent entity from (b)

            #print("\t"+wk_ent)
            wk_ent_types = self.wikidata_ep.getAllTypesForEntity(
                wk_ent)  #we consider supertypes to extend compatibility
            time.sleep(0.01)  #to avoid limit of calls

            intersect = wk_ent_types.intersection(wikidata_classes)

            if len(intersect) > 0:
                return True

        return False

    def getEntitiesForDBPediaClass(self, cls_uri, limit=1000):
        '''
        It currently expects a URL from DBpedia
        '''

        ##We query a subset of entities for sampling
        clean_db_entities = dict()

        offset = 0

        #To guarantee the required number of (clean) entities for the class
        while len(clean_db_entities) < limit:

            #db_entities = self.dbpedia_ep.getEntitiesForType(cls_uri, offset*limit*5, limit*5) #We extract more than required as many of them will be noisy
            db_entities = self.dbpedia_ep.getEntitiesLabelsForType(
                cls_uri, offset * limit * 5, limit * 5)
            #print("Entities",len(db_entities))

            #For wikidata strategy
            #a. Get equivalent class from wikidata (if any)
            db_eq_cls = self.dbpedia_ep.getEquivalentClasses(cls_uri)
            wikidata_classes = getFilteredTypes(
                db_eq_cls, KG.Wikidata)  ##typically one class

            filtered_look_up = 0
            filtered_wikidata = 0
            filtered_predicates = 0

            for ent in db_entities:
                if len(clean_db_entities) >= limit:
                    print("%d, %d, %d, %d" %
                          (len(clean_db_entities), filtered_look_up,
                           filtered_predicates, filtered_wikidata))
                    return clean_db_entities

                results_look_up = self.__analyseEntityLooukStrategy(
                    ent, cls_uri)

                if results_look_up == None:

                    results_predicates = self.__analyseEntityPredicateStrategy(
                        ent, cls_uri)

                    if results_predicates == None:

                        if self.__analyseEntityWikidataStrategy(
                                ent, cls_uri, wikidata_classes
                        ):  #wikidata strategy (it is very costly)
                            clean_db_entities[ent] = db_entities[ent]
                        else:
                            filtered_wikidata += 1
                            #print ("Entity filtered by wikidata", ent)

                    elif results_predicates:  #passed predicates strategy
                        clean_db_entities[ent] = db_entities[ent]
                    else:
                        #print ("Entity filtered by predicates", ent)
                        filtered_predicates += 1

                elif results_look_up:  #passed look-up strategy
                    clean_db_entities[ent] = db_entities[ent]
                else:
                    #print ("Entity filtered by look-up", ent)
                    filtered_look_up += 1

            #OLD STRATEGY: too slow
            #if (len(wikidata_classes)==0): ## No wikidata class then look-up strategy
            #    for ent in db_entities:
            #        if len(clean_db_entities)>=limit:
            #            return clean_db_entities
            #        if self.__analyseEntityLooukStrategy(ent, cls_uri):
            #            clean_db_entities.add(ent)
            #else:
            #    for ent in db_entities:
            #        if len(clean_db_entities)>=limit:
            #            return clean_db_entities
            #        if self.__analyseEntityWikidataStrategy(ent, cls_uri, wikidata_classes):
            #            clean_db_entities.add(ent)

            #print(len(clean_db_entities))
            offset += 1

            #Limit of iterations
            if offset > 5:
                print("%d, %d, %d, %d" %
                      (len(clean_db_entities), filtered_look_up,
                       filtered_predicates, filtered_wikidata))
                return clean_db_entities

        print("%d, %d, %d, %d" % (len(clean_db_entities), filtered_look_up,
                                  filtered_predicates, filtered_wikidata))
        return clean_db_entities
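
A hypothetical call mirroring the commented test in Example 11; the class URI and limit are illustrative:

# Hypothetical usage: sample up to 100 "clean" entities (with labels) for a DBpedia class.
endpoint = Endpoint()
entities = endpoint.getEntitiesForDBPediaClass("http://dbpedia.org/ontology/BaseballTeam", 100)
for e, label in entities.items():
    print(e, list(label)[0])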
Example 13
class Lookup(object):
    '''
    This class aims at providing lookup access to the KG with minimal errors.
    It may also optionally combine several KGs rather than only one.
    '''
    def __init__(self):  #KGraph=KG.DBpedia
        '''
        Constructor
        '''
        #Return types from this knowledge graph
        #self.KGraph = KGraph

        self.dbpedia_onto = DBpediaOntology()
        self.dbpedia_onto.loadOntology(True)
        self.schema_onto = SchemaOrgOntology()
        self.schema_onto.loadOntology(True)

        self.dbpedia_ep = DBpediaEndpoint()
        self.wikidata_ep = WikidataEndpoint()

    def getTypesForEntity(self, uri_entity, kg=KG.DBpedia):

        #print("QUERY",uri_entity)

        if kg == KG.DBpedia:
            types = set()
            types_redirects = set()

            #Original entity
            types.update(self.getTypesForDBPediaEntity(uri_entity))

            #Redirects if any
            #See dbo:wikiPageRedirects -> similar to same_as inside dbpedia
            redirects = self.dbpedia_ep.getWikiPageRedirect(uri_entity)

            for uri_redirect in redirects:  #Typically only one
                types_redirects.update(
                    self.getTypesForDBPediaEntity(uri_redirect))

            if len(types) == 0:  #We use the ones of the redirects
                types.update(types_redirects)
            else:  #types of redirects can be dirty
                for t in types_redirects:
                    if self.__checkCompatibilityTypes(t, types):
                        types.add(t)

            #Commented because it was slow for a large dataset
            #if is_empty(types) or (len(types)==1 and "Agent" in list(types)[0]):
            #    #Wikidata strategy to complement if empty endpoint and look-up or only type "Agent"
            #    print(types)
            #    types.update(self.__getTypesWikidataStrategy(uri_entity))
            #    print(types)

            return types

        #TBC
        elif kg == KG.Wikidata:
            pass
        elif kg == KG.Google:
            pass

        return set()

    def __getTypesLookupStrategy(self, uri_entity):

        kg = KG.DBpedia

        label = uri_entity
        if uri_entity.startswith(URI_KG.dbpedia_uri_resource):
            label = uri_entity.replace(URI_KG.dbpedia_uri_resource, '')

        ##we call our method to get look-up types for the URI. Only SPARQL endpoint types may contain errors
        #It also includes wikidata strategy inside
        entities = self.getKGEntities(label, 10,
                                      uri_entity)  #filter by uri_entity

        ##In case not match in look up
        if is_empty(entities):

            types = set()

            #DBpedia endpoint strategy
            types_endpoint = getFilteredTypes(
                self.dbpedia_ep.getAllTypesForEntity(uri_entity), KG.DBpedia)

            #Predicates strategy (uses top types)
            types_domain_range = self.__getTypesPredicateStrategy(uri_entity)

            if len(types_domain_range) > 0:

                #They can be noisy, so do not add them yet
                #types.update(types_domain_range)

                ##Check compatibility of types_endpoint
                for t in types_endpoint:

                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(
                                t, types_domain_range):
                            types.add(t)

                #If no compatible types we just use the ones coming from domain/ranges
                if len(types) > 0:
                    types.update(types_domain_range)

            #If still empty we use
            if len(types) == 0:
                #We add endpoint types
                types.update(types_endpoint)

            return types

        else:
            ##should be only one element from lookup
            for entity in entities:
                return entity.getTypes(kg)

    def __getTypesPredicateStrategy(self, uri_entity):
        '''
        Exploits the domain and range types of the predicates in triples with uri_entity as subject or object
        '''

        types = set()

        #Top-2
        types.update(
            getFilteredTypes(
                self.dbpedia_ep.getTopTypesUsingPredicatesForObject(
                    uri_entity, 2), KG.DBpedia))

        #We only use the top one here, as there are fewer properties associated with an entity, and a single wrong one is enough to pollute the top-k

        #Top-1
        #Error-prone as many properties are not properly used
        #Only uses if current range types are compatible
        types_domain = getFilteredTypes(
            self.dbpedia_ep.getTopTypesUsingPredicatesForSubject(
                uri_entity, 1), KG.DBpedia)

        #if len(types)==0:
        #    types.update(types_domain)
        #else:
        if len(types) > 0:
            for t in types_domain:
                if self.__checkCompatibilityTypes(t, types):
                    types.add(t)

        #TODO: Alternative Strategies: use intersection of types_range and types_domain in non empty. May increase recall (remove min input/output edges in queries) and precision
        #If empty then use as now.

        return types

    def __getTypesWikidataStrategy(self, uri_entity):

        print("\tUsing wikidata strategy for " + uri_entity)

        #Gets equivalent wikidata entities
        same_entities = self.dbpedia_ep.getSameEntities(uri_entity)
        wikidata_entities = getFilteredResources(
            same_entities, KG.Wikidata)  ##typically one entity

        wk_ent_types = set()
        dp_types = set()
        dp_types_all = set()

        if len(wikidata_entities) == 0:
            return wk_ent_types

        for wk_ent in wikidata_entities:

            #print("WK ent: "+wk_ent)

            #Get types for wikidata entities
            wk_ent_types.update(self.wikidata_ep.getAllTypesForEntity(
                wk_ent))  #we consider all supertypes to extend compatibility

            #Problematic concept
            #Wikimedia disambiguation page
            if URI_KG.wikimedia_disambiguation_concept in wk_ent_types:
                wk_ent_types.clear()

            #Check if: wk_ent_types

        for t in wk_ent_types:
            #print("WK cls: " +t)
            #Get equivalent dbpedia types
            #print(self.wikidata_ep.getEquivalentClasses(t))
            dp_types.update(
                getFilteredTypes(self.wikidata_ep.getEquivalentClasses(t),
                                 KG.DBpedia))

        #get superclasses
        for t in dp_types:
            #print("DBp type: " +t)
            dp_types_all.update(self.dbpedia_ep.getAllSuperClasses(t))

        return getFilteredTypes(dp_types_all, KG.DBpedia)

    def getTypesForDBPediaEntity(self, uri_entity):

        #types=set()

        #Types from DBpedia Endpoint may be dirty. So we use 2 strategies: wikidata and lookup types
        #TODO: check compatibility among strategies?

        #Look-up strategy  also includes wikidata strategy
        types = self.__getTypesLookupStrategy(uri_entity)

        #print("Main", types)
        return types

        ##Additional strategy...
        #Check equivalent entity from wikidata, get classes from wikidata, get equivalent in dbpedia endpoint

    def getKGEntities(self, cell, limit=5, filter=''):
        '''
        Given the text of a cell extracts entity objects.
        Note that an entity contains an id, a label, a description, a set of types from dbpedia, wikidata and schema.org,
        and the source (dbpedia, wikidata or google)
        '''

        #Strategy:
        #0. Incremental repair: start with sth simple
        #1. Identify cases where dbpedia lookup and endpoint do not agree
        #2. We have dbpedia (may require a fix...) and schema.org taxonomies to identify conflicting branches
        #3. Use alignment to identify same entities and have more evidence about types
        #3a. Lexical
        #3b. Based on embeddings
        #4. If not provided mapping among classes (there seem to be available mappings), we may use alignment as well (lexical and embedding)

        query = cell

        #Get KG entities from DBpedia, Wikidata and KG
        #One could also return the most accurate 5 types combining the 3 KGs... (limit 20 of each of them and then retrieve top-k)
        dbpedia = DBpediaLookup()
        dbpedia_entities = dbpedia.getKGEntities(query, limit, filter)

        #We complement with types from endpoint and check if they are correct/compatible
        for entity in dbpedia_entities:
            self.__analyseEntityTypes(entity)

        return dbpedia_entities

        #Next steps
        #Find equivalent entities from wikidata (using both wikidata and dbpedia endpoints),
        #then its types and then try to find conflictive types (it could even be by voting)
        '''
        kg = GoogleKGLookup()
         
        wikidata = WikidataAPI()
        
       
        '''

    def __analyseEntityTypes(self, entity):

        #print(entity.getId())

        #print("\t"+str(entity.getTypes(KG.DBpedia)))

        #Filter by type?
        types_endpoint = getFilteredTypes(
            self.dbpedia_ep.getAllTypesForEntity(entity.getId()), KG.DBpedia)

        #print("\t"+str(types_endpoint))

        if len(entity.getTypes()) > 0:

            for t in types_endpoint:

                if t not in entity.getTypes():

                    ##Evaluate compatibility with lookup types.
                    ##In same branch
                    ##We use DBpedia for now
                    if self.__checkCompatibilityTypes(
                            t, entity.getTypes(KG.DBpedia)):
                        entity.addType(t)

        else:  #No types from lookup

            #We use wikidata strategy
            #Not great for compatibility as we need to better explore the returned types
            #types_wk_strategy = self.__getTypesWikidataStrategy(entity.getId())

            #We use range-domain-predicate strategy (uses top-types)
            types_domain_range = self.__getTypesPredicateStrategy(
                entity.getId())

            if len(types_domain_range) > 0:

                #They can be noisy, so do not add them yet
                #entity.addTypes(types_domain_range)

                ##Check compatibility of types_endpoint
                for t in types_endpoint:

                    if t not in types_domain_range:
                        if self.__checkCompatibilityTypes(
                                t, types_domain_range):
                            entity.addType(t)

                #DANGEROUS, as domain and range types contain many errors
                #If no compatible type we just use the ones coming from domain/ranges
                #if len(entity.getTypes())>0:
                #    entity.addTypes(types_domain_range)

            #If still empty we use endpoint
            if len(entity.getTypes()) == 0:
                #We add endpoint types
                entity.addTypes(types_endpoint)

            ##Last resort if no types
            if len(entity.getTypes()) > 0:
                entity.addTypes(types_domain_range)

            #We complement with wikidata strategy
            #entity.addTypes(types_wk_strategy)

        #print("\t"+str(entity.getTypes(KG.DBpedia)))

    '''
    We check if the source type (endpoint) is among descendants or ancestors of at least one of the target types (lookup)
    '''

    def __checkCompatibilityTypes(self, cls_source_uri, target_types):

        for cls_target_uri in target_types:
            if self.__isCompatibleType(cls_source_uri, cls_target_uri):
                return True

        return False

    '''
    We check if the source type is among descendants or ancestors of the target type
    '''

    def __isCompatibleType(self, cls_source_uri, cls_target_uri):

        cls_source = self.dbpedia_onto.getClassByURI(cls_source_uri)
        cls_target = self.dbpedia_onto.getClassByURI(cls_target_uri)

        ##TODO  We rely on DBpedia only for now
        if cls_source == None or cls_target == None:
            return False

        ancestors = self.dbpedia_onto.getAncestorsURIs(cls_target)
        descendants = self.dbpedia_onto.getDescendantURIs(cls_target)

        if cls_source_uri in ancestors or cls_source_uri in descendants:
            return True

        return False
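
A hypothetical usage of the Lookup wrapper above; the query string and entity URI are illustrative:

# Hypothetical usage of the Lookup wrapper; query and URI are illustrative.
lookup = Lookup()
entities = lookup.getKGEntities("Chicago Bulls", 5)  # look-up candidates with repaired types
types = lookup.getTypesForEntity("http://dbpedia.org/resource/Chicago_Bulls", KG.DBpedia)
print(types)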
Example 14
''' author: Eleanor Bill @eljne '''
''' create vectors for additional training data - +ve - CONTINUED'''
''' takes about four-five hours w/10 samples per question - 180,000 '''

from kg.EB_classes import pickl, unpickle, nouns_list, noun_phrases_list
import pandas as pd
from kg.endpoints import DBpediaEndpoint

pos = unpickle('training_vectors/11_train_new_positive_samples')
new_positive_samples = pd.DataFrame(pos)
print('unpickled')
ep = DBpediaEndpoint()
# print(new_positive_samples.head)


def get_nouns(entity):
    labels = ep.getEnglishLabelsForEntity(entity)
    nouns = nouns_list(labels)
    print('.')
    return nouns


def get_nps(entity):
    labels = ep.getEnglishLabelsForEntity(entity)
    nps = noun_phrases_list(labels)
    print('..')
    return nps


def apply_endpoint_alt(entity):  # find types
    types = ep.getTypesForEntity(entity)  # limit to 5