Example #1
def test_generate():
    path = "tests"

    #dir_path = os.path.dirname(os.path.realpath(path)) + "/" + path
    dir_path = os.getcwd() + "/tests"
    src_path = dir_path + "/test2.turtle"
    dest_path = dir_path + "/test2.hdt"

    print("Source: %s" % src_path)
    print("Dest  : %s" % dest_path)

    doc = hdt.generate_hdt(src_path, "aff4://foo")
    retcode = doc.save_to_hdt(dest_path)
    assert retcode == 0

    document = HDTDocument(dest_path)

    (triples, triplesCard) = document.search_triples("missingtriple", "", "")
    assert triplesCard == 0

    (triples, triplesCard) = document.search_triples("", "", "")
    assert triplesCard == 12

    (triples, triplesCard) = document.search_triples(
        "aff4://5aea2dd0-32b4-4c61-a9db-677654be6f83//test_images/AFF4-L/dream.txt",
        "", "")
    assert triplesCard == 12

    os.unlink(dest_path)
Example #2
def extract_dbpedia(superclass):
    """ Get edgelist for superclass and all its subclasses """
    edgelist = []
    instances = set()
    doc = HDTDocument(run.config["kg_source"])
    subject_limit = run.config["subject_limit"]
    predicate_limit = run.config["predicate_limit"]
    subclasses = query_subclasses(superclass)
    print("[Info] query instances for each subclass")
    for subclass in tqdm(subclasses):
        if subject_limit > 0:
            (triples, count) = doc.search_triples("", rdf + "type", subclass, limit=subject_limit)
        else:
            (triples, count) = doc.search_triples("", rdf + "type", subclass)
        for triple in triples:
            instances.add(triple[0])
    print("[Info] query predicates for each instance")
    for subject in tqdm(instances):
        if predicate_limit > 0:
            triples = doc.search_triples(subject, "", "", limit=predicate_limit)[0]
        else:
            (triples, count) = doc.search_triples(subject, "", "")
        for triple in triples:
            # Either blacklist
            if triple[1] not in blacklist:
                edgelist.append((triple[0], triple[1]))
            # Or whitelist
            # if triple[1] in whitelist:
            #     edgelist.append((triple[0], triple[1]))
    return list(set(edgelist)) # Exclude duplicate entity-property relations
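# A minimal usage sketch (not part of the original function). It assumes the
# module-level run.config, rdf prefix, blacklist and query_subclasses used above,
# and a hypothetical superclass URI; the returned (entity, property) pairs can
# then be loaded into a networkx graph.
import networkx as nx

edges = extract_dbpedia("http://dbpedia.org/ontology/MusicalArtist")
bigraph = nx.Graph()
bigraph.add_edges_from(edges)  # entities on one side, properties on the other
print(bigraph.number_of_nodes(), "nodes,", bigraph.number_of_edges(), "edges")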
Example #3
def extract_wikidata(classname, typeproperty):
    doc = HDTDocument("kg/wikidata-20170313-all-BETA.hdt")
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    wd_classes = {
        "BoxerWikidata" : "Q11338576",
        "CyclistWikidata": "Q2309784",
        "CapitalWikidata" : "Q5119",
        "CountryWikidata" : "Q6256",
        "MetroAreaWikidata" : "Q1907114",
        "GeographicRegionWikidata" : "Q82794",
        "FilmFestivalWikidata" : "Q220505",
    }
    edgelist = []
    instances = set()
    (triples, count) = doc.search_triples("", f"{wdt}{typeproperty}", f"{wd}{wd_classes[classname]}")

    for triple in triples:
        instances.add(triple[0])

    for instance in tqdm(instances, total=len(instances)):
        (triples, count) = doc.search_triples(instance, "", "")
        for triple in triples:
            if triple[1] not in blacklist:
                edgelist.append((triple[0], triple[1]))

    return list(set(edgelist)) # Exclude duplicate entity-property relations
Example #4
def extract_by_instance(fn, wdt_class, property, out=True):

    doc = HDTDocument(fn)

    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"

    properties = {"instance_of": "P31", "occupation": "P106"}

    instances = set()

    (triples, count) = doc.search_triples("", f"{wdt}{properties[property]}",
                                          f"{wd}{wdt_class}")

    for triple in tqdm(triples, total=count):
        instances.add(triple[0])

    with open(f'{wdt_class}.csv', "w") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')

        for instance in tqdm(instances, total=len(instances)):
            if out:
                pattern = (instance, "", "")
            else:
                pattern = ("", "", instance)

            (triples, count) = doc.search_triples(*pattern)

            for triple in triples:
                if out:
                    spamwriter.writerow([triple[0], triple[1]])
                else:
                    spamwriter.writerow([triple[2], triple[1]])
Example #5
class HDT(KG):
    def __init__(self, hdt_file_path: str):
        self.hdt = HDTDocument(hdt_file_path)

    def predicate_objects(self, subject: str) -> Iterator[Tuple[str, str]]:
        (triples, cardinality) = self.hdt.search_triples(subject, "", "")
        for s, p, o in triples:
            yield p, o

    def subjects(self, predicate: str, obj: str) -> Iterator[str]:
        (triples, cardinality) = self.hdt.search_triples("", predicate, obj)
        for s, p, o in triples:
            yield s

    def triples(self, subject: str, predicate: str,
                obj: str) -> Iterator[Tuple[str, str, str]]:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, obj)
        for s, p, o in triples:
            yield (s, p, o)

    def objects(self, subject: str, predicate: str) -> Iterator[str]:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, "")
        for s, p, o in triples:
            yield o

    def count(self, subject: str, predicate: str, obj: str) -> int:
        (triples,
         cardinality) = self.hdt.search_triples(subject, predicate, obj)
        return cardinality

    def total_triples(self) -> int:
        return self.hdt.total_triples

    def nb_subjects(self) -> int:
        return self.hdt.nb_subjects

    def nb_predicates(self) -> int:
        return self.hdt.nb_predicates

    def nb_objects(self) -> int:
        return self.hdt.nb_objects

    def nb_shared(self) -> int:
        return self.hdt.nb_shared

    def get_schema_description(self, resource: str) -> Optional[str]:
        """Get english description of the specified resource.
        Use the http://schema.org/description property.
        Trailing double quotes and @en are removed!"""
        for o in self.objects(resource, "http://schema.org/description"):
            if o.endswith("@en"):
                # delete trailing @en and double quotes
                input_str = o[1:len(o) - 4]
                # input_str = re.sub(r'\d+', '', input_str) # remove numbers
                # input_str = input_str.translate(str.maketrans("","", string.punctuation)) # Punctuation removal
                # input_str = input_str.strip().lower() # To remove leading and ending spaces and put it in lower case
                return input_str
        return None
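# A minimal usage sketch for the wrapper above, assuming a local "dataset.hdt"
# file (hypothetical path) and that the KG base class needs no extra setup.
kg = HDT("dataset.hdt")
print("triples in file:", kg.total_triples())

# all (predicate, object) pairs of one (hypothetical) subject
for p, o in kg.predicate_objects("http://example.org/resource/Foo"):
    print(p, o)

# cardinality of a triple pattern without materializing the matches
print("descriptions:", kg.count("", "http://schema.org/description", ""))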
Example #6
def generate():
    hdt_file = None
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'

    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, p, o) in subclass_triples:
            # store it in a csv file
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            # print ([s_id, s, o_id, o])
            count += 1
    print ('total entries = ', count)
Example #7
def extract_classes(fn):
    doc = HDTDocument(fn)

    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    types = set()
    types_dct = {}
    (triples, count) = doc.search_triples("", rdf_type, "")

    for triple in tqdm(triples, total=count):
        types.add(triple[2])

    for type in tqdm(types):
        (instances, instance_count) = doc.search_triples("", rdf_type, type)
        types_dct[type] = instance_count

    return types_dct
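# Usage sketch (the HDT file name is a placeholder): extract_classes returns a
# dict mapping each class URI to the cardinality reported for "?s rdf:type <class>",
# so the most populated classes can be listed by sorting that dict.
counts = extract_classes("kg/dbpedia.hdt")
for cls, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(n, cls)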
Example #8
def load_KG(path_file, predicate_string, orientation=True):
    # load the file according to the given predicate
    hdt_file = HDTDocument(path_file)
    (triples, cardinality) = hdt_file.search_triples('', predicate_string, '')
    for (s, _, o) in triples:
        if orientation:
            graph.add_edge(s, o)
        else:
            graph.add_edge(o, s)
Example #9
def get_nb_triples(file_path: str, format: str) -> int:
    if format == 'nt':
        return wccount(file_path)
    elif format == 'hdt':
        doc = HDTDocument(file_path, indexed=False)
        _, nb_triples = doc.search_triples("", "", "")
        return nb_triples
    else:
        raise Exception(f'Unsupported RDF format: "{format}"')
Example #10
    def parsefile(self, file_path):
        """Parse an HDT file as an N-Triples file."""

        from hdt import HDTDocument

        doc = HDTDocument(file_path, indexed=False)
        iterator, _ = doc.search_triples("", "", "")
        self.iterator = iterator
        self.parse()
Example #11
class HDTFileConnector(DatabaseConnector):
    """A HDTFileConnector search for RDF triples in a HDT file"""
    def __init__(self, file):
        super(HDTFileConnector, self).__init__()
        self._hdt = HDTDocument(file)

    def search_triples(self, subject, predicate, obj, limit=0, offset=0):
        """
            Get an iterator over all RDF triples matching a triple pattern.

            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - object ``string`` - Object of the triple pattern
                - limit ``int=0`` ``optional`` -  LIMIT modifier, i.e., maximum number of RDF triples to read
                - offset ``int=0`` ``optional`` -  OFFSET modifier, i.e., number of RDF triples to skip

            Returns:
                A Python iterator over RDF triples matching the given triples pattern
        """
        subject = subject if (subject is not None) and (
            not subject.startswith('?')) else ""
        predicate = predicate if (predicate is not None) and (
            not predicate.startswith('?')) else ""
        obj = obj if (obj is not None) and (not obj.startswith('?')) else ""
        return self._hdt.search_triples(subject,
                                        predicate,
                                        obj,
                                        offset=offset,
                                        limit=limit)

    @property
    def nb_triples(self):
        return self._hdt.total_triples

    @property
    def nb_subjects(self):
        """Get the number of subjects in the database"""
        return self._hdt.nb_subjects

    @property
    def nb_predicates(self):
        """Get the number of predicates in the database"""
        return self._hdt.nb_predicates

    @property
    def nb_objects(self):
        """Get the number of objects in the database"""
        return self._hdt.nb_objects

    def from_config(config):
        """Build an HDTFileConnector from a config file"""
        if not os.path.isfile(config["file"]):
            raise Exception("HDT file not found: {}".format(
                config["file"]))
        return HDTFileConnector(config["file"])
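# A small usage sketch for the connector above (the file name is a placeholder).
# Terms starting with '?' are treated as wildcards, and limit/offset page through
# the matches as described in the search_triples docstring.
connector = HDTFileConnector("watdiv.hdt")
print("total triples:", connector.nb_triples)

triples, cardinality = connector.search_triples(
    "?s", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "?o",
    limit=10, offset=0)
print("pattern cardinality:", cardinality)
for s, p, o in triples:
    print(s, o)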
Example #12
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    # load standard RDF formats using rdflib
    if format == 'nt' or format == 'ttl':
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'hdt':
        # load HDTDocument without additional indexes
        # they are not needed since we only search by "?s ?p ?o"
        doc = HDTDocument(file_path, indexed=False)
        iterator, nb_triples = doc.search_triples("", "", "")
    return iterator, nb_triples
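# Usage sketch for get_rdf_reader with a placeholder file name. For 'nt'/'ttl' the
# whole graph is parsed by rdflib first; for 'hdt' the triples are streamed from
# the unindexed HDTDocument without materializing them first.
iterator, nb_triples = get_rdf_reader("dataset.hdt", format="hdt")
print("about to stream", nb_triples, "triples")
for s, p, o in iterator:
    pass  # process each triple here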
Example #13
def generate_reduced():
    # Q1 : retrieve the subClassOf relations
    hdt_file = None
    output_filename = None
    output_selfloopClass_filename = None
    output_leafClass_filename = None
    output_intermediateClass_filename = None

    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'reduced_lod_subClassOf.csv'
        output_selfloopClass_filename = 'lod_selfloop_classes.csv'
        output_leafClass_filename = 'lod_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'dbpedia_subClassOf.csv'
        output_selfloopClass_filename = 'dbpedia_selfloop_classes.csv'
        output_leafClass_filename = 'dbpedia_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf",
        IdentifierPosition.Predicate)

    (subclass_triples,
     cardinality) = hdt_file.search_triples("", subClassOf, "")

    c1 = ['196338233', '196338418', '196338419']
    c2 = ['196338233', '196338325', '196338412']
    c3 = ['196337995', '196338014', '196338013']
    c4 = ['196338014', '196338063', '196338410']

    cs = [c1, c2, c3, c4]
    for c in cs:
        print('\n\n this cycle = ', c)
        for n in c:
            print('id =', n)
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when its subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when its object  = ', o_id)

    print('==================================')

    c1 = ['1193056652', '1193056593', '1193056657']
    c2 = ['1146303708', '1146299369', '1146331327']
    c3 = ['196338400', '196338312', '196338288']
    c4 = ['196338013', '196337995', '196338014']
    c5 = ['196338242', '196338410', '196337957']
    c6 = ['196338418', '196338419', '196338233']
    c7 = ['196338233', '196338325', '196338412']
    c8 = ['196338014', '196338063', '196338410']
    c9 = ['196338014', '196337975', '196338007']
    c10 = ['196338050', '196338049', '196337975']
    c11 = ['196338197', '196338462', '196338406']
    c12 = ['196338220', '196338217', '196338034']
    c13 = ['196338145', '196338152', '196338419']
    c14 = ['196338288', '196338116', '196337978']
    c15 = ['196338070', '196338360', '196338241']
    c16 = ['114657709', '114657713', '125181834']

    cs = [
        c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16
    ]
    for c in cs:
        print('\n\n that cycle = ', c)
        for n in c:
            print('id =', n)
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when its subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when its object  = ', o_id)
Example #14
class HDTAssistedDataStore(MemoryDataStore):
    def __init__(self, lex=lexicon.standard):
        super(HDTAssistedDataStore, self).__init__(lex=lex)
        self.hdt = None

    def invalidateCachedMetadata(self, zip):
        aff4cache = os.path.join(expanduser("~"), ".aff4")
        cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
        cached_turtle_index = cached_turtle + ".index.v1-1"
        for f in [cached_turtle, cached_turtle_index]:
            if os.path.exists(f):
                LOGGER.debug("Invalidating HDT index %s" % f)
                os.unlink(f)

    def createHDTviaLib(self, zip, cached_turtle):
        try:
            temp = tempfile.NamedTemporaryFile(delete=False)
            LOGGER.debug("Creating HDT index %s" % cached_turtle)
            LOGGER.debug("Creating temp turtle file for import %s" % temp.name)
            try:
                with zip.OpenZipSegment("information.turtle") as fd:
                    streams.WriteAll(fd, temp)
                temp.close()
            except Exception as e:
                # no turtle yet
                return

            doc = hdt.generate_hdt(temp.name, "aff4://foo")
            retcode = doc.save_to_hdt(cached_turtle)

            if retcode != 0:
                print("rdf2hdt failed", -retcode, file=sys.stderr)
            else:
                pass

        except:
            traceback.print_exc()
            raise Exception("rdf2dht failed. Please make data_store.HAS_HDT=False until this is fixed. ")

        finally:
            os.unlink(temp.name)


    def loadMetadata(self, zip):
        # Load the turtle metadata.
        aff4cache = os.path.join(expanduser("~"), ".aff4")
        if not os.path.exists(aff4cache):
            try:
                os.makedirs(aff4cache)
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
        if not os.path.exists(cached_turtle):
            self.createHDTviaLib(zip, cached_turtle)

        if os.path.exists(cached_turtle):
            # assume we have a HDT cache of turtle at this point
            self.hdt = HDTDocument(cached_turtle)


    # this implementation currently not tested
    # and it is super ugly. We are materializing all triples just to
    # list all the subjects.
    # TODO: Implement subject iterator in pyHDT
    def QuerySubject(self, graph, subject_regex=None):
        if graph == transient_graph:
            yield from super(HDTAssistedDataStore, self).QuerySubject(transient_graph, subject_regex)

        subject_regex = re.compile(utils.SmartStr(subject_regex))
        (triples, cardinality) = self.hdt.search_triples("", "?", "?")
        seen_subject = []

        for (s,p,o) in triples:
            if subject_regex is not None and subject_regex.match(s):
                if s not in seen_subject:
                    seen_subject.add(s)
                    yield rdfvalue.URN().UnSerializeFromString(s)

        for s in super(HDTAssistedDataStore, self).QuerySubject(graph, subject_regex=subject_regex):
            if s not in seen_subject:
                seen_subject.add(s)
                yield s

    # not yet implemented
    def QueryPredicate(self, graph, predicate):
        if graph == transient_graph:
            yield from super(HDTAssistedDataStore, self).QueryPredicate(transient_graph, predicate)

        yield from super(HDTAssistedDataStore, self).QueryPredicate(graph, predicate)

    def QueryPredicateObject(self, graph, predicate, object):
        (triples, cardinality) = self.hdt.search_triples("", predicate, object)

        for (s,p,o) in triples:
            yield rdfvalue.URN(s)

        for subject in super(HDTAssistedDataStore, self).QueryPredicateObject(graph, predicate, object):
            yield subject

    def Get(self, graph, subject, attribute):
        if self.hdt is None:
            return super(HDTAssistedDataStore, self).Get(graph, subject, attribute)
        else:
            # we use a set here as we some implementations might pass up an object from
            # the persisted graph and the transient graph. The set lets us remove duplicates
            res = set(self.QuerySubjectPredicate(graph, subject, attribute))
            if len(res) == 1:
                return list(res)
            return list(res)

    def QuerySubjectPredicate(self, graph, subject, predicate):
        for o in super(HDTAssistedDataStore, self).QuerySubjectPredicate(graph, subject, predicate):
            yield o

        if self.hdt is None:
            return

        if graph == transient_graph:
            return

        if isinstance(subject, rdfvalue.URN):
            subject = subject.SerializeToString()
        else:
            subject = utils.SmartUnicode(subject)

        if isinstance(predicate, rdfvalue.URN):
            predicate = predicate.SerializeToString()
        else:
            predicate = utils.SmartUnicode(predicate)

        (triples, cardinality) = self.hdt.search_triples(subject, predicate, "")

        for (s,p,o) in triples:
            if o.startswith("\""):
                # it is a literal
                (v,t) = o.split("^^")
                v = v.replace("\"", "")
                t = t[1:len(t)-1]

                datatype = rdflib.URIRef(t)
                if datatype in registry.RDF_TYPE_MAP:
                    o = registry.RDF_TYPE_MAP[datatype](v)
                else:
                    # Default to a string literal.
                    o = rdfvalue.XSDString(v)
            elif o.startswith("<"):
                o = rdfvalue.URN(utils.SmartUnicode(o))
            elif o.startswith("aff4://"):
                o = rdfvalue.URN(utils.SmartUnicode(o))
            else:
                o = rdfvalue.URN(utils.SmartUnicode(o))

            yield o


    def SelectSubjectsByPrefix(self, graph, prefix):
        if graph == transient_graph:
            yield from super(HDTAssistedDataStore, self).SelectSubjectsByPrefix(transient_graph, prefix)

        yield from super(HDTAssistedDataStore, self).SelectSubjectsByPrefix(graph, prefix)

    def QueryPredicatesBySubject(self, graph, subject):
        if graph == transient_graph:
            yield from super(HDTAssistedDataStore, self).QueryPredicatesBySubject(transient_graph, subject)

        yield from super(HDTAssistedDataStore, self).QueryPredicatesBySubject(graph, subject)
Example #15
id_type = hdt_lod.convert_term("http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
                               IdentifierPosition.Predicate)
id_sameAs = hdt_lod.convert_term("http://www.w3.org/2002/07/owl#sameAs",
                                 IdentifierPosition.Predicate)
id_subClassOf = hdt_lod.convert_term(
    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    IdentifierPosition.Predicate)
id_equivalentClass = hdt_lod.convert_term(
    "http://www.w3.org/2002/07/owl#equivalentClass",
    IdentifierPosition.Predicate)

# output some stats of LOD-a-lot
# we can query the HDT file using the term IDs (e.g. rdf:type and equivalentClass) or the URIs (e.g. subClassOf and sameAs)
print("# subjects:", "{:,}".format(hdt_lod.nb_subjects))
print("# predicates:", "{:,}".format(hdt_lod.nb_predicates))
print("# objects:", "{:,}".format(hdt_lod.nb_objects))
(triples, cardinality) = hdt_lod.search_triples("", "", "")
print("# triples:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_type, 0)
print("# rdf:type statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples(
    "", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "")
print("# rdfs:subClassOf statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_equivalentClass, 0)
print("# owl:equivalentClass statements:", "{:,}".format(cardinality))
(triples,
 cardinality) = hdt_lod.search_triples("",
                                       "http://www.w3.org/2002/07/owl#sameAs",
                                       "")
print("# owl:sameAs statements:", "{:,}".format(cardinality))

Example #16
#
# triples, cardinality = hdt.search_triples("", subPropertyOf, t)
# print ('There are ', cardinality, 'subPropertyOf of owl:transitive properties')
# for (s,p ,o) in triples:
#     print ('subPropertyOf: ', s)
#
#
# triples, cardinality = hdt.search_triples("", subClassOf, t)
# print ('There are ', cardinality, 'subclass of owl:transitive properties')
# for (s,p ,o) in triples:
#     print ('subClassOf: ', s)

trans_collect = set()
inv_collect = set()

triples, direct_trans_relations = hdt.search_triples("", type, t)
print('There are ', direct_trans_relations,
      'as typed by owl:transitive properties')
for (s, p, o) in triples:
    trans_collect.add(str(s))
#
# # and another http://www.cyc.com/2003/04/01/cyc#EquivalenceRelation
# cyc_eq = 'http://www.cyc.com/2003/04/01/cyc#EquivalenceRelation'
# triples, cardinality = hdt.search_triples("", type, cyc_eq)
# print ('There are ', cardinality, 'type of cyc#eq properties')
# for (s,p ,o) in triples:
# 	trans_collect.add(str(s))
#
# print ('So in total that is ', len(trans_collect))
count_trans_rel_triples = 0
for trans_rel in trans_collect:
    # count the triples that use each collected transitive relation as predicate
    _, card = hdt.search_triples("", trans_rel, "")
    count_trans_rel_triples += card
for subj, pred, obj in g:
    count += 1
    collect_triple_owl.add((str(subj), str(pred), str(obj)))
    collect_nodes.add(subj)
    collect_nodes.add(obj)

print('**** In the original OWL scheme ****')
print('there are in total ', len(collect_triple_owl), ' Triples')
print('there are in total ', len(collect_nodes), ' Nodes')

collect_triple_owl_lod = set()
count_relations_between_nodes = Counter()

for s in collect_nodes:
    for o in collect_nodes:
        (triples, cardinality) = hdt_file.search_triples(s, '', o)
        for (s, p, o) in triples:
            # if (s, p ,o) not in collect_triple_owl:
            collect_triple_owl_lod.add((str(s), str(p), str(o)))

print('# collect triple in LOD: ', len(collect_triple_owl_lod))
collect_extra = collect_triple_owl_lod.difference(collect_triple_owl)
print('# extra: ', len(collect_extra))
for (s, p, o) in collect_extra:
    print('They are:', s, p, o)

#
# for subj, pred, obj in collect_triple_owl:
#     if pred == subClassOf:
#         print (subj, obj)
Example #18
def generate_reduced():
    # Q1 : retrieve the subClassOf relations
    # hdt_file = None
    # output_filename = None
    # output_selfloopClass_filename = None
    # output_leafClass_filename = None
    # output_intermediateClass_filename = None

    # if sys.argv [1] == 'lod':
    hdt_file = HDTDocument(PATH_LOD)
    output_filename = 'reduced_lod_subClassOf.csv'
    output_selfloopClass_filename = 'lod_reflexive_classes.csv'
    output_leafClass_filename = 'lod_leaf_classes.csv'
    # output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    # else:
    #     hdt_file = HDTDocument(PATH_DBpedia)
    #     output_filename = 'dbpedia_subClassOf.csv'
    #     output_selfloopClass_filename = 'dbpedia_selfloop_classes.csv'
    #     output_leafClass_filename = 'dbpedia_leaf_classes.csv'
    #     output_intermediateClass_filename = 'further_reduced_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    count_selfloop = 0
    count_leaf = 0
    count_left = 0
    count_output_after_further_reduced = 0 # count left of the further reduced
    # removed_leaf_classes = []
    (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")

    to_explore_ids = set() # to iterate through
    leaf_ids = set()
    removed_intermediate_ids = set() # removed intermediate nodes
    all_ids = set()
    with open(output_filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        with open(output_intermediateClass_filename, 'w', newline='') as inter_file:
            writer_inter = csv.writer(inter_file)
            writer_inter.writerow([ "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])

            # Step 1: remove selfloops and leaf nodes
            with open(output_selfloopClass_filename, 'w', newline='') as selfloop_file:
                writer_selfloop = csv.writer(selfloop_file)
                writer_selfloop.writerow([ "ID", "URI"])

                with open(output_leafClass_filename, 'w', newline='') as leaf_file:
                    writer_leaf = csv.writer(leaf_file)
                    writer_leaf.writerow([ "ID", "URI"])


                    for (s, p, o) in subclass_triples:
                        s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                        o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                        all_ids.add(s_id)
                        all_ids.add(o_id)
                        count += 1
                        # store it in a csv file
                        if s == o: # self loop
                            count_selfloop += 1
                            writer_selfloop.writerow([s_id, s])
                        else:
                            (_, leaf_cardinality) = hdt_file.search_triples("", subClassOf, s)
                            # test if it is a leaf node
                            if leaf_cardinality == 0:
                                # there is no subclass, this is a leaf node/class
                                # write it to a file and store it
                                writer_leaf.writerow([s_id, s])
                                leaf_ids.add(s_id)
                                count_leaf += 1
                                # removed_leaf_classes.append(s)
                            # else:
                            #     # write what's left to the file
                            #     # SKIP: find intermediate for now
                            #     count_left += 1
                            #     writer.writerow([s_id, s, o_id, o])
            print ('count leaf statements = ', count_leaf)
            print ('count leaf (as set) = ', len (leaf_ids))
            print ('count total statements = ', count)
            print ('count_total nodes (as set) = ', len (all_ids))
            print ('NOW  Part 2: Further Reduce ') # further reduce it

            visited_sup = set()
            # near_leaf_sup = set()
            count_one = 0
            count_loop = 0
            for l_id in leaf_ids:
                count_loop += 1
                (leaf_triples, cardinality) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
                # get its superclass id : sup_id
                finished_this_leaf = False
                if cardinality == 1:
                    (l_id, lp_id, sup_id) = next(leaf_triples)
                    (_, sub_cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                    if sub_cardinality == 1:
                        # remove this superclass
                        count_one += 1
                        removed_intermediate_ids.add(sup_id)
                        visited_sup.add(sup_id)
                        (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                        for (sup_id, lp_id, supsup_id) in supsup_triples:
                            to_explore_ids.add(supsup_id)
                        finished_this_leaf = True

                # normal process
                if not finished_this_leaf:
                    for (l_id, lp_id, sup_id) in leaf_triples:
                        if (sup_id not in visited_sup):
                            # lo_id = hdt_file.convert_term(lo, IdentifierPosition.Object)
                            (sup_triples, cardinality_back) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                            supflag = True # if this superclass only has leaf nodes
                            if cardinality_back != 1:
                                for (child_id, lp_id, sup_id) in sup_triples:
                                    if child_id not in leaf_ids:
                                        supflag = False
                                        break

                            if supflag:
                                # near_leaf_sup.add(sup_id)
                                removed_intermediate_ids.add(sup_id)
                                (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                                for (sup_id, lp_id, supsup_id) in supsup_triples:
                                    to_explore_ids.add(supsup_id)
                            else:
                                to_explore_ids.add (sup_id)
                            visited_sup.add (sup_id)

                if count_loop % 100000 == 0:
                    print ('leaf nodes processed:', count_loop)
                    print ('count one = ', count_one)
                    print ('near-leaf nodes = ', len (removed_intermediate_ids))
                    print ('total visited nodes = ', len (visited_sup))
                    print ('non-near-leaf nodes = ', len(visited_sup) - len(removed_intermediate_ids))
                    print ('to explore = ', len(to_explore_ids))
            print ('*********** after this data processing, we have only ', len(to_explore_ids), ' to explore for the next step')
            # finished data pre-processing

            record_to_explore_size = len (to_explore_ids)
            record_iteration = 0
            continue_flag = True
            while (len(to_explore_ids) != 0 and continue_flag):
                # print ('still to explore : ', len(to_explore))
                record_iteration +=1
                # iternate through this and
                n_id = to_explore_ids.pop()
                (triples_id, cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, n_id)
                flag = True
                for (ns_id, np_id, no_id) in triples_id:
                    # if each ns is either a leaf or intermediate but removed, then we remove it.
                    # ns_id = hdt_file.convert_term(ns, IdentifierPosition.Object)
                    if ns_id not in leaf_ids and ns_id not in removed_intermediate_ids:
                        # Keep it for now
                        flag = False
                        break
                if flag:  # we are sure to remove it

                    removed_intermediate_ids.add (n_id)
                else:
                    to_explore_ids.add (n_id) # add back :(

                if record_iteration == 10000:
                    if record_to_explore_size != len (to_explore_ids):
                        # print ('leaf nodes visited = ', count_leaf)
                        print ('total leaf nodes = ', len(leaf_ids))
                        print ('accummulated removed intermediate = ', len (removed_intermediate_ids))
                        print ('still to explore  = ', len (to_explore_ids))
                        print ('record to explore = ', record_to_explore_size)
                        print ('changed = ', record_to_explore_size - len (to_explore_ids))
                        record_iteration = 0
                        record_to_explore_size = len (to_explore_ids)
                    else:
                        continue_flag = False

            # to write down the intermediate removed

            print ('*****size of leaf:', len (leaf_ids))
            print ('*****size of removed intermediate node :', len (removed_intermediate_ids))
            (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
            for (s,p,o) in subclass_triples:
                s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                # count += 1
                # store it in a csv file
                if s != o:
                    # if s is not a leaf node and not a removed intermediate node
                    if (s_id not in leaf_ids) and (s_id not in removed_intermediate_ids):
                        # write what's left to the file
                        count_output_after_further_reduced += 1
                        # print ('count output after further reduced', count_output_after_further_reduced)
                        writer_inter.writerow([s_id, s, o_id, o])
                #     else:
                #         print ('one of them')
                # else:
                #     print ('nothing')

            print ('total entries = ', count)
            print ('total self-loops = ', count_selfloop)
            print ('total leaf nodes/classes = ', count_leaf)
            print ('total left = ', count_left)
            print ('percentage of reduction: ', count_left/count)
            print ('=====AFTER FURTHER REDUCTION ======')
            print ('There are only ', count_output_after_further_reduced)
            print ('percentage of reduction: ', count_output_after_further_reduced/count)
Example #19
                "Number of predicates": +document.nb_predicates,
                "Number of objects": +document.nb_objects,
                "Number of shared subject-object": +document.nb_shared
            }

            #Counts
            undeclared_classes_count, declared_classes_count = 0, 0
            declared_properties_count, undeclared_properties_count = 0, 0
            declared_individuals_count, reused_individuals_count, linked_individuals_count = 0, 0, 0
            sameas_link_count, seeAlso_link_count, differentFrom_link_count, allDifferent_link_count = 0, 0, 0, 0
            class_link_count, property_link_count = 0, 0
            instanceTyping_link_count = 0

            # Fetch all declared classes
            (triples, cardinality) = document.search_triples(
                "", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
                "http://www.w3.org/2000/01/rdf-schema#Class")
            for triple in triples:
                if triple[0] not in unique_classes:
                    unique_classes.add(triple[0])
                    declared_classes_count += 1
            (triples, cardinality) = document.search_triples(
                "", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
                "http://www.w3.org/2002/07/owl#Class")
            for triple in triples:
                if triple[0] not in unique_classes:
                    unique_classes.add(triple[0])
                    declared_classes_count += 1

            # Fetch all used classes
            # Fetch all instances of a class
Example #20
# print("query cardinality", cardinality)
# for triple in triples:
#     print(triple)

# Option 1: Extract from mappings
# tp_a = ("?s", "http://swrc.ontoware.org/ontology#url", "?o")
# tp_b = ("?s", "?p", "http://dh2010.cch.kcl.ac.uk/academic-programme/abstracts/papers/pdf/ab-753.pdf")
# iterator = document.search_join([tp_a, tp_b])
# print("estimated join cardinality : %i" % len(iterator))
# for mapping in iterator:
#   print(mapping)

# Option 2: Sequential search for triples
musicians = list()
edge_list = list()
(triples, card) = document.search_triples("", rdf + "type",
                                          dbo + "MusicalArtist")
for triple in triples:
    musicians.append(triple[0])

for musician in musicians:
    (triples, card) = document.search_triples(musician,
                                              dbo + "birthPlace",
                                              dbr + "Karlsruhe",
                                              limit=10)
    for triple in triples:
        edge_list.append((musician, triple[1]))

print(edge_list)

# Construct Graph sequentially, Iterate over edge_list
Example #21
# PATH_LOD = './broader.hdt'

type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
dbpediaPerson = 'http://dbpedia.org/ontology/Person'
foafperson = 'http://xmlns.com/foaf/0.1/Person'
purlHasEarlierVersion = 'http://purl.org/pav/hasEarlierVersion'

PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"
hdt = HDTDocument(PATH_LOD)


def get_domain_and_label(t):
    domain = tldextract.extract(t).domain
    name1 = t.rsplit('/', 1)[-1]
    name2 = t.rsplit('#', 1)[-1]
    if len(name1) == 0:
        return (domain, name2)
    if len(name2) == 0:
        return (domain, name1)

    if len(name2) < len(name1):
        return (domain, name2)
    else:
        return (domain, name1)


triples, cardinality = hdt.search_triples("", purlHasEarlierVersion, "")
print('There are ', cardinality, 'purlHasEarlierVersion properties')
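# Illustrative calls for get_domain_and_label (not in the original script): it
# returns the registered domain extracted by tldextract together with the shorter
# of the last '/'-fragment and the last '#'-fragment of the term.
print(get_domain_and_label('http://dbpedia.org/ontology/Person'))
# expected: ('dbpedia', 'Person')
print(get_domain_and_label('http://www.w3.org/2000/01/rdf-schema#subClassOf'))
# expected: ('w3', 'subClassOf')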
Example #22
class EntityLinker(Component, Serializable):
    """
        This class extracts candidate entities from the knowledge base for the entity mentioned in the question,
        and then extracts triplets from Wikidata for the extracted entity. Candidate entities are searched for in
        a dictionary whose keys are titles and aliases of Wikidata entities and whose values are lists of tuples
        (entity_title, entity_id, number_of_relations). Candidate entities are first looked up by key, where the
        keys are entities extracted from the question; if nothing is found, the dictionary is searched using the
        Levenshtein distance between the entity and the keys (titles) in the dictionary.
    """

    def __init__(self, load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 rel_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            save_path: path where to save inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: name of file which maps entity id to name
            q2descr_filename: name of file which maps entity id to description
            rel_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, which will be used for building of inverted index
            label_rel: relation in the knowledge base which connects entity ids and entity titles
            descr_rel: relation in the knowledge base which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.q2descr_filename = q2descr_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.rel_ranker = rel_ranker
        self.use_descriptions = use_descriptions

        if self.use_prefix_tree:
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            if self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            self.inverted_index_builder()
            self.save()
        else:
            self.load()

    def load(self) -> None:
        self.inverted_index = load_pickle(self.load_path / self.inverted_index_filename)
        self.entities_list = load_pickle(self.load_path / self.entities_list_filename)
        self.q2name = load_pickle(self.load_path / self.q2name_filename)

    def save(self) -> None:
        save_pickle(self.inverted_index, self.save_path / self.inverted_index_filename)
        save_pickle(self.entities_list, self.save_path / self.entities_list_filename)
        save_pickle(self.q2name, self.save_path / self.q2name_filename)
        if self.q2descr_filename is not None:
            save_pickle(self.q2descr, self.save_path / self.q2descr_filename)

    def __call__(self, entity_substr_batch: List[List[str]], entity_positions_batch: List[List[List[int]]] = None,
                       context_tokens: List[List[str]] = None) -> Tuple[List[List[List[str]]], List[List[List[float]]]]:
        entity_ids_batch = []
        confidences_batch = []
        if entity_positions_batch is None:
            entity_positions_batch = [[[0] for i in range(len(entities_list))] for entities_list in entity_substr_batch]
        for entity_substr_list, entity_positions_list in zip(entity_substr_batch, entity_positions_batch):
            entity_ids_list = []
            confidences_list = []
            for entity_substr, entity_pos in zip(entity_substr_list, entity_positions_list):
                context = ""
                if self.use_descriptions:
                    context = ' '.join(context_tokens[:entity_pos[0]]+["[ENT]"]+context_tokens[entity_pos[-1]+1:])
                entity_ids, confidences = self.link_entity(entity_substr, context)
                entity_ids_list.append(entity_ids)
                confidences_list.append(confidences)
            entity_ids_batch.append(entity_ids_list)
            confidences_batch.append(confidences_list)

        return entity_ids_batch, confidences_batch

    def link_entity(self, entity: str, context: str = None) -> Tuple[List[str], List[float]]:
        confidences = []
        if not entity:
            entities_ids = ['None']
        else:
            candidate_entities = self.candidate_entities_inverted_index(entity)
            candidate_entities, candidate_names = self.candidate_entities_names(entity, candidate_entities)
            entities_ids, confidences, srtd_cand_ent = self.sort_found_entities(candidate_entities,
                                                                                 candidate_names, entity, context)

        return entities_ids, confidences

    def candidate_entities_inverted_index(self, entity: str) -> List[Tuple[Any, Any, Any]]:
        word_tokens = nltk.word_tokenize(entity.lower())
        candidate_entities = []

        for tok in word_tokens:
            if len(tok) > 1:
                found = False
                if tok in self.inverted_index:
                    candidate_entities += self.inverted_index[tok]
                    found = True

                if self.lemmatize:
                    morph_parse_tok = self.morph.parse(tok)[0]
                    lemmatized_tok = morph_parse_tok.normal_form
                    if lemmatized_tok in self.inverted_index:
                        candidate_entities += self.inverted_index[lemmatized_tok]
                        found = True

                if not found and self.use_prefix_tree:
                    words_with_levens_1 = self.searcher.search(tok, d=1)
                    for word in words_with_levens_1:
                        candidate_entities += self.inverted_index[word[0]]
        candidate_entities = list(set(candidate_entities))
        candidate_entities = [(entity[0], self.entities_list[entity[0]], entity[1]) for entity in candidate_entities]

        return candidate_entities

    def sort_found_entities(self, candidate_entities: List[Tuple[int, str, int]],
                            candidate_names: List[List[str]],
                            entity: str, context: str = None) -> Tuple[List[str], List[float], List[Tuple[str, str, int, int]]]:
        entities_ratios = []
        for candidate, entity_names in zip(candidate_entities, candidate_names):
            entity_num, entity_id, num_rels = candidate
            fuzz_ratio = max([fuzz.ratio(name.lower(), entity) for name in entity_names])
            entities_ratios.append((entity_num, entity_id, fuzz_ratio, num_rels))

        srtd_with_ratios = sorted(entities_ratios, key=lambda x: (x[2], x[3]), reverse=True)
        if self.use_descriptions:
            num_to_id = {entity_num: entity_id for entity_num, entity_id, _, _ in srtd_with_ratios[:30]}
            entity_numbers = [entity_num for entity_num, _, _, _ in srtd_with_ratios[:30]]
            scores = self.rel_ranker.rank_rels(context, entity_numbers)
            top_rels = [score[0] for score in scores]
            entity_ids = [num_to_id[num] for num in top_rels]
            confidences = [score[1] for score in scores]
        else:
            entity_ids = [ent[1] for ent in srtd_with_ratios]
            confidences = [float(ent[2]) * 0.01 for ent in srtd_with_ratios]

        return entity_ids, confidences, srtd_with_ratios

    def candidate_entities_names(self, entity: str,
          candidate_entities: List[Tuple[int, str, int]]) -> Tuple[List[Tuple[int, str, int]], List[List[str]]]:
        entity_length = len(entity)
        candidate_names = []
        candidate_entities_filter = []
        for candidate in candidate_entities:
            entity_num = candidate[0]
            entity_id = candidate[1]
            entity_names = []
            
            entity_names_found = self.q2name[entity_num]
            if len(entity_names_found[0]) < 6 * entity_length:
                entity_name = entity_names_found[0]
                entity_names.append(entity_name)
                if len(entity_names_found) > 1:
                    for alias in entity_names_found[1:]:
                        entity_names.append(alias)
                candidate_names.append(entity_names)
                candidate_entities_filter.append(candidate)

        return candidate_entities_filter, candidate_names

    def inverted_index_builder(self) -> None:
        log.debug("building inverted index")
        entities_set = set()
        id_to_label_dict = defaultdict(list)
        id_to_descr_dict = defaultdict(list)
        label_to_id_dict = {}
        label_triplets = []
        alias_triplets_list = []
        descr_triplets = []
        if self.kb_format == "hdt":
            label_triplets, c = self.doc.search_triples("", self.label_rel, "")
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    alias_triplets, c = self.doc.search_triples("", alias_rel, "")
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                descr_triplets, c = self.doc.search_triples("", self.descr_rel, "")

        if self.kb_format == "sqlite3":
            subject, relation, obj = self.sql_column_names
            query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{self.label_rel}";'
            res = self.cursor.execute(query)
            label_triplets = res.fetchall()
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{alias_rel}";'
                    res = self.cursor.execute(query)
                    alias_triplets = res.fetchall()
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{self.descr_rel}";'
                res = self.cursor.execute(query)
                descr_triplets = res.fetchall()

        for triplets in [label_triplets] + alias_triplets_list:
            for triplet in triplets:
                entities_set.add(triplet[0])
                if triplet[2].endswith(self.lang_str):
                    label = triplet[2].replace(self.lang_str, '').replace('"', '')
                    id_to_label_dict[triplet[0]].append(label)
                    label_to_id_dict[label] = triplet[0]

        for triplet in descr_triplets:
            entities_set.add(triplet[0])
            if triplet[2].endswith(self.lang_str):
                descr = triplet[2].replace(self.lang_str, '').replace('"', '')
                id_to_descr_dict[triplet[0]].append(descr)

        popularities_dict = {}
        for entity in entities_set:
            if self.kb_format == "hdt":
                all_triplets, number_of_triplets = self.doc.search_triples(entity, "", "")
                popularities_dict[entity] = number_of_triplets
            if self.kb_format == "sqlite3":
                subject, relation, obj = self.sql_column_names
                query = f'SELECT COUNT({obj}) FROM {self.sql_table_name} WHERE {subject} = "{entity}";'
                res = self.cursor.execute(query)
                popularities_dict[entity] = res.fetchall()[0][0]

        entities_dict = {entity: n for n, entity in enumerate(entities_set)}
            
        inverted_index = defaultdict(list)
        for label in label_to_id_dict:
            tokens = re.findall(self.re_tokenizer, label.lower())
            for tok in tokens:
                if len(tok) > 1 and tok not in self.stopwords:
                    inverted_index[tok].append((entities_dict[label_to_id_dict[label]],
                                                popularities_dict[label_to_id_dict[label]]))
        self.inverted_index = dict(inverted_index)
        self.entities_list = list(entities_set)
        self.q2name = [id_to_label_dict[entity] for entity in self.entities_list]
        self.q2descr = []
        if id_to_descr_dict:
            self.q2descr = [id_to_descr_dict[entity] for entity in self.entities_list]
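# A hedged construction sketch for the EntityLinker above. Every path, file name
# and relation URI below is a placeholder; in a real DeepPavlov pipeline these
# values come from the component's JSON config rather than being hard-coded.
linker = EntityLinker(
    load_path="~/.deeppavlov/downloads/wikidata",            # hypothetical folder
    inverted_index_filename="inverted_index.pickle",
    entities_list_filename="entities_list.pickle",
    q2name_filename="q2name.pickle",
    kb_format="hdt",
    kb_filename="wikidata.hdt",                              # used only when building the index
    label_rel="http://www.w3.org/2000/01/rdf-schema#label",  # hypothetical label relation
    build_inverted_index=False)                              # load previously saved index files
entity_ids_batch, confidences_batch = linker([["lake baikal"]])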
Example #23
# for (s, p, o) in triples:
#     if s[0] == '"':
#         s = s
#     else:
#         s = '<' + s + '>'
#     if o[0] == '"':
#         o = o
#     else:
#         o = '<' + o + '>'
#     p = '<' + p + '>'
#     file_broader.write(s +' '+  p+  ' ' + o + '.\n' )
#     # narrower_ =  '<' + narrower + '>'
#     # file_integrated.write(o +' '+  narrower +  ' ' + s + '.\n' )
#

triples, cardinality = hdt.search_triples("", narrowerTransitive, "")
print('There are ', cardinality, 'narrowerTransitive properties')

for (s, p, o) in triples:
    # wrap IRIs in angle brackets; leave quoted literals untouched
    if s[0] != '"':
        s = '<' + s + '>'
    if o[0] != '"':
        o = '<' + o + '>'
    p = '<' + p + '>'
    file_narrower.write(s + ' ' + p + ' ' + o + '.\n')
    # file_integrated.write(s +' '+  p+  ' ' + o + '.\n' )
Example #24
class WikiParser:
    """This class extract relations, objects or triplets from Wikidata HDT file"""
    def __init__(self,
                 wiki_filename: str,
                 lang: str = "@en",
                 **kwargs) -> None:
        """

        Args:
            wiki_filename: hdt file with wikidata
            lang: Russian or English language
            **kwargs:
        """
        log.debug(f'__init__ wiki_filename: {wiki_filename}')
        wiki_path = expand_path(wiki_filename)
        self.description_rel = "http://schema.org/description"
        self.lang = lang
        self.document = HDTDocument(str(wiki_path))

    def __call__(self, what_return: List[str], query_seq: List[List[str]],
                 filter_info: List[Tuple[str]],
                 order_info: namedtuple) -> List[List[str]]:
        """
            Let us consider an example of the question 
                "What is the deepest lake in Russia?"
            with the corresponding SPARQL query            
            "SELECT ?ent WHERE { ?ent wdt:P31 wd:T1 . ?ent wdt:R1 ?obj . ?ent wdt:R2 wd:E1 } ORDER BY ASC(?obj) LIMIT 5"
            arguments:
                what_return: ["?obj"]
                query_seq: [["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"]
                            ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"],
                            ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]]
                filter_info: []
                order_info: order_info(variable='?obj', sorting_order='asc')
        """
        extended_combs = []
        combs = []
        for n, query in enumerate(query_seq):
            unknown_elem_positions = [(pos, elem)
                                      for pos, elem in enumerate(query)
                                      if elem.startswith('?')]
            """
                n = 0, query = ["?ent", "http://www.wikidata.org/prop/direct/P17", "http://www.wikidata.org/entity/Q159"]
                       unknown_elem_positions = [(0, "?ent")]
                n = 1, query = ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"]
                       unknown_elem_positions = [(0, "?ent")]
                n = 2, query = ["?ent", "http://www.wikidata.org/prop/direct/P4511", "?obj"]
                       unknown_elem_positions = [(0, "?ent"), (2, "?obj")]
            """
            if n == 0:
                combs = self.search(query, unknown_elem_positions)
                # combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
            else:
                if combs:
                    known_elements = []
                    extended_combs = []
                    for elem in query:
                        if elem in combs[0].keys():
                            known_elements.append(elem)
                    for comb in combs:
                        """
                            n = 1
                            query = ["?ent", "http://www.wikidata.org/prop/direct/P31", "http://www.wikidata.org/entity/Q23397"]
                            comb = {"?ent": "http://www.wikidata.org/entity/Q5513"}
                            known_elements = ["?ent"], known_values = ["http://www.wikidata.org/entity/Q5513"]
                            filled_query = ["http://www.wikidata.org/entity/Q5513", 
                                            "http://www.wikidata.org/prop/direct/P31", 
                                            "http://www.wikidata.org/entity/Q23397"]
                            new_combs = [["http://www.wikidata.org/entity/Q5513", 
                                          "http://www.wikidata.org/prop/direct/P31", 
                                          "http://www.wikidata.org/entity/Q23397"], ...]
                            extended_combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
                        """
                        known_values = [
                            comb[known_elem] for known_elem in known_elements
                        ]
                        for known_elem, known_value in zip(
                                known_elements, known_values):
                            filled_query = [
                                elem.replace(known_elem, known_value)
                                for elem in query
                            ]
                            new_combs = self.search(filled_query,
                                                    unknown_elem_positions)
                            for new_comb in new_combs:
                                extended_combs.append({**comb, **new_comb})
                combs = extended_combs

        if combs:
            if filter_info:
                for filter_elem, filter_value in filter_info:
                    combs = [
                        comb for comb in combs
                        if filter_value in comb[filter_elem]
                    ]

            if order_info.variable is not None:
                reverse = True if order_info.sorting_order == "desc" else False
                sort_elem = order_info.variable
                combs = sorted(combs,
                               key=lambda x: float(x[sort_elem].split('^^')[0].
                                                   strip('"')),
                               reverse=reverse)
                combs = [combs[0]]

            if what_return[-1].startswith("count"):
                combs = [[combs[0][key]
                          for key in what_return[:-1]] + [len(combs)]]
            else:
                combs = [[elem[key] for key in what_return] for elem in combs]

        return combs

    def search(
            self, query: List[str],
            unknown_elem_positions: List[Tuple[int,
                                               str]]) -> List[Dict[str, str]]:
        query = list(
            map(lambda elem: "" if elem.startswith('?') else elem, query))
        subj, rel, obj = query
        triplets, c = self.document.search_triples(subj, rel, obj)
        if rel == self.description_rel:
            triplets = [
                triplet for triplet in triplets
                if triplet[2].endswith(self.lang)
            ]
        combs = [{elem: triplet[pos]
                  for pos, elem in unknown_elem_positions}
                 for triplet in triplets]
        return combs

    def find_label(self, entity: str) -> str:
        entity = str(entity).replace('"', '')
        if entity.startswith("Q"):
            # example: "Q5513"
            entity = "http://www.wikidata.org/entity/" + entity
            # "http://www.wikidata.org/entity/Q5513"

        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, cardinality = self.document.search_triples(
                entity, "http://www.w3.org/2000/01/rdf-schema#label", "")
            # labels = [["http://www.wikidata.org/entity/Q5513", "http://www.w3.org/2000/01/rdf-schema#label", '"Lake Baikal"@en'], ...]
            for label in labels:
                if label[2].endswith(self.lang):
                    found_label = label[2].strip(self.lang).replace('"', '')
                    return found_label

        elif entity.endswith(self.lang):
            # entity: '"Lake Baikal"@en'
            entity = entity.strip(self.lang)
            return entity

        elif "^^" in entity:
            """
                examples:
                    '"1799-06-06T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>' (date)
                    '"+1642"^^<http://www.w3.org/2001/XMLSchema#decimal>' (number)
            """
            entity = entity.split("^^")[0]
            for token in ["T00:00:00Z", "+"]:
                entity = entity.replace(token, '')
            return entity

        elif entity.isdigit():
            return entity

        return "Not Found"

    def find_alias(self, entity: str) -> List[str]:
        aliases = []
        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, cardinality = self.document.search_triples(
                entity, "http://www.w3.org/2004/02/skos/core#altLabel", "")
            aliases = [
                label[2].strip(self.lang).strip('"') for label in labels
                if label[2].endswith(self.lang)
            ]
        return aliases

    def find_rels(self,
                  entity: str,
                  direction: str,
                  rel_type: str = "no_type") -> List[str]:
        if direction == "forw":
            triplets, num = self.document.search_triples(
                f"http://www.wikidata.org/entity/{entity}", "", "")
        else:
            triplets, num = self.document.search_triples(
                "", "", f"http://www.wikidata.org/entity/{entity}")

        if rel_type != "no_type":
            start_str = f"http://www.wikidata.org/prop/{rel_type}"
        else:
            start_str = "http://www.wikidata.org/prop/P"
        rels = [
            triplet[1] for triplet in triplets
            if triplet[1].startswith(start_str)
        ]
        return rels
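
# Hedged usage sketch (not part of the original example), mirroring the query from
# the __call__ docstring above. The HDT file path is an assumption, and the printed
# results are only illustrative.
if __name__ == "__main__":
    from collections import namedtuple

    parser = WikiParser("wikidata.hdt")  # assumed path to a Wikidata HDT dump
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    OrderInfo = namedtuple("OrderInfo", ["variable", "sorting_order"])
    answers = parser(
        what_return=["?obj"],
        query_seq=[["?ent", wdt + "P17", wd + "Q159"],
                   ["?ent", wdt + "P31", wd + "Q23397"],
                   ["?ent", wdt + "P4511", "?obj"]],
        filter_info=[],
        order_info=OrderInfo("?obj", "asc"),
    )
    print(answers)  # e.g. [['"..."^^<http://www.w3.org/2001/XMLSchema#decimal>']]
    print(parser.find_label(wd + "Q5513"))  # expected: "Lake Baikal"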
#     if s[0] == '"':
#         s = s
#     else:
#         s = '<' + s + '>'
#     if o[0] == '"':
#         o = o
#     else:
#         o = '<' + o + '>'
#     p = '<' + p + '>'
#     file_narrower.write(s +' '+  p+  ' ' + o + '.\n' )
#     # file_integrated.write(s +' '+  p+  ' ' + o + '.\n' )
#
# # file.close()
# narrowerGraph.serialize(destination='narrower.nt', format='nt')
#
triples, cardinality = hdt.search_triples("", broader, "")
print('There are ', cardinality, 'broader properties')
for (s, p, o) in triples:
    # Skip literals that would need escaping (extra quotes, %, ', \); wrap IRIs in angle brackets.
    if s[0] == '"':
        if s.count('"') > 2 or '%' in s or '\'' in s or '\\' in s:
            continue
    else:
        s = '<' + s + '>'
    if o[0] == '"':
        if o.count('"') > 2 or '%' in o or '\'' in o or '\\' in o:
            continue
    else:
        o = '<' + o + '>'
    p = '<' + p + '>'
    file_broader.write(s + ' ' + p + ' ' + o + '.\n')  # write the broader triple (cf. the commented-out block above)
# --- LOD-a-lot statistics snippet (its opening lines appear truncated here).
# Hedged reconstruction of the missing setup; the HDT file path is an assumption.
from hdt import HDTDocument, IdentifierPosition

hdt_lod = HDTDocument("LOD_a_lot_v1.hdt")  # assumed path

id_type = hdt_lod.convert_term(
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    IdentifierPosition.Predicate)
id_sameAs = hdt_lod.convert_term("http://www.w3.org/2002/07/owl#sameAs",
                                 IdentifierPosition.Predicate)
id_subClassOf = hdt_lod.convert_term(
    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    IdentifierPosition.Predicate)
id_equivalentClass = hdt_lod.convert_term(
    "http://www.w3.org/2002/07/owl#equivalentClass",
    IdentifierPosition.Predicate)

# output some stats of LOD-a-lot
# we can query the HDT file using the term IDs (e.g. rdf:type and equivalentClass) or the URIs (e.g. subClassOf and sameAs)
print("# subjects:", "{:,}".format(hdt_lod.nb_subjects))
print("# predicates:", "{:,}".format(hdt_lod.nb_predicates))
print("# objects:", "{:,}".format(hdt_lod.nb_objects))
(triples, cardinality) = hdt_lod.search_triples("", "", "")
print("# triples:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_type, 0)
print("# rdf:type statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples(
    "", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "")
print("# rdfs:subClassOf statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_equivalentClass, 0)
print("# owl:equivalentClass statements:", "{:,}".format(cardinality))
(triples,
 cardinality) = hdt_lod.search_triples("",
                                       "http://www.w3.org/2002/07/owl#sameAs",
                                       "")
print("# owl:sameAs statements:", "{:,}".format(cardinality))
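
# Hedged follow-up sketch (not part of the original snippet): search_triples_ids
# returns integer triples, and convert_id maps each ID back to its term, as used
# elsewhere in these examples.
(id_triples, card) = hdt_lod.search_triples_ids(0, id_subClassOf, 0)
for i, (s_id, p_id, o_id) in enumerate(id_triples):
    if i >= 5:  # only show a few triples for illustration
        break
    s_term = hdt_lod.convert_id(s_id, IdentifierPosition.Subject)
    o_term = hdt_lod.convert_id(o_id, IdentifierPosition.Object)
    print(s_term, "rdfs:subClassOf", o_term)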

예제 #27
0
class HDT_Dataset():
    def __init__(self, **kwargs):

        self.__source = kwargs.get("file", None)
        if self.__source is None:
            raise TypeError()
        try:
            self.document = HDTDocument(self.__source)
        except Exception as e:
            logger.exception("Could not load HDT File from {}.".format(
                self.__source))
            raise e
        self.card = None

    def __str__(self):
        return str(self.__source)

    def __len__(self):
        if self.card is None:
            (_, self.card) = self.document.search_triples("", "", "")
        return self.card

    @property
    def distinct_subjects(self):
        return self.document.nb_subjects

    def random_subjects(self, size=100, weighted=True):

        logger.info(f"Generating a random sample, weighted = {weighted}")
        cardinality = len(self)
        sample = set()
        if size > len(self):
            raise Exception("Sample size exceeds dataset size")
        while (len(sample) < size):
            offset = random.randint(0, cardinality - 1)
            (triples, res_card) = self.document.search_triples("",
                                                               "",
                                                               "",
                                                               limit=1,
                                                               offset=offset)
            subject = tuple_to_triple(next(triples))[0]
            if weighted:
                if type(subject) == URIRef:
                    sample.add(subject)
            else:
                # Get the degree of the subject
                (ts, subject_degree) = self.document.search_triples(
                    subject, "", "")
                # Assume the minimum degree of all subjects = 1
                min_degree = 1
                min_probability = min_degree / self.document.nb_subjects  # Minimum probability of a subject to be chosen

                # Probability of the current subject to be chosen
                p = min_probability / (subject_degree /
                                       self.document.nb_subjects)

                # Draw random number
                r = random.random()

                if r < p:
                    if type(subject) == URIRef:
                        sample.add(subject)
        return sample

    def random_sample(self, size=100):

        cardinality = len(self)
        sample = set()
        if size > len(self):
            raise Exception("Sample size exceeds dataset size")
        while len(sample) < size:
            offset = random.randint(0, cardinality - 1)
            (triples, res_card) = self.document.search_triples("",
                                                               "",
                                                               "",
                                                               limit=1,
                                                               offset=offset)
            sample.add(tuple_to_triple(next(triples)))

        return list(sample)

    def outgoing_edges(self, terms, **kwargs):
        file = kwargs.get("file", None)
        total_cardinality = 0
        for term in terms:
            (triples,
             cardinality) = self.document.search_triples(str(term), "", "")
            for triple in triples:
                file.write(tuple_to_ntriple(triple))
            total_cardinality += cardinality
        return total_cardinality

    def random_edge(self, subject):

        (triples,
         cardinality) = self.document.search_triples(str(subject), "", "")
        if cardinality == 0:
            return None
        random_offset = random.randint(0, cardinality - 1)  # randint is inclusive on both ends
        (triples,
         cardinality) = self.document.search_triples(str(subject),
                                                     "",
                                                     "",
                                                     limit=1,
                                                     offset=random_offset)
        for triple in triples:
            object = tuple_to_triple(triple)[2]
            if type(object) is URIRef:
                return object
        return None
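
# Hedged usage sketch (not part of the original example). The HDT path and output
# filename are assumptions, and tuple_to_triple / tuple_to_ntriple are helper
# functions this class already relies on but which are defined elsewhere.
if __name__ == "__main__":
    dataset = HDT_Dataset(file="dataset.hdt")  # assumed path
    print("triples:", len(dataset))
    print("distinct subjects:", dataset.distinct_subjects)
    subjects = dataset.random_subjects(size=10, weighted=True)
    with open("sample_edges.nt", "w") as out_file:
        written = dataset.outgoing_edges(subjects, file=out_file)
    print("wrote", written, "triples for", len(subjects), "subjects")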
예제 #28
0
class SubP:

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)

        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)

        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)

        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            IdentifierPosition.Predicate)

        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentProperty",
            IdentifierPosition.Predicate)

        self.graph = nx.DiGraph()

        self.equi_graph_manager = None  #equiClassManager(path_eq)
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        self.suggestion_on_relations = []  # from the manual decision and Joe's sameAs data (triples)
        self.leaf_classes = set()

        print('finished initialization')

    def setup_graph(self):
        print('set up the graph')
        (subclass_triple_ids,
         cardinality) = self.enquiry(query=(0, self.id_subPropertyOf, 0),
                                     mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))

        print('there are ', len(collect_pairs), 'edges')
        self.graph.add_edges_from(collect_pairs)

    def convert_to_id(self, term):
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        if id == 2601100675:
            return "akt742:Intangible-Thing"
            # this is the only one that has two different ids (as subject and object)
        else:
            return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def enquiry(self, query, mode="subp"):
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # examine the filtered part first
            pass

    def print_info(self, sbj, obj):
        predicate_names = [
            "http://sw.cyc.com/CycAnnotations_v1#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#label"
        ]

        s_domain = tldextract.extract(sbj).domain
        o_domain = tldextract.extract(obj).domain
        # filter that domain
        # if (s_domain != DOMAIN and o_domain != DOMAIN):
        #     # print (DOMAIN)
        print('SUBJECT: ', sbj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(sbj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')
        print('OBJECT: ', obj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(obj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels  :', o, '\n')

        print('\n\n========================\n\n')

    def export_cycle(self):
        simp_c = list(nx.simple_cycles(self.graph))
        print('find simple cycle in graph')
        print('there are ', len(simp_c), ' simple cycles')

        count1 = 0
        count_others = 0
        count_sameas = 0
        count_eqProp = 0
        count_bigger = 0

        collect_self_loop = []
        collect_eq = []
        collect_others = []
        collect_bigger = []
        for c in simp_c:
            if len(c) == 1:
                count1 += 1
                collect_self_loop.append(c)
            elif len(c) == 2:
                # print (c)
                # for n in c:
                #     t = self.convert_to_term(n)
                #     print ('\t', t)
                # print ('\n')

                l_term = self.convert_to_term(c[0])
                r_term = self.convert_to_term(c[1])

                # id_equivalentProperty
                (subclass_triple_ids, cardinality) = self.enquiry(
                    query=(c[0], self.id_equivalentProperty, c[1]),
                    mode="default")

                # if (self.equi_graph_manager.test_equivalent(l_term, r_term)):
                #     print ('There is a owl:sameAs relation in between')
                #     count_sameas += 1
                #     collect_eq.append(c)

                if (cardinality > 0):
                    print('There is a owl:equivalentProperty in between')
                    count_eqProp += 1
                    collect_eq.append(c)

                else:
                    # self.print_info(c[0], l_term, c[1], r_term)
                    # print ('a longer one for manual decision:',c )
                    # count_others += 1
                    collect_others.append(c)
                count_others += 1
            else:
                count_bigger += 1
                collect_bigger.append((c[0], c[1]))
                collect_bigger.append((c[1], c[2]))
                collect_bigger.append((c[2], c[0]))

        print('there are ', count1, ' reflexive cycles')

        print('there are ', count_sameas, ' sameAs relations')
        print('there are ', count_eqProp, ' eqProp relations')
        print('there are ', count_others, ' size-two cycles')
        print('there are ', count_bigger, ' bigger cycles')
        # export self-loop cycles:

        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            # write to file
            # print ('collect self loop: ',collect_self_loop)
            for [s_id] in collect_self_loop:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = s_term
                writer.writerow([s_id, s_term, s_id, o_term, 'remove',
                                 'o'])  # removed from automatic method
            for (s_id, o_id) in collect_eq:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'e'])  # removed from automatic method

            for (s_id, o_id) in collect_others:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                self.print_info(s_term, o_term)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 '2'])  # removed from manual step
                writer.writerow([o_id, o_term, s_id, s_term, 'remove',
                                 '2'])  # removed from manual step

            for (s_id, o_id) in collect_bigger:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # print ('===a longer cycle ===', c)

                writer.writerow([s_id, s_term, o_id, o_term, 'remove',
                                 'x'])  # removed from manual step

    def load_removed(self):
        # 'pre-subP.csv'
        subp_file = open('pre-subP.csv', 'r')
        reader = csv.DictReader(subp_file)
        coll_removed = []
        for row in reader:
            s_id = int(row["SUBJECT_ID"])
            # s = row["SUBJECT"]
            o_id = int(row["OBJECT_ID"])
            sug = row["SUGGESTION"]  # should be remove

            if (sug == 'remove'):
                coll_removed.append((s_id, o_id))
        print('number of removed edges:', len(coll_removed))
        self.graph.remove_edges_from(coll_removed)

    def test_cycle(self):
        try:
            c = nx.find_cycle(self.graph)  # change to simple_cycles ??
            print('cycle = ', c)

        except Exception as e:
            # hint_not_working = True
            print('no cycle')

    def export_graph_nt(self, name):
        g = Graph()
        for (s_id, o_id) in self.graph.edges:
            s_term = self.convert_to_term(s_id)
            o_term = self.convert_to_term(o_id)
            bob = URIRef("http://www.w3.org/2000/01/rdf-schema#subPropertyOf")
            g.add((URIRef(s_term), bob, URIRef(o_term)))

        # print("--- printing raw triples ---")
        # for s, p, o in g:
        #     print((s, p, o))

        g.serialize(destination=name, format='nt')
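
# Hedged usage sketch (not part of the original example): the order of calls the
# methods above imply. PATH_LOD, PATH_EQ and the CSV `file_name` used inside
# export_cycle are module-level constants assumed to be defined elsewhere.
if __name__ == "__main__":
    subp = SubP()              # loads the HDT file at PATH_LOD
    subp.setup_graph()         # build the rdfs:subPropertyOf digraph
    subp.export_cycle()        # diagnose cycles and write suggestions to CSV
    subp.load_removed()        # drop the edges marked "remove" in pre-subP.csv
    subp.test_cycle()          # should now print "no cycle"
    subp.export_graph_nt("subPropertyOf-cleaned.nt")  # assumed output name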
예제 #29
0
import csv
import urllib

from hdt import HDTDocument
import pandas as pd

from constants import SEP
from settings import (HDT_FILE, DATASET_FILE, OUTPUT_DATASET_FILE, STATS_FILE,
                      PREDICATES_EXCLUDED, QUERY, RATIO)
from functions import get_sujeto_atr, get_predicado_atr, get_objeto_atr

# HDTDocument creation
document = HDTDocument(HDT_FILE)

# Query the triples by subject/predicate/object
(triples, cardinality) = document.search_triples("", "", QUERY)


def query(query):
    print("{}: {} objects.".format(query, cardinality))

    #%% Processing
    # triple = s p o
    lista_objetos = []
    for triple in triples:
        s, p, o = triple
        sujeto_descripcion, sujeto_URI = get_sujeto_atr(s)
        lista_objetos.append(sujeto_URI[1:-1])

    numero = 0
예제 #30
0
class KBEntityLinker(Component, Serializable):
    """
        This class extracts from the knowledge base candidate entities for the entity mentioned in the question and then
        extracts triplets from Wikidata for the extracted entity. Candidate entities are searched in the dictionary
        where keys are titles and aliases of Wikidata entities and values are lists of tuples (entity_title, entity_id,
        number_of_relations). Candidate entities are first searched in the dictionary by keys, where the keys are
        entities extracted from the question; if nothing is found, entities are searched in the dictionary using
        Levenshtein distance between the entity and the keys (titles) in the dictionary.
    """
    def __init__(self,
                 load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 who_entities_filename: Optional[str] = None,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 descr_rank_score_thres: float = 0.0,
                 freq_dict_filename: Optional[str] = None,
                 entity_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 include_mention: bool = False,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: name of file which maps entity id to name
            who_entities_filename: file with the list of entities in Wikidata, which can be answers to questions
                with "Who" pronoun, i.e. humans, literary characters etc.
            save_path: path where to save inverted index files
            q2descr_filename: name of file which maps entity id to description
            descr_rank_score_thres: if the score of the entity description is less than threshold, the entity is not
                added to output list
            freq_dict_filename: filename with frequency dictionary of Russian words
            entity_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, which will be used for building of inverted index
            label_rel: relation in the knowledge base which connects entity ids and entity titles
            descr_rel: relation in the knowledge base which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            include_mention: whether to leave or delete entity mention from the sentence before passing to BERT ranker
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.who_entities_filename = who_entities_filename
        self.q2descr_filename = q2descr_filename
        self.descr_rank_score_thres = descr_rank_score_thres
        self.freq_dict_filename = freq_dict_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.entity_ranker = entity_ranker
        self.use_descriptions = use_descriptions
        self.include_mention = include_mention
        if self.use_descriptions and self.entity_ranker is None:
            raise ValueError("No entity ranker is provided!")

        if self.use_prefix_tree:
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            elif self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            else:
                raise ValueError(
                    f'unsupported kb_format value {self.kb_format}')
            self.inverted_index_builder()
            self.save()
        else:
            self.load()

    def load_freq_dict(self, freq_dict_filename: str):
        with open(str(expand_path(freq_dict_filename)), 'r') as fl:
            lines = fl.readlines()
        pos_freq_dict = defaultdict(list)
        for line in lines:
            line_split = line.strip('\n').split('\t')
            if re.match("[\d]+\.[\d]+", line_split[2]):
                pos_freq_dict[line_split[1]].append(
                    (line_split[0], float(line_split[2])))
        nouns_with_freq = pos_freq_dict["s"]
        self.nouns_dict = {noun: freq for noun, freq in nouns_with_freq}

    def load(self) -> None:
        self.inverted_index = load_pickle(self.load_path /
                                          self.inverted_index_filename)
        self.entities_list = load_pickle(self.load_path /
                                         self.entities_list_filename)
        self.q2name = load_pickle(self.load_path / self.q2name_filename)
        if self.who_entities_filename:
            self.who_entities = load_pickle(self.load_path /
                                            self.who_entities_filename)
        if self.freq_dict_filename:
            self.load_freq_dict(self.freq_dict_filename)

    def save(self) -> None:
        save_pickle(self.inverted_index,
                    self.save_path / self.inverted_index_filename)
        save_pickle(self.entities_list,
                    self.save_path / self.entities_list_filename)
        save_pickle(self.q2name, self.save_path / self.q2name_filename)
        if self.q2descr_filename is not None:
            save_pickle(self.q2descr, self.save_path / self.q2descr_filename)

    def __call__(
        self,
        entity_substr_batch: List[List[str]],
        entity_positions_batch: List[List[List[int]]] = None,
        context_tokens: List[List[str]] = None
    ) -> Tuple[List[List[List[str]]], List[List[List[float]]]]:
        entity_ids_batch = []
        confidences_batch = []
        if entity_positions_batch is None:
            entity_positions_batch = [[[0] for i in range(len(entities_list))]
                                      for entities_list in entity_substr_batch]
        for entity_substr_list, entity_positions_list in zip(
                entity_substr_batch, entity_positions_batch):
            entity_ids_list = []
            confidences_list = []
            for entity_substr, entity_pos in zip(entity_substr_list,
                                                 entity_positions_list):
                context = ""
                if self.use_descriptions:
                    if self.include_mention:
                        context = ' '.join(
                            context_tokens[:entity_pos[0]] + ["[ENT]"] +
                            context_tokens[entity_pos[0]:entity_pos[-1] + 1] +
                            ["[ENT]"] + context_tokens[entity_pos[-1] + 1:])
                    else:
                        context = ' '.join(context_tokens[:entity_pos[0]] +
                                           ["[ENT]"] +
                                           context_tokens[entity_pos[-1] + 1:])
                entity_ids, confidences = self.link_entity(
                    entity_substr, context)
                entity_ids_list.append(entity_ids)
                confidences_list.append(confidences)
            entity_ids_batch.append(entity_ids_list)
            confidences_batch.append(confidences_list)

        return entity_ids_batch, confidences_batch

    def link_entity(self,
                    entity: str,
                    context: Optional[str] = None,
                    template_found: Optional[str] = None,
                    cut_entity: bool = False) -> Tuple[List[str], List[float]]:
        confidences = []
        if not entity:
            entities_ids = ['None']
        else:
            candidate_entities = self.candidate_entities_inverted_index(entity)
            if cut_entity and candidate_entities and len(
                    entity.split()) > 1 and candidate_entities[0][3] == 1:
                entity = self.cut_entity_substr(entity)
                candidate_entities = self.candidate_entities_inverted_index(
                    entity)
            candidate_entities, candidate_names = self.candidate_entities_names(
                entity, candidate_entities)
            entities_ids, confidences, srtd_cand_ent = self.sort_found_entities(
                candidate_entities, candidate_names, entity, context)
            if template_found:
                entities_ids = self.filter_entities(entities_ids,
                                                    template_found)

        return entities_ids, confidences

    def cut_entity_substr(self, entity: str):
        word_tokens = nltk.word_tokenize(entity.lower())
        word_tokens = [
            word for word in word_tokens if word not in self.stopwords
        ]
        normal_form_tokens = [
            self.morph.parse(word)[0].normal_form for word in word_tokens
        ]
        words_with_freq = [(word, self.nouns_dict.get(word, 0.0))
                           for word in normal_form_tokens]
        words_with_freq = sorted(words_with_freq, key=lambda x: x[1])
        return words_with_freq[0][0]

    def candidate_entities_inverted_index(
            self, entity: str) -> List[Tuple[Any, Any, Any]]:
        word_tokens = nltk.word_tokenize(entity.lower())
        word_tokens = [
            word for word in word_tokens if word not in self.stopwords
        ]
        candidate_entities = []

        for tok in word_tokens:
            if len(tok) > 1:
                found = False
                if tok in self.inverted_index:
                    candidate_entities += self.inverted_index[tok]
                    found = True

                if self.lemmatize:
                    morph_parse_tok = self.morph.parse(tok)[0]
                    lemmatized_tok = morph_parse_tok.normal_form
                    if lemmatized_tok != tok and lemmatized_tok in self.inverted_index:
                        candidate_entities += self.inverted_index[
                            lemmatized_tok]
                        found = True

                if not found and self.use_prefix_tree:
                    words_with_levens_1 = self.searcher.search(tok, d=1)
                    for word in words_with_levens_1:
                        candidate_entities += self.inverted_index[word[0]]
        candidate_entities = Counter(candidate_entities).most_common()
        candidate_entities = [(entity_num, self.entities_list[entity_num], entity_freq, count) for \
                                                (entity_num, entity_freq), count in candidate_entities]

        return candidate_entities

    def sort_found_entities(
        self,
        candidate_entities: List[Tuple[int, str, int]],
        candidate_names: List[List[str]],
        entity: str,
        context: str = None
    ) -> Tuple[List[str], List[float], List[Tuple[str, str, int, int]]]:
        entities_ratios = []
        for candidate, entity_names in zip(candidate_entities,
                                           candidate_names):
            entity_num, entity_id, num_rels, tokens_matched = candidate
            fuzz_ratio = max(
                [fuzz.ratio(name.lower(), entity) for name in entity_names])
            entities_ratios.append(
                (entity_num, entity_id, tokens_matched, fuzz_ratio, num_rels))

        srtd_with_ratios = sorted(entities_ratios,
                                  key=lambda x: (x[2], x[3], x[4]),
                                  reverse=True)
        if self.use_descriptions:
            log.debug(f"context {context}")
            id_to_score = {
                entity_id: (tokens_matched, score)
                for _, entity_id, tokens_matched, score, _ in
                srtd_with_ratios[:30]
            }
            entity_ids = [
                entity_id for _, entity_id, _, _, _ in srtd_with_ratios[:30]
            ]
            scores = self.entity_ranker.rank_rels(context, entity_ids)
            entities_with_scores = [(entity_id, id_to_score[entity_id][0],
                                     id_to_score[entity_id][1], score)
                                    for entity_id, score in scores]
            entities_with_scores = sorted(entities_with_scores,
                                          key=lambda x: (x[1], x[2], x[3]),
                                          reverse=True)
            entities_with_scores = [entity for entity in entities_with_scores if \
                                   (entity[3] > self.descr_rank_score_thres or entity[2] == 100.0)]
            log.debug(f"entities_with_scores {entities_with_scores[:10]}")
            entity_ids = [entity for entity, _, _, _ in entities_with_scores]
            confidences = [score for _, _, _, score in entities_with_scores]
        else:
            entity_ids = [ent[1] for ent in srtd_with_ratios]
            confidences = [float(ent[2]) * 0.01 for ent in srtd_with_ratios]

        return entity_ids, confidences, srtd_with_ratios

    def candidate_entities_names(
        self, entity: str, candidate_entities: List[Tuple[int, str, int]]
    ) -> Tuple[List[Tuple[int, str, int]], List[List[str]]]:
        entity_length = len(entity)
        candidate_names = []
        candidate_entities_filter = []
        for candidate in candidate_entities:
            entity_num = candidate[0]
            entity_names = []

            entity_names_found = self.q2name[entity_num]
            if len(entity_names_found[0]) < 6 * entity_length:
                entity_name = entity_names_found[0]
                entity_names.append(entity_name)
                if len(entity_names_found) > 1:
                    for alias in entity_names_found[1:]:
                        entity_names.append(alias)
                candidate_names.append(entity_names)
                candidate_entities_filter.append(candidate)

        return candidate_entities_filter, candidate_names

    def inverted_index_builder(self) -> None:
        log.debug("building inverted index")
        entities_set = set()
        id_to_label_dict = defaultdict(list)
        id_to_descr_dict = {}
        label_to_id_dict = {}
        label_triplets = []
        alias_triplets_list = []
        descr_triplets = []
        if self.kb_format == "hdt":
            label_triplets, c = self.doc.search_triples("", self.label_rel, "")
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    alias_triplets, c = self.doc.search_triples(
                        "", alias_rel, "")
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                descr_triplets, c = self.doc.search_triples(
                    "", self.descr_rel, "")

        if self.kb_format == "sqlite3":
            subject, relation, obj = self.sql_column_names
            query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                    f'WHERE {relation} = "{self.label_rel}";'
            res = self.cursor.execute(query)
            label_triplets = res.fetchall()
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                            f'WHERE {relation} = "{alias_rel}";'
                    res = self.cursor.execute(query)
                    alias_triplets = res.fetchall()
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                        f'WHERE {relation} = "{self.descr_rel}";'
                res = self.cursor.execute(query)
                descr_triplets = res.fetchall()

        for triplets in [label_triplets] + alias_triplets_list:
            for triplet in triplets:
                entities_set.add(triplet[0])
                if triplet[2].endswith(self.lang_str):
                    label = triplet[2].replace(self.lang_str,
                                               '').replace('"', '')
                    id_to_label_dict[triplet[0]].append(label)
                    label_to_id_dict[label] = triplet[0]

        for triplet in descr_triplets:
            entities_set.add(triplet[0])
            if triplet[2].endswith(self.lang_str):
                descr = triplet[2].replace(self.lang_str, '').replace('"', '')
                id_to_descr_dict[triplet[0]].append(descr)

        popularities_dict = {}
        for entity in entities_set:
            if self.kb_format == "hdt":
                all_triplets, number_of_triplets = self.doc.search_triples(
                    entity, "", "")
                popularities_dict[entity] = number_of_triplets
            if self.kb_format == "sqlite3":
                subject, relation, obj = self.sql_column_names
                query = f'SELECT COUNT({obj}) FROM {self.sql_table_name} WHERE {subject} = "{entity}";'
                res = self.cursor.execute(query)
                popularities_dict[entity] = res.fetchall()[0][0]

        entities_dict = {entity: n for n, entity in enumerate(entities_set)}

        inverted_index = defaultdict(list)
        for label in label_to_id_dict:
            tokens = re.findall(self.re_tokenizer, label.lower())
            for tok in tokens:
                if len(tok) > 1 and tok not in self.stopwords:
                    inverted_index[tok].append(
                        (entities_dict[label_to_id_dict[label]],
                         popularities_dict[label_to_id_dict[label]]))
        self.inverted_index = dict(inverted_index)
        self.entities_list = list(entities_set)
        self.q2name = [
            id_to_label_dict[entity] for entity in self.entities_list
        ]
        self.q2descr = []
        if id_to_descr_dict:
            self.q2descr = [
                id_to_descr_dict[entity] for entity in self.entities_list
            ]

    def filter_entities(self, entities: List[str],
                        template_found: str) -> List[str]:
        if template_found in ["who is xxx?", "who was xxx?"]:
            entities = [
                entity for entity in entities if entity in self.who_entities
            ]
        if template_found in ["what is xxx?", "what was xxx?"]:
            entities = [
                entity for entity in entities
                if entity not in self.who_entities
            ]
        return entities
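
# Hedged usage sketch (not part of the original example): linking one mention with a
# pre-built inverted index. All paths and file names below are assumptions; in
# DeepPavlov these values normally come from a pipeline config.
if __name__ == "__main__":
    linker = KBEntityLinker(
        load_path="~/.deeppavlov/downloads/wikidata_eng",      # assumed folder
        inverted_index_filename="inverted_index_eng.pickle",   # assumed file name
        entities_list_filename="entities_list.pickle",         # assumed file name
        q2name_filename="wiki_eng_q_to_name.pickle",           # assumed file name
        build_inverted_index=False,                            # load an existing index
    )
    entity_ids_batch, confidences_batch = linker([["Lake Baikal"]])
    print(entity_ids_batch, confidences_batch)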