def test_generate():
    path = "tests"
    # dir_path = os.path.dirname(os.path.realpath(path)) + "/" + path
    dir_path = os.getcwd() + "/tests"
    src_path = dir_path + "/test2.turtle"
    dest_path = dir_path + "/test2.hdt"
    print("Source: %s" % src_path)
    print("Dest : %s" % dest_path)

    doc = hdt.generate_hdt(src_path, "aff4://foo")
    retcode = doc.save_to_hdt(dest_path)
    assert retcode == 0

    document = HDTDocument(dest_path)
    (triples, triplesCard) = document.search_triples("missingtriple", "", "")
    assert triplesCard == 0

    (triples, triplesCard) = document.search_triples("", "", "")
    assert triplesCard == 12

    (triples, triplesCard) = document.search_triples(
        "aff4://5aea2dd0-32b4-4c61-a9db-677654be6f83//test_images/AFF4-L/dream.txt",
        "", "")
    assert triplesCard == 12

    os.unlink(dest_path)
def extract_wikidata(classname, typeproperty):
    doc = HDTDocument("kg/wikidata-20170313-all-BETA.hdt")
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    wd_classes = {
        "BoxerWikidata": "Q11338576",
        "CyclistWikidata": "Q2309784",
        "CapitalWikidata": "Q5119",
        "CountryWikidata": "Q6256",
        "MetroAreaWikidata": "Q1907114",
        "GeographicRegionWikidata": "Q82794",
        "FilmFestivalWikidata": "Q220505",
    }
    edgelist = []
    instances = set()
    (triples, count) = doc.search_triples("", f"{wdt}{typeproperty}", f"{wd}{wd_classes[classname]}")
    for triple in triples:
        instances.add(triple[0])
    for instance in tqdm(instances, total=len(instances)):
        (triples, count) = doc.search_triples(instance, "", "")
        for triple in triples:
            if not triple[1] in blacklist:
                edgelist.append((triple[0], triple[1]))
    return list(set(edgelist))  # Exclude duplicate entity-property relations
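# Hedged usage sketch for extract_wikidata: assumes the Wikidata dump path above
# exists and that `blacklist` (a module-level collection of predicates to skip)
# is defined elsewhere in the file; "P31" is Wikidata's "instance of" property.
edges = extract_wikidata("CountryWikidata", "P31")
print(len(edges), "unique (entity, property) pairs")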
def __init__(self, dataset_name='lcquad'):
    '''
    Setup models, indices, embeddings and connection to the KG through the HDT API
    '''
    # connect to the entity and predicate catalogs
    self.e_index = IndexSearch('dbpedia201604e')
    self.p_index = IndexSearch('dbpedia201604p')

    # load embeddings
    self.word_vectors = load_embeddings(embeddings_path, embeddings_choice)
    self.p_vectors = load_embeddings(embeddings_path, 'fasttext_p_labels')

    # load pre-trained question type classification model
    with open(model_path+'qtype_lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
        self.model_settings = pkl.load(f)
    self.qt_model = build_qt_inference_model(self.model_settings)
    self.qt_model.load_weights(model_path+'_qtype_weights.best.hdf5', by_name=True)

    # load pre-trained question parsing model
    with open(model_path+'lcquad_%s.pkl'%(embeddings_choice), 'rb') as f:
        ep_model_settings = pkl.load(f)
    self.ep_model = build_ep_inference_model(ep_model_settings)
    # load weights
    # ep_model.load_weights('checkpoints/_'+modelname+'_weights.best.hdf5', by_name=True)
    self.ep_model.load_weights(model_path+'2hops-types.h5', by_name=True)

    # connect to the knowledge graph hdt file
    self.kg = HDTDocument(hdt_path+hdt_file)
def extract_by_instance(fn, wdt_class, property, out=True):
    doc = HDTDocument(fn)
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    properties = {"instance_of": "P31", "occupation": "P106"}
    instances = set()
    (triples, count) = doc.search_triples("", f"{wdt}{properties[property]}", f"{wd}{wdt_class}")
    # the pattern cardinality gives tqdm its total; len(instances) would be 0 here
    for triple in tqdm(triples, total=count):
        instances.add(triple[0])
    with open(f'{wdt_class}.csv', "w") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for instance in tqdm(instances, total=len(instances)):
            if out:
                pattern = (instance, "", "")
            else:
                pattern = ("", "", instance)
            (triples, count) = doc.search_triples(*pattern)
            for triple in triples:
                if out:
                    spamwriter.writerow([triple[0], triple[1]])
                else:
                    spamwriter.writerow([triple[2], triple[1]])
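# Hedged usage sketch: with out=True this writes Q6256.csv, one (subject, predicate)
# row per outgoing triple of each country instance; the dump path is a placeholder.
extract_by_instance("kg/wikidata-20170313-all-BETA.hdt", "Q6256", "instance_of", out=True)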
def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
    self.hdt = HDTDocument(path_hdt)
    self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    self.id_subClassOf = self.hdt.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
    self.id_equivalentClass = self.hdt.convert_term(
        "http://www.w3.org/2002/07/owl#equivalentClass", IdentifierPosition.Predicate)
    self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
    self.id_subPropertyOf = self.hdt.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subPropertyOf", IdentifierPosition.Predicate)
    self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
    self.id_equivalentProperty = self.hdt.convert_term(
        "http://www.w3.org/2002/07/owl#equivalentProperty", IdentifierPosition.Predicate)
    self.graph = nx.DiGraph()
    self.equi_graph_manager = None  # equiClassManager(path_eq)
    print('set up the equivalence class manager')
    self.diagnosed_relations = []  # the result
    self.suggestion_on_relations = []  # from the manual decision and Joe's sameAs data. Triple
    self.leaf_classes = set()
    print('finished initialization')
class HDT(KG):

    def __init__(self, hdt_file_path: str):
        self.hdt = HDTDocument(hdt_file_path)

    def predicate_objects(self, subject: str) -> Iterator[Tuple[str, str]]:
        (triples, cardinality) = self.hdt.search_triples(subject, "", "")
        for s, p, o in triples:
            yield p, o

    def subjects(self, predicate: str, obj: str) -> Iterator[str]:
        (triples, cardinality) = self.hdt.search_triples("", predicate, obj)
        for s, p, o in triples:
            yield s

    def triples(self, subject: str, predicate: str, obj: str) -> Iterator[Tuple[str, str, str]]:
        (triples, cardinality) = self.hdt.search_triples(subject, predicate, obj)
        for s, p, o in triples:
            yield (s, p, o)

    def objects(self, subject: str, predicate: str) -> Iterator[str]:
        (triples, cardinality) = self.hdt.search_triples(subject, predicate, "")
        for s, p, o in triples:
            yield o

    def count(self, subject: str, predicate: str, obj: str) -> int:
        (triples, cardinality) = self.hdt.search_triples(subject, predicate, obj)
        return cardinality

    def total_triples(self) -> int:
        return self.hdt.total_triples

    def nb_subjects(self) -> int:
        return self.hdt.nb_subjects

    def nb_predicates(self) -> int:
        return self.hdt.nb_predicates

    def nb_objects(self) -> int:
        return self.hdt.nb_objects

    def nb_shared(self) -> int:
        return self.hdt.nb_shared

    def get_schema_description(self, resource: str) -> Optional[str]:
        """Get the English description of the specified resource via the
        http://schema.org/description property. Trailing double quotes and @en are removed."""
        for o in self.objects(resource, "http://schema.org/description"):
            if o.endswith("@en"):
                # strip the leading double quote and the trailing '"@en'
                input_str = o[1:len(o) - 4]
                # input_str = re.sub(r'\d+', '', input_str)  # remove numbers
                # input_str = input_str.translate(str.maketrans("", "", string.punctuation))  # punctuation removal
                # input_str = input_str.strip().lower()  # remove leading/trailing spaces and lowercase
                return input_str
        return None
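# Hedged usage sketch for the HDT wrapper above; the file path and the resource
# URI are placeholders, not names from the original project.
kg = HDT("datasets/example.hdt")
print(kg.count("", "", ""))  # cardinality of the ?s ?p ?o pattern
for p, o in kg.predicate_objects("http://dbpedia.org/resource/Berlin"):
    print(p, o)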
def __init__(self,
             hdt_path: Optional[str] = None,
             graph: Optional[HDTDocument] = None,
             redis_client: Optional[redis.Redis] = None):
    self.cache = redis_client
    if graph:
        self.graph = graph
    else:
        self.graph = HDTDocument(hdt_path, map=False, progress=True)
def load_KG(path_file, predicate_string, orientation=True):
    # load the file according to the given predicate
    hdt_file = HDTDocument(path_file)
    (triples, cardinality) = hdt_file.search_triples('', predicate_string, '')
    for (s, _, o) in triples:
        if orientation:
            graph.add_edge(s, o)  # `graph` is a module-level networkx graph
        else:
            graph.add_edge(o, s)
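# Hedged usage sketch: load_KG writes into a module-level networkx graph; both
# the graph setup and the HDT path here are assumptions for illustration.
import networkx as nx
graph = nx.DiGraph()
load_KG("datasets/example.hdt", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
print(graph.number_of_edges())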
def get_nb_triples(file_path: str, format: str) -> int:
    if format == 'nt':
        return wccount(file_path)
    elif format == 'hdt':
        doc = HDTDocument(file_path, indexed=False)
        _, nb_triples = doc.search_triples("", "", "")
        return nb_triples
    else:
        raise Exception(f'Unsupported RDF format: "{format}"')
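# Hedged usage sketch; the path is a placeholder. For HDT files the count is the
# cardinality of the ?s ?p ?o pattern, so no triples are materialised.
print(get_nb_triples("datasets/example.hdt", "hdt"))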
def filter_answer_by_class(classes, answers_ids):
    classes_ids = [_id for e in classes for _id in e]
    kg = HDTDocument(hdt_path+hdt_file)
    a_ids = [_id for e in answers_ids for _id in e]
    a_ids = kg.filter_types(a_ids, classes_ids)
    kg.remove()
    a_ids = [_id for _a_ids in a_ids for _id in _a_ids]
    answers_ids = [{_id: a_score} for e in answers_ids
                   for _id, a_score in e.items() if _id in a_ids]
    return answers_ids
def parsefile(self, file_path):
    """Parse an HDT file as an N-Triples file."""
    from hdt import HDTDocument
    doc = HDTDocument(file_path, indexed=False)
    iterator, _ = doc.search_triples("", "", "")
    self.iterator = iterator
    self.parse()
def __init__(self, wiki_filename: str, **kwargs) -> None:
    """
    Args:
        wiki_filename: hdt file with wikidata
        **kwargs:
    """
    log.debug(f'__init__ wiki_filename: {wiki_filename}')
    wiki_path = expand_path(wiki_filename)
    self.document = HDTDocument(str(wiki_path))
def __init__(self, **kwargs):
    self.__source = kwargs.get("file", None)
    if self.__source is None:
        raise TypeError("missing required 'file' keyword argument")
    try:
        self.document = HDTDocument(self.__source)
    except Exception as e:
        logger.exception("Could not load HDT File from {}.".format(self.__source))
        raise e
    self.card = None
def generate():
    hdt_file = None
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term("http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    count = 0
    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, p, o) in subclass_triples:
            # store it in a csv file
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            # print([s_id, s, o_id, o])
            count += 1
    print('total entries = ', count)
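# Hedged usage sketch: the HDT source is selected via argv[1], e.g.
#   python export_subclassof.py lod
# (the script name is hypothetical; 'lod' selects PATH_LOD, anything else PATH_DBpedia).
if __name__ == '__main__':
    generate()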
def __init__(self, path_hdt=PATH_LOD):
    self.hdt = HDTDocument(path_hdt)
    self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    self.id_subClassOf = self.hdt.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate)
    self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
    self.id_equivalentClass = self.hdt.convert_term(
        "http://www.w3.org/2002/07/owl#equivalentClass", IdentifierPosition.Predicate)
    self.graph = nx.DiGraph()
    self.equi_graph = nx.Graph()
    self.diagnosed_relations = {}
    self.diagnosed_classes = {}
    self.leaf_classes = set()
def extract_classes(fn):
    doc = HDTDocument(fn)
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    types = set()
    types_dct = {}
    (triples, count) = doc.search_triples("", rdf_type, "")
    for triple in tqdm(triples, total=count):
        types.add(triple[2])
    for type in tqdm(types):
        (instances, instance_count) = doc.search_triples("", rdf_type, type)
        types_dct[type] = instance_count
    return types_dct
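# Hedged usage sketch printing the ten most populated classes; the dump path is
# a placeholder.
class_counts = extract_classes("datasets/dbpedia.hdt")
for cls, n in sorted(class_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(n, cls)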
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    # load standard RDF formats using rdflib
    if format == 'nt' or format == 'ttl':
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'hdt':
        # load HDTDocument without additional indexes
        # they are not needed since we only search by "?s ?p ?o"
        doc = HDTDocument(file_path, indexed=False)
        iterator, nb_triples = doc.search_triples("", "", "")
    return iterator, nb_triples
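# Hedged usage sketch; the path is a placeholder. In the 'hdt' branch the total
# is known up front from the pattern cardinality, before consuming the iterator.
triple_iter, total = get_rdf_reader("datasets/example.hdt", format='hdt')
print(total)
for s, p, o in triple_iter:
    print(s, p, o)
    break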
def __init__(self, wiki_filename: str, lang: str = "@en", **kwargs) -> None:
    """
    Args:
        wiki_filename: hdt file with wikidata
        lang: Russian or English language
        **kwargs:
    """
    log.debug(f'__init__ wiki_filename: {wiki_filename}')
    wiki_path = expand_path(wiki_filename)
    self.description_rel = "http://schema.org/description"
    self.lang = lang
    self.document = HDTDocument(str(wiki_path))
def loadMetadata(self, zip):
    # Load the turtle metadata.
    aff4cache = os.path.join(expanduser("~"), ".aff4")
    if not os.path.exists(aff4cache):
        try:
            os.makedirs(aff4cache)
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
    if not os.path.exists(cached_turtle):
        self.createHDTviaLib(zip, cached_turtle)
    if os.path.exists(cached_turtle):
        # assume we have a HDT cache of turtle at this point
        self.hdt = HDTDocument(cached_turtle)
def load():
    output_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'
    # NB: only the CSV is read below; the HDT handle is opened but not used here
    with open(output_filename, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row['SUBJECT_ID'], row['OBJECT_ID'])
            print(row['SUBJECT'], row['OBJECT'])
            print('----------------------------------------------')
def evaluate_subgraph_extraction(nhops, e_field, p_field, limit=None, show_errors=False):
    '''
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    '''
    samples = mongo.get_sample(limit=limit)
    # iterate over the cursor
    accs = []
    for doc in samples:
        # get correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]
        # extract the subgraph
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(nhops, p_uris, namespace, True)
        entities, _, _ = kg.compute_hops(e_ids)
        kg.remove()
        # check if we hit the answer set
        if 'answers_ids' in doc:
            correct_answers_ids = set(doc['answers_ids'])
            # print(correct_answers_ids)
            n_hits = len(correct_answers_ids & set(entities))
            # accuracy
            acc = float(n_hits) / len(correct_answers_ids)
            accs.append(acc)
            if show_errors and acc < 1:
                print(doc['question'])
                print(doc['entity_ids'])
                print(doc['predicate_uris'])
    return accs
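# Hedged usage sketch; configure_hops/compute_hops come from the project's custom
# pyHDT fork (not upstream pyHDT), and the field names mirror the lookups above.
accs = evaluate_subgraph_extraction(2, 'entity_ids', 'predicate_uris', limit=100)
print('mean answer recall: %.3f' % (sum(accs) / len(accs)))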
class HDTFileConnector(DatabaseConnector):
    """A HDTFileConnector searches for RDF triples in an HDT file"""

    def __init__(self, file):
        super(HDTFileConnector, self).__init__()
        self._hdt = HDTDocument(file)

    def search_triples(self, subject, predicate, obj, limit=0, offset=0):
        """
            Get an iterator over all RDF triples matching a triple pattern.

            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - object ``string`` - Object of the triple pattern
                - limit ``int=0`` ``optional`` - LIMIT modifier, i.e., maximum number of RDF triples to read
                - offset ``int=0`` ``optional`` - OFFSET modifier, i.e., number of RDF triples to skip

            Returns:
                A tuple (iterator over RDF triples matching the given triple pattern, pattern cardinality)
        """
        # unbound variables (None or SPARQL-style "?x") become the empty string
        subject = subject if (subject is not None) and (not subject.startswith('?')) else ""
        predicate = predicate if (predicate is not None) and (not predicate.startswith('?')) else ""
        obj = obj if (obj is not None) and (not obj.startswith('?')) else ""
        return self._hdt.search_triples(subject, predicate, obj, offset=offset, limit=limit)

    @property
    def nb_triples(self):
        return self._hdt.total_triples

    @property
    def nb_subjects(self):
        """Get the number of subjects in the database"""
        return self._hdt.nb_subjects

    @property
    def nb_predicates(self):
        """Get the number of predicates in the database"""
        return self._hdt.nb_predicates

    @property
    def nb_objects(self):
        """Get the number of objects in the database"""
        return self._hdt.nb_objects

    @staticmethod
    def from_config(config):
        """Build a HDTFileConnector from a config dictionary"""
        if not os.path.isfile(config["file"]):
            raise Exception("HDT file not found: {}".format(config["file"]))
        return HDTFileConnector(config["file"])
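# Hedged usage sketch; the HDT path is a placeholder. search_triples forwards
# pyHDT's (iterator, cardinality) pair, so the result is unpacked.
connector = HDTFileConnector("datasets/example.hdt")
iterator, cardinality = connector.search_triples(None, "?p", "?o", limit=10)
for s, p, o in iterator:
    print(s, p, o)
print(connector.nb_triples)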
def __init__(self, wiki_filename: str, file_format: str = "hdt",
             lang: str = "@en", **kwargs) -> None:
    """
    Args:
        wiki_filename: file with Wikidata
        file_format: format of Wikidata file
        lang: Russian or English language
        **kwargs:
    """
    self.description_rel = "http://schema.org/description"
    self.file_format = file_format
    self.wiki_filename = str(expand_path(wiki_filename))
    if self.file_format == "hdt":
        self.document = HDTDocument(self.wiki_filename)
    elif self.file_format == "pickle":
        self.document = load_pickle(self.wiki_filename)
    else:
        raise ValueError("Unsupported file format")
    self.lang = lang
def extract_dbpedia(superclass):
    """ Get edgelist for superclass and all its subclasses """
    edgelist = []
    instances = set()
    doc = HDTDocument(run.config["kg_source"])
    subject_limit = run.config["subject_limit"]
    predicate_limit = run.config["predicate_limit"]
    subclasses = query_subclasses(superclass)

    print("[Info] query instances for each subclass")
    for subclass in tqdm(subclasses):
        if subject_limit > 0:
            (triples, count) = doc.search_triples("", rdf + "type", subclass, limit=subject_limit)
        else:
            (triples, count) = doc.search_triples("", rdf + "type", subclass)
        for triple in triples:
            instances.add(triple[0])

    print("[Info] query predicates for each instance")
    for subject in tqdm(instances):
        if predicate_limit > 0:
            triples = doc.search_triples(subject, "", "", limit=predicate_limit)[0]
        else:
            (triples, count) = doc.search_triples(subject, "", "")
        for triple in triples:
            # Either blacklist
            if not triple[1] in blacklist:
                edgelist.append((triple[0], triple[1]))
            # Or whitelist
            # if triple[1] in whitelist:
            #     edgelist.append((triple[0], triple[1]))
    return list(set(edgelist))  # Exclude duplicate entity-property relations
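# Hedged usage sketch; run.config (a sacred-style experiment config), rdf,
# blacklist and query_subclasses are module-level names assumed from the
# surrounding project, and the superclass URI is a placeholder.
edges = extract_dbpedia("http://dbpedia.org/ontology/Athlete")
print(len(edges), "unique (entity, property) pairs")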
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    # load using rdflib
    if format == 'ttl':
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'nt':
        print('Counting triples using the wc command...')
        total = wccount(file_path)
        print('The file contains {} triples.'.format(total))
        f = open(file_path, 'r')
        iter = yield_triples(f)
        # NB: this branch also returns the open file handle so the caller can close it
        return iter, total, f
    elif format == 'hdt':
        # load HDTDocument without additional indexes
        # (not needed since we only search by "?s ?p ?o")
        doc = HDTDocument(file_path, map=True, indexed=False)
        iterator, nb_triples = doc.search_triples_bytes("", "", "")
    return iterator, nb_triples
def __init__(self, config, hdt_file='wikidata2018_09_11.hdt', topk_entities=10, bottleneck_dim=32):
    super(MessagePassingHDTBert, self).__init__(config)

    # entity matching Transformer
    self.bert = DistilBertModel(config)
    self.dropout = nn.Dropout(config.dropout)
    self.pre_classifier = nn.Linear(config.hidden_size, bottleneck_dim)
    self.classifier = nn.Linear(bottleneck_dim, self.config.num_labels)

    # initialise connection to the Wikidata KG through the HDT API
    kg = HDTDocument(hdt_path + hdt_file)
    # sampling layer with subgraph retrieval
    self.subgraph_sampling = SamplingLayer(kg, topk_entities)
    # predicted scores are propagated via the MP layer into the entity subset
    # distribution defined by the subgraph
    self.mp = MPLayer()

    self.init_weights()
# hdt_iterators_test.py
# Author: Thomas MINIER - MIT License 2017-2019
import pytest
from hdt import HDTDocument

path = "tests/test.hdt"
document = HDTDocument(path)
nbTotalTriples = 132


def test_read_document_base():
    (triples, cardinality) = document.search_triples("", "", "")
    assert triples.subject == "?s"
    assert triples.predicate == "?p"
    assert triples.object == "?o"
    assert cardinality == nbTotalTriples
    for subj, pred, obj in triples:
        assert subj is not None
        assert pred is not None
        assert obj is not None
    assert triples.nb_reads == cardinality


def test_read_document_base_bytes():
    (triples, cardinality) = document.search_triples_bytes("", "", "")
    assert triples.subject == "?s"
    assert triples.predicate == "?p"
    assert triples.object == "?o"
    assert cardinality == nbTotalTriples
    for subj, pred, obj in triples:
        assert isinstance(subj, bytes)
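# Hedged companion sketch (not part of the original test file): search_triples
# also accepts limit/offset, as used elsewhere in this corpus; only the number
# of triples actually read is asserted here.
def test_read_document_limit_sketch():
    (triples, cardinality) = document.search_triples("", "", "", limit=10)
    assert len(list(triples)) == 10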
# coding: utf-8
import convex as cx
import requests
import time
import spacy

nlp = spacy.load("en_core_web_lg")


def get_nlp(sentence):
    return nlp(sentence)


from hdt import HDTDocument
hdt_wd = HDTDocument("data/kb/wikidata2018_09_11.hdt")

#questions = [
#    "Which actor voiced the Unicorn in The Last Unicorn?",
#    "And Alan Arkin was behind...?",
#    "And Alan Arkin be behind...? Why How when which was happy make fun",
#    "Who is the composer of the soundtrack?",
#    "So who performed the songs?",
#    "Genre of this band's music?",
#    "By the way, who was the director?"
#    ]
#
#q_test = str("Which actor voiced the Unicorn in The Last Unicorn? "+
#    "And Alan Arkin was behind...? "+
#    "And Alan Arkin be behind...? Why How when which was happy make fun. "+
#    "Who is the composer of the soundtrack? "+
#!/usr/bin/env python3
# coding: utf-8
import csv
import urllib
from hdt import HDTDocument
import pandas as pd

from constants import SEP
from settings import (HDT_FILE, DATASET_FILE, OUTPUT_DATASET_FILE, STATS_FILE,
                      PREDICATES_EXCLUDED, QUERY, RATIO)
from functions import get_sujeto_atr, get_predicado_atr, get_objeto_atr

# HDTDocument creation
document = HDTDocument(HDT_FILE)
# Query the triples by subject/predicate/object
(triples, cardinality) = document.search_triples("", "", QUERY)

def query(query):
    print("{}: {} objects.".format(query, cardinality))

#%% Processing
# triple = s p o
lista_objetos = []
for triple in triples:
    s, p, o = triple
    sujeto_descripcion, sujeto_URI = get_sujeto_atr(s)
    lista_objetos.append(sujeto_URI[1:-1])
# (fragment) tail of a routine that builds one sparse adjacency matrix per predicate
        _row = np.hstack([row, col])
        col = np.hstack([col, row])
        row = _row
        n_edges *= 2

        # create adjacency matrix for this predicate
        data = np.ones(n_edges)
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)

    return np.asarray(sp_adjacencies)


from sklearn.preprocessing import normalize, binarize

kg = HDTDocument(hdt_path + hdt_file)


def hop(entities, constraints, top_predicates, verbose=False, max_triples=500000):
    '''
    Extract the subgraph for the selected entities
    '''
    # print(top_predicates)
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1