def fixit(current_object, prefix_dictionary):
    """
    Read the def data structure and replace all string URIs with URIRef entities

    :param current_object: the piece of the data structure to be fixed
    :param prefix_dictionary: mapping from CURIE prefix (including the colon) to namespace URI
    :return current_object: the piece repaired in place
    """
    from rdflib import URIRef
    if isinstance(current_object, dict):
        for k in current_object.keys():
            current_object[k] = fixit(current_object[k], prefix_dictionary)
    elif isinstance(current_object, list):
        for i in range(0, len(current_object)):
            current_object[i] = fixit(current_object[i], prefix_dictionary)
    elif isinstance(current_object, basestring):
        if current_object.startswith("http://"):
            current_object = URIRef(current_object)
        elif current_object.startswith("xsd:"):
            current_object = cast_to_rdflib(current_object)
        elif ':' in current_object:
            k = current_object.find(':')
            tag = str(current_object[0:k + 1])
            if tag in prefix_dictionary:
                current_object = URIRef(
                    str(current_object).replace(tag, prefix_dictionary[tag]))
    return current_object

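# Minimal usage sketch for fixit (Python 2, where basestring is defined); the
# prefix keys are assumed to include the trailing colon, matching the `tag`
# lookup above:
# prefixes = {"foaf:": "http://xmlns.com/foaf/0.1/"}
# data = {"type": "foaf:Person", "page": "http://example.org/alice"}
# fixit(data, prefixes)
# -> {"type": URIRef("http://xmlns.com/foaf/0.1/Person"),
#     "page": URIRef("http://example.org/alice")}
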
def relativize(self, uri):
    base = URIRef(self.base)
    basedir = URIRef(self.base if base.endswith('/') else base.rsplit('/', 1)[0])
    if base is not None:
        if uri == base:
            uri = URIRef('')
        elif uri == basedir:
            uri = URIRef('.')
        elif uri.startswith(basedir + '/'):
            uri = URIRef(uri.replace(basedir + '/', "", 1))
    return uri

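# Behaviour sketch, assuming self.base == "http://example.org/dir/doc"
# (so basedir becomes "http://example.org/dir"):
# relativize(URIRef("http://example.org/dir/doc"))   -> URIRef("")
# relativize(URIRef("http://example.org/dir"))       -> URIRef(".")
# relativize(URIRef("http://example.org/dir/other")) -> URIRef("other")
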
def make_curie(uri: URIRef) -> str:
    HTTP = 'http'
    HTTPS = 'https'
    curie = contract(uri)
    if curie is not None:
        return curie
    # retry after swapping the scheme (https is tested first, since every
    # https URI also startswith "http")
    if uri.startswith(HTTPS):
        uri = HTTP + uri[len(HTTPS):]
    elif uri.startswith(HTTP):
        uri = HTTPS + uri[len(HTTP):]
    curie = contract(uri)
    if curie is None:
        return uri
    else:
        return curie

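# The contract() helper make_curie relies on is not shown; a minimal stand-in
# (hypothetical, a hand-rolled prefix table rather than the original's registry)
# could look like:
PREFIXES = {"http://xmlns.com/foaf/0.1/": "foaf"}

def contract(uri: str):
    # return "prefix:localname" when the URI falls under a known namespace,
    # None otherwise
    for ns, prefix in PREFIXES.items():
        if uri.startswith(ns):
            return "{}:{}".format(prefix, uri[len(ns):])
    return None
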
def convert(self, name, qname, attrs):
    if name[0] is None:
        name = URIRef(name[1])
    else:
        name = URIRef("".join(name))
    atts = {}
    for (n, v) in attrs.items():  # attrs._attrs.iteritems():
        if n[0] is None:
            att = URIRef(n[1])
        else:
            att = URIRef("".join(n))
        if att.startswith(XMLNS) or att[0:3].lower() == "xml":
            pass
        elif att in UNQUALIFIED:
            # if not RDFNS[att] in atts:
            atts[RDFNS[att]] = v
        else:
            atts[URIRef(att)] = v
    return name, atts

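# Context sketch (assumption): convert() reads like a namespace-aware SAX
# handler callback, where `name` and each attribute key arrive as
# (namespaceURI, localname) tuples, and XMLNS, RDFNS, UNQUALIFIED are
# module-level constants. For example:
# convert(self,
#         ("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "Description"),
#         qname, attrs)
# -> (URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Description"), {...})
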
def fixit(current_object): """ Read the def data structure and replace all string URIs with URIRef entities :param current_object: the piece of the data structure to be fixed :return current_object: the piece repaired in place """ from rdflib import URIRef if isinstance(current_object, dict): for k in current_object.keys(): current_object[k] = fixit(current_object[k]) elif isinstance(current_object, list): for i in range(0, len(current_object)): current_object[i] = fixit(current_object[i]) elif isinstance(current_object, basestring): if current_object.startswith("http://"): current_object = URIRef(current_object) elif current_object.startswith("xsd:"): current_object = cast_to_rdflib(current_object) return current_object
def turn_into_mp(row, dataset):
    # Claim
    claim_subj = FOODHKG_INST[get_hash(row['Claim'])]
    pred = RDF['type']
    obj = URIRef('http://purl.org/mp/Claim')
    dataset.add((claim_subj, pred, obj))
    # define the claim label
    dataset.add((claim_subj, RDFS['label'], Literal(row['Claim'])))

    opinion_subj = FOODHKG_INST[get_hash(row['EFSA Opinion Reference'])]
    dataset.add((opinion_subj, FOODHKG_PROPS['advises'], claim_subj))
    dataset.add((opinion_subj, RDF['type'], FOODHKG_CLS['Opinion']))

    # micropublication (an MP consists of claim, statement, representation of fine-grained NP)
    # each EFSA opinion is an MP
    mp_subj = FOODHKG_INST[get_hash(row['EFSA Opinion Reference'] + row['Claim'])]
    dataset.add((mp_subj, RDF['type'], MP['Micropublication']))
    # each MP argues a claim
    dataset.add((mp_subj, MP['argues'], claim_subj))

    # to define fine-granular facts (triple facts) using Nanopublication (NP)
    hr_subj = FOODHKG_INST[get_hash(row['Health relationship'] + row['Phenotype'] + row['Food'])]
    dataset.add((mp_subj, MP['represents'], hr_subj))
    # np_subj = FOODHKG_INST[get_hash(row['Health relationship'] + row['EFSA Opinion Reference'])]
    # dataset.add((np_subj, RDF['type'], NP['Nanopublication']))
    # Assertions for NP
    # dataset.add((np_subj, NP['hasAssertion'], hr_subj))
    dataset.add((hr_subj, RDFS['label'], Literal(row['Health relationship'])))
    # subtype of food health effect/categorization
    hr_type = FOODHKG_INST[get_hash(row['Health relationship'])]
    dataset.add((hr_subj, RDF['type'], hr_type))
    dataset.add((hr_type, RDF['type'], FOODHKG_CLS['FoodHealthEffect']))
    dataset.add((hr_type, RDFS['label'], Literal(row['Health relationship'])))

    if str(row['Phenotype Ontology Term']) == 'nan':
        pheno_uri = FOODHKG_INST[get_hash(row['Phenotype'])]
        dataset = createPhenotypeRelTriples(dataset, hr_subj, pheno_uri, row['Phenotype'])
    else:
        for pheno_uri in row['Phenotype Ontology Term'].split(';'):
            pheno_uri = pheno_uri.strip()
            if pheno_uri == '':
                continue
            # pheno_uri = normalize(pheno_uri)
            pheno_uri = URIRef(pheno_uri)
            dataset = createPhenotypeRelTriples(dataset, hr_subj, pheno_uri, row['Phenotype'])

    if str(row['Food Ontology Term']) == 'nan':
        fooduri = FOODHKG_INST[get_hash(row['Food'])]
        dataset = createFoodRelTriples(dataset, hr_subj, fooduri, row['Food Type'], row['Food'])
    else:
        for fooduri in row['Food Ontology Term'].split(';'):
            fooduri = fooduri.strip()
            if fooduri == '':
                continue
            fooduri = URIRef(fooduri)
            dataset = createFoodRelTriples(dataset, hr_subj, fooduri, row['Food Type'], row['Food'])

    if row['Target population'] != '':
        if row['Target population ontology term'] != '':
            if row['Target population ontology term'] == 'http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C18241':
                targetPopUri = URIRef('http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C18241')
            else:
                targetPopUri = FOODHKG_INST[get_hash(row['Target population ontology term'])]
                tp_text = row['Target population ontology term'].split('\n')
                for tp in tp_text:
                    tp_tuple = tp.split(': ')
                    if len(tp_tuple) != 2:
                        continue
                    # print(tp_tuple)
                    pred = tp_tuple[0]
                    obj = tp_tuple[1].strip()
                    if not obj.startswith('http'):
                        dataset.add((targetPopUri, PICO[pred], Literal(obj)))
                    else:
                        dataset.add((targetPopUri, FOODHKG_PROPS[pred], URIRef(obj)))
            dataset.add((targetPopUri, RDFS['label'], Literal(row['Target population'])))
            dataset.add((hr_subj, FOODHKG_PROPS['hasTargetPopulation'], targetPopUri))

    for i in range(1, 9):
        if str(row[f'Supporting Evidence Text {i}']) == 'nan':
            continue
        supp_subj = FOODHKG_INST[get_hash(row[f'Supporting Evidence Text {i}'])]
        pred = MP['supports']
        # statement supports the claim
        dataset.add((supp_subj, pred, claim_subj))
        # is type of Statement
        dataset.add((supp_subj, RDF['type'], MP['Statement']))
        # label of Statement
        dataset.add((supp_subj, RDFS['label'], Literal(row[f'Supporting Evidence Text {i}'])))
        suppRef = row[f'Supporting Evidence Reference {i}']
        # if references for statement exist
        if str(suppRef) != 'nan' and len(suppRef) > 3 and suppRef.lower() != "no reference":
            refs = suppRef.split(';')
            for ref in refs:
                # print(ref)
                ref_tuple = ref.split(':')
                suppref_text = ref_tuple[0]
                suppref_doi = ref_tuple[1]
                suppref_subj = FOODHKG_INST[get_hash(suppref_text)]
                dataset = createSupportingRefTriples(dataset, supp_subj, suppref_subj, suppref_doi, ref)
    return dataset

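# Hypothetical driver sketch: `row` is assumed to be a pandas Series read from
# a spreadsheet of EFSA health claims (hence the str(...) == 'nan' checks), and
# `dataset` an rdflib graph that turn_into_mp enriches and returns:
# for _, row in claims_df.iterrows():
#     dataset = turn_into_mp(row, dataset)
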
def main(args):
    # load graph
    g = rdflib.Graph()
    g.parse(args.input, publicID=URI_TMP, format="xml")
    # Create a graph containing no triples from the header file, then add triples to it
    g2 = rdflib.Graph()
    g2.parse(args.header, publicID=URI_TMP, format="xml")
    # bifd.owl
    g3 = rdflib.Graph()
    g3.parse(args.bifd, publicID=URI_TMP, format="xml")

    convert_uris = load_dict(args.subject)
    convert_ps = load_dict(args.predicate)
    convert_ps["https://wba-initiative.org/bifd/label"] = str(RDFS.label)

    # Extract the classes to process; only those listed in s.tsv are included in the final output
    query_class = g.query(
        """SELECT ?class
           WHERE {
               ?class rdf:type owl:Class.
           }
        """)
    keep_s = set()
    for c in query_class:
        keep_s.add(c[0])

    query_references = g.query(
        """SELECT ?uri ?p ?v
           WHERE {
               ?uri rdf:type swivt:Subject.
               ?uri ?p ?v.
               filter (?p in (property:BibTex-3Ahas_doi, URI("https://wba-initiative.org/noprefix/URLhas"), rdfs:label))
               filter (strstarts(str(?uri), "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/-2A"))
           }
        """)
    references = {}
    references_val = {}
    references_s_o = {}
    for x in query_references:
        p = str(x[1])
        if x[0] not in references:
            references[x[0]] = [p]
        else:
            references[x[0]].append(p)
        references_val["{}\t{}".format(str(x[0]), str(x[1]))] = str(x[2])
    for k in references.keys():
        predicates = []
        for p in references[k]:
            predicates.append(p)
        if "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3ABibTex-3Ahas_doi" in predicates:
            o = references_val["{}\t{}".format(str(k), "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3ABibTex-3Ahas_doi")]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        if "https://wba-initiative.org/noprefix/URLhas" in predicates:
            o = references_val["{}\t{}".format(str(k), "https://wba-initiative.org/noprefix/URLhas")]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        if str(RDFS.label) in predicates:
            o = references_val["{}\t{}".format(str(k), str(RDFS.label))]
            references_s_o[str(k.split("/")[-1])] = o
            continue
        print("Error: no info for references provided.")
        exit(1)

    obo_id_dict = {}
    for s, p, o in g:
        if s not in keep_s:
            continue
        if str(s) in convert_uris.keys():
            s = URIRef(convert_uris[str(s)])
        if str(o) in convert_uris.keys():
            o = URIRef(convert_uris[str(o)])
        if str(p) in convert_ps.keys():
            p = URIRef(convert_ps[str(p)])
        if str(p) == "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3AOBO_ID":
            obo_id_dict[str(s)] = str(o)
        if str(s) in convert_uris.values() and (p == RDFS.subClassOf or p == RDFS.label or str(p).startswith(BIFD_PREFIX) or o == OWL.Class):
            g2.add((s, p, o))

    for s, p, o in g:
        if s not in keep_s:
            continue
        if str(s) in convert_uris.keys():
            s = URIRef(convert_uris[str(s)])
        if str(p) == "http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Property-3AOBO_ID":
            if str(s) in obo_id_dict.keys():
                reg = re.compile(r'^[a-zA-Z_][\w.-]*$')
                if reg.match(obo_id_dict[str(s)]):  # check if it results in a valid uri
                    if str(s) in convert_uris.values():
                        g2.add((s, OWL.sameAs, URIRef("http://purl.obolibrary.org/obo/{}".format(obo_id_dict[str(s)]))))

    query_object_property = g3.query(
        """SELECT ?op
           WHERE {
               ?op rdf:type owl:ObjectProperty.
           }""")
    object_properties = set()
    for res in query_object_property:
        p = str(res[0]).strip("/")
        object_properties.add(p)

    for s, p, o in g2:
        if str(p) == 'https://wba-initiative.org/bifd/reference':
            k = o.replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/", '')
            if k in references_s_o.keys():
                g2.add((s, p, Literal(references_s_o[k], datatype=XSD.string)))
                g2.remove((s, p, o))
        if str(p) == 'https://wba-initiative.org/bifd/taxon':
            g2.add((s, p, Literal("http://purl.obolibrary.org/obo/{}".format(obo_id_dict[str(o)]), datatype=XSD.string)))
        if str(p) in convert_ps.values() and p != RDFS.label and str(p) in object_properties:
            # Convert the property's constraint into an OWL restriction
            if str(p) == "https://wba-initiative.org/bifd/transmitter" or str(p) == "https://wba-initiative.org/bifd/modType":
                continue
            g2.remove((s, p, o))
            blank_node = BNode()
            g2.add((s, RDFS.subClassOf, blank_node))
            g2.add((blank_node, RDF.type, OWL.Restriction))
            g2.add((blank_node, OWL.onProperty, p))
            g2.add((blank_node, OWL.someValuesFrom, o))

    for s, p, o in g2:
        if o.startswith("http://183.181.89.140/mediawiki/index.php/Special:URIResolver"):
            g2.remove((s, p, o))
            # For object URIs that are not conversion targets listed in s.tsv,
            # convert by pattern-based replacement
            o = URIRef(o.replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Category-3ABIF-3A", "https://wba-initiative.org/bifd/")
                        .replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Category-3A", "http://wba-initiative.org/wbra/")
                        # Glutamate is treated as a special case
                        .replace("http://183.181.89.140/mediawiki/index.php/Special:URIResolver/Glutamate", "https://wba-initiative.org/bifd/Glutamate"))
            g2.add((s, p, o))

    g2.serialize(args.output, publicID=URI_TMP, format="pretty-xml")

def _is_datatype(uri: rdflib.URIRef):
    if isinstance(uri, rdflib.BNode):
        return False
    return uri.startswith(str(XSD)) or uri.startswith(str(RDF))

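# Quick sanity checks, assuming XSD and RDF are rdflib's standard namespaces:
# _is_datatype(rdflib.XSD.integer)                    -> True
# _is_datatype(rdflib.URIRef("http://example.org/x")) -> False
# _is_datatype(rdflib.BNode())                        -> False
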
def fileForUri(dirUriMap: DirUriMap, ctx: URIRef) -> bytes:
    assert isinstance(ctx, URIRef), ctx
    for d, prefix in dirUriMap.items():
        if ctx.startswith(prefix):
            return d + ctx[len(prefix):].encode('utf8') + b'.n3'
    raise ValueError("don't know what filename to use for %s" % ctx)

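# Usage sketch, assuming DirUriMap maps bytes directory paths to URIRef prefixes:
# fileForUri({b'/data/graphs/': URIRef('http://example.org/g/')},
#            URIRef('http://example.org/g/room1'))
# -> b'/data/graphs/room1.n3'
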
def retrieve_graph_from_dbpedia(term):
    assert ONLINE_ENABLED
    logger.info('online access - DBpedia: {term}'.format(term=unicode(term)))
    term_utf = term.encode('utf-8')
    term_url = quote_plus(term_utf, safe=str("/:#,()'"))
    #print '---'
    #print 'term_url', term_url
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    #query = """
    #    SELECT ?p ?o
    #    WHERE {{ <{term_url}> ?p ?o }}
    #""".format(term_url=term_url)
    query = """
        SELECT ?p ?o
        WHERE {{
            <{term_url}> ?p ?o
            FILTER( STRSTARTS(STR(?p), "{foaf}")
                 || STRSTARTS(STR(?p), "{rdf}")
                 || STRSTARTS(STR(?p), "{rdfs}")
                 || STRSTARTS(STR(?p), "{dcterms}")
                 || STRSTARTS(STR(?p), "{ontology}"))
            FILTER (isURI(?o) || langMatches(lang(?o), "EN"))
        }}
    """.format(term_url=term_url,
               foaf=unicode(FOAF),
               rdf=unicode(RDF),
               rdfs=unicode(RDFS),
               dcterms=unicode(DCTERMS),
               ontology=unicode(ONTOLOGY))
    sparql.setQuery(query.encode('utf-8'))
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query()
        # workaround for "Invalid \escape" error which can be raised by convert()
        body = results.response.read()
        results = cjson.decode(body)
    except HTTPError as exc:
        # can occur if DBpedia is under maintenance (quite often)
        logger.error('Getting graph for {term} failed; {message}; {excType}'
                     .format(term=term, message=exc.message, excType=unicode(type(exc))))
        return None

    # create graph and bind relevant namespaces
    graph = Graph()
    for prefix, namespace in NAMESPACES_DICT.items():
        graph.bind(prefix, namespace)

    LITERAL_MAX_LENGTH = 600
    for result in results["results"]["bindings"]:
        try:
            p = URIRef(result['p']['value'])
            # filter wikiPageRevisionID, wikiPageExternalLink etc.
            if p.startswith(ONTOLOGY['wiki']):
                continue
            if result['o']['type'] == 'uri':
                o = URIRef(result['o']['value'])
            else:
                o = Literal(result['o']['value'])
            # if the object is too long (e.g. an abstract), ignore it
            if len(o) > LITERAL_MAX_LENGTH:
                continue
            graph.add((term, p, o))
            #print type(p), p
            #print type(o), o
            #print '*'
        except KeyError:
            continue
    # check if the graph is not empty
    if not graph:
        logger.warning('Retrieved empty graph for ' + unicode(term))
    return graph

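# Hypothetical call (Python 2 era, matching the unicode/cjson usage above;
# SPARQLWrapper, rdflib, and the module's logger/namespaces are assumed in scope):
# g = retrieve_graph_from_dbpedia(URIRef(u'http://dbpedia.org/resource/Prague'))
# if g is not None:
#     print g.serialize(format='turtle')
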
dict_s = {}
dict_p = {}
dict_t = {}
print("loading dictionaries")
f = open(args.input, "rb")
for line in f:
    if line:
        m = pattern.match(line)
        try:
            s, p, o = m.group(1), m.group(2), m.group(3)
            s, p, o = URIRef(s), URIRef(p), URIRef(o)
            if not args.nocat or p != dctSubject:
                if s.startswith(DBR):
                    if s not in dict_s:
                        dict_s[s] = len(dict_s)
                if o.startswith(DBR):
                    if o not in dict_s:
                        dict_s[o] = len(dict_s)
                if (p == RDF.type or p == "a") and not filter_entity(s) and not filter_entity(o):
                    if s not in dict_s:
                        dict_s[s] = len(dict_s)
                    if o not in dict_t:
                        dict_t[o] = len(dict_t)
                elif p == RDFS.subClassOf:
                    if s not in dict_t:
                        dict_t[s] = len(dict_t)
        except AttributeError:
            # assumed handler: skip lines where pattern.match() returned None
            continue

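# `pattern` is assumed to match one N-Triples-style line with three URI terms,
# e.g. (hypothetical): re.compile(rb'<([^>]+)> <([^>]+)> <([^>]+)> \.')
# Since the file is opened in "rb" mode, the lines are bytes, so the regex must
# be a bytes pattern (or each line decoded) for match() to succeed.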