def write_mappings():
    # Include all pairs:
    # ll = pair_association(unique_pairs, loglike)

    # Only pairs with frequency >= 5:
    unique_pairs_5 = np.array([a.split('!') for a, b in pc.items() if b >= 5])
    ll = pair_association(unique_pairs_5, loglike)

    print('Write mappings')
    lls = sorted(ll.items(), key=lambda x: x[1])
    ww = np.array(list(ll.values()))
    mn = ww.mean()
    print('- Mean LL is {:.2f}'.format(mn))
    print('- {:.2f} % is >= mean LL'.format(
        100.0 * ww[ww >= mn].shape[0] / ww.shape[0]))
    print('- {:.2f} % is < mean LL'.format(
        100.0 * ww[ww < mn].shape[0] / ww.shape[0]))

    # Whether to look up DDC labels and add them to the mapping sheet
    addDdcLabels = False

    if addDdcLabels:
        # Load WebDewey data
        g = Graph()
        for x in glob('../../webdewey/DDK23/*.ttl'):
            print(x)
            g.load(x, format='turtle')

    fsj = re.compile(r'.*\(Form\)')
    with open('mappings.csv', 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        for x in lls[::-1]:
            if x[1] < mn:
                break
            q = x[0].split('!', 1)
            if fsj.match(q[0]):
                # Omit form subdivisions
                continue
            if addDdcLabels:
                lab = g.preferredLabel(
                    URIRef('http://dewey.info/class/' + q[1] + '/e23/'),
                    labelProperties=[SKOS.prefLabel, SKOS.altLabel])
                if len(lab) != 0:
                    lab = lab[0][1].value
                else:
                    lab = '(no label)'
                # Term, Dewey, Dewey caption, loglike
                writer.writerow([q[0], q[1], lab, x[1]])
            else:
                # Term, Dewey, loglike
                writer.writerow([q[0], q[1], x[1]])
def load_mappings(self, filename, graph=None):
    tmp = Graph()
    if graph is None:
        graph = Graph()
    tmp.load(filename, format=self.extFromFilename(filename))
    skosify.infer.skos_symmetric_mappings(tmp)
    for tr in tmp.triples_choices((None,
                                   [SKOS.exactMatch,
                                    SKOS.closeMatch,
                                    SKOS.broadMatch,
                                    SKOS.narrowMatch,
                                    SKOS.relatedMatch],
                                   None)):
        # if tr[0] in all_concepts:
        graph.add(tr)
    return graph
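# A minimal sketch of what skosify.infer.skos_symmetric_mappings does for
# load_mappings above (assumed behavior, inferred from how it is used here):
# it asserts symmetric mapping properties such as skos:exactMatch in both
# directions. The example URIs are made up.
from rdflib import Graph, URIRef
from rdflib.namespace import SKOS
import skosify

demo = Graph()
a = URIRef('http://example.org/onto/a')
b = URIRef('http://example.org/onto/b')
demo.add((a, SKOS.exactMatch, b))
skosify.infer.skos_symmetric_mappings(demo)
assert (b, SKOS.exactMatch, a) in demo  # the inverse triple was added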
def init_model(root_directory, model_file):
    """
    Initialize models:

    1. Check if the core model exists (in CWD); create it if necessary.
    2. Check core model version compatibility.
    3. Generate the node model file based on the content of root_directory.
    """
    if os.path.exists(DEFAULT_CORE_MODEL):
        SEED_LOG.info('load core model')
        core_model = Graph()
        core_model.load(DEFAULT_CORE_MODEL)
        version = core_model.value(URIRef(SEED_BASE), OWL.versionInfo)
        SEED_LOG.info('core model version: [%s]' % version)
        if version != VERSION:
            SEED_LOG.error(
                'incompatible with program version [%s], need to regenerate.'
                % VERSION)
            gen_core()
        else:
            SEED_LOG.info('version compatible')
    else:
        SEED_LOG.error('core model does not exist, need to generate.')
        gen_core()

    # Generate the node model by importing the specified root directory
    root_directory = os.path.abspath(root_directory)
    if not os.path.exists(root_directory):
        SEED_LOG.error('directory does not exist')
        return

    SEED_LOG.info('reading object list ...')
    object_list = read_tree(root_directory)

    SEED_LOG.info('creating node model ...')
    write_model(object_list, model_file)
    SEED_LOG.info('%d object individuals created in %s.'
                  % (len(object_list), model_file))
def stats(task):
    t0 = time.time()
    g = Graph()
    g.load('dist/realfagstermer.complete.ttl', format='turtle')
    s = json.load(open('realfagstermer.github.io/_data/stats.json', 'r'))

    current = stats_from_graph(g)
    current['ts'] = int(time.time())
    s.append(current)

    json.dump(current,
              open('realfagstermer.github.io/_data/stats_current.json', 'w'),
              indent=2, sort_keys=True)
    json.dump(s,
              open('realfagstermer.github.io/_data/stats.json', 'w'),
              indent=2, sort_keys=True)

    dt = time.time() - t0
    logger.info('Generated stats in %.1f seconds', dt)
def enrich_and_concat(files, out_file):
    graph = Graph()
    for sourcefile in files:
        graph.load(sourcefile, format="turtle")

    skosify = Skosify()

    # Enrichments: broader <-> narrower, related <-> related
    logger.debug("Skosify: Enriching relations")
    skosify.enrich_relations(graph, False, True, True)

    # Write to a temp file first, then rename, so readers never see a
    # half-written file.
    with open(out_file + ".tmp", "wb") as handle:
        graph.serialize(handle, format="turtle")

    os.rename(out_file + ".tmp", out_file)

    return len(graph)
def load_mappings_from_file(filenames, uri_filter='http'):
    g = Graph()
    g.namespace_manager.bind('skos', SKOS)
    g2 = Graph()
    for filename in filenames:
        g2.load(filename, format='turtle')

    skosify.infer.skos_symmetric_mappings(g2, related=False)
    skosify.infer.skos_hierarchical_mappings(g2, narrower=True)

    for tr in g2:
        if tr[1] in [SKOS.exactMatch, SKOS.closeMatch, SKOS.relatedMatch,
                     SKOS.broadMatch, SKOS.narrowMatch]:
            if tr[0].startswith(uri_filter):
                g.add(tr)

    return g
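# Hypothetical usage of load_mappings_from_file: merge the mapping triples
# from two Turtle files, keeping only mappings whose subject falls under a
# given URI prefix. The file names and prefix are examples, not taken from
# the source.
mappings = load_mappings_from_file(
    ['mappings1.ttl', 'mappings2.ttl'],
    uri_filter='http://data.ub.uio.no/realfagstermer/')
print('%d mapping triples loaded' % len(mappings))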
def eye(graphs, eye_path="eye", include_proof=False):
    """
    Process a set of graphs with EYE, and return the inferred triples.
    """
    pass_opt = '--pass-only-new' if not include_proof else '--pass-all'
    out_parser = 'turtle' if not include_proof else 'n3'
    eyep = Popen([eye_path, '-', pass_opt], stdin=PIPE, stdout=PIPE, stderr=PIPE)
    for graph in graphs:
        graph.serialize(eyep.stdin, format='n3')
    eyep.stdin.close()
    inferred = Graph()
    inferred.load(eyep.stdout, format=out_parser)
    log = eyep.stderr.read()
    eyep.wait()
    return inferred, log
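# A minimal sketch of calling eye() above. It assumes the EYE reasoner
# binary is installed and on PATH; the input file names are made up.
from rdflib import Graph

data = Graph()
data.parse('data.n3', format='n3')
rules = Graph()
rules.parse('rules.n3', format='n3')

inferred, log = eye([data, rules])
print(inferred.serialize(format='turtle'))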
def prepare(self):
    logger.info('Building RDF graph')
    graph = Graph()

    for inc in self.include:
        lg0 = len(graph)
        graph.load(inc, format=self.extFromFilename(inc))
        logger.info(' - Included {} triples from {}'.format(len(graph) - lg0, inc))

    try:
        scheme_uri = next(graph.triples((None, RDF.type, SKOS.ConceptScheme)))
    except StopIteration:
        raise Exception('Concept scheme URI could not be found in vocabulary scheme data')
    scheme_uri = scheme_uri[0]

    now = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ')
    graph.set((URIRef(scheme_uri), DCTERMS.modified, Literal(now, datatype=XSD.dateTime)))

    lg0 = len(graph)
    for resource in self.vocabulary.resources:
        self.convert_resource(graph, resource, self.vocabulary.resources,
                              scheme_uri, self.vocabulary.default_language.alpha2)
    logger.info(' - Added {} triples'.format(len(graph) - lg0))

    all_concepts = set([tr[0] for tr in graph.triples((None, RDF.type, SKOS.Concept))])
    for inc in self.mappings_from:
        lg0 = len(graph)
        mappings = self.load_mappings(inc)
        for tr in mappings.triples((None, None, None)):
            if tr[0] in all_concepts:
                graph.add(tr)
        logger.info(' - Added {} mappings from {}'.format(len(graph) - lg0, inc))

    logger.info('Skosify...')
    self.skosify_process(graph)

    return {'graph': graph}
def enrich_and_concat(files, out_file):
    graph = Graph()
    for sourcefile in files:
        if sourcefile.endswith('.nt'):
            graph.load(sourcefile, format='nt')
        elif sourcefile.endswith('.ttl'):
            graph.load(sourcefile, format='turtle')
        else:
            graph.load(sourcefile)

    logger.debug("Skosify: Enriching relations")
    skosify.infer.skos_hierarchical(graph, True)
    skosify.infer.skos_related(graph)

    with open(out_file + '.tmp', 'wb+') as handle:
        graph.serialize(handle, format='turtle')

    os.rename(out_file + '.tmp', out_file)

    return len(graph)
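# Example call for this version of enrich_and_concat (file names are
# illustrative): merge a Turtle file and an N-Triples file, infer
# hierarchical and related links, and write the result atomically via
# a .tmp file.
n_triples = enrich_and_concat(['vocab.ttl', 'extra.nt'], 'complete.ttl')
logger.info('Wrote %d triples', n_triples)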
if not args:
    print("USAGE: %s FILE [rdf...]" % p.basename(cmd))
    print("Where FILE is a local copy of <https://lagen.nu/1976:725>. Get it by doing e.g.:")
    print("  $ /usr/bin/curl -sk 'https://lagen.nu/1976:725' > /tmp/sfs-1976_725.xhtml")
    print()
    print("If additional local RDF files are supplied, a diff of the "
          "extracted data and the supplied data is output (instead of "
          "just the extracted data).")
    exit()

docpath = args[0]
graph = fsdoc_to_graph(docpath)

from rdfextras.tools.pathutils import guess_format

cmp_graph = Graph()
for fpath in args[1:]:
    cmp_graph.load(fpath, format=guess_format(fpath))

if cmp_graph:
    from rdflib.compare import graph_diff
    in_both, in_first, in_second = graph_diff(graph, cmp_graph)
    print("# %s new statements:" % len(in_first))
    for pfx, uri in graph.namespaces():
        in_first.bind(pfx, uri)
    print(in_first.serialize(format='n3'))
else:
    print("# Nothing to compare against. New RDF is:")
    print(graph.serialize(format='n3'))
def ttl2solr(infile, outfile, vocab_name=None):
    logger.info('ttl2solr: Loading %s', infile)
    g = Graph()
    g.load(infile, format='turtle')

    # Build parent lookup hash
    parents = {}
    for c, p in g.subject_objects(SKOS.broader):
        c = text_type(c)  # to string
        p = text_type(p)  # to string
        if c not in parents:
            parents[c] = set()
        parents[c].add(p)

    # Build labels lookup hash using two fast passes
    labels = {}
    for c, p in g.subject_objects(SKOS.altLabel):
        labels[text_type(c)] = text_type(p)
    for c, p in g.subject_objects(SKOS.prefLabel):
        labels[text_type(c)] = text_type(p)  # overwrite altLabel with prefLabel if found

    # Build documents
    docs = []
    unknown_preds = set()
    for uriref in g.subjects(RDF.type, SKOS.Concept):
        doc = {'id': text_type(uriref)}
        if vocab_name is not None:
            doc['vocabulary'] = vocab_name

        for pred, obj in g.predicate_objects(uriref):
            if pred not in schema:
                if pred not in unknown_preds:
                    logger.warning('Encountered unknown predicate with no mapping to JSON: %s', pred)
                    unknown_preds.add(pred)
                continue
            if pred == SKOS.inScheme and schema[pred] in vocabs:
                doc['vocab'] = vocabs[schema[pred]]
                continue
            if schema[pred] is None:
                continue
            if schema[pred] not in doc:
                doc[schema[pred]] = []
            doc[schema[pred]].append(text_type(obj))

        # Add labels from broader concepts
        bcs = []
        for bc in get_breadcrumbs([[text_type(uriref)]], parents):
            bc = [labels.get(x) for x in reversed(bc[1:])]
            bcs.append('/'.join([x for x in bc if x is not None]))
        doc['paths'] = bcs

        byLevel = [[text_type(uriref)]]  # Level 0
        level = 0
        while True:
            byLevel.append([])
            for x in byLevel[level]:
                byLevel[level + 1].extend(parents.get(x, set()))
            if len(byLevel[level + 1]) == 0:
                break
            level += 1

        for level, items in enumerate(byLevel[1:4]):
            # We lack labels for some top-level entries,
            # e.g. 'http://data.ub.uio.no/ddc/19'
            doc['parentsLevel{}'.format(level + 1)] = [labels[x] for x in items if x in labels]

        docs.append(doc)

    logger.info('ttl2solr: Storing %d documents in %s', len(docs), outfile)
    json.dump(docs, open(outfile, 'w'), indent=2)
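# Hypothetical invocation of ttl2solr; it relies on the module-level
# `schema` and `vocabs` dicts (and the get_breadcrumbs helper) used above
# being defined. The file names are examples only.
ttl2solr('realfagstermer.ttl', 'solr_docs.json', vocab_name='realfagstermer')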
def load(self, filename):
    """
    Note: This loader only loads categories and mappings.
    """
    graph = Graph()
    graph.load(filename, format=self.extFromFilename(filename))
    logger.info('Read %d triples from %s', len(graph), filename)
    skosify.infer.skos_symmetric_mappings(graph, related=False)

    # Load mappings
    n_mappings = 0
    n_memberships = 0
    for tr in graph.triples_choices((None,
                                     [SKOS.exactMatch, SKOS.closeMatch,
                                      SKOS.broadMatch, SKOS.narrowMatch,
                                      SKOS.relatedMatch],
                                     None)):
        source_concept = tr[0]
        res_id = self.vocabulary.id_from_uri(source_concept)
        if res_id is not None:
            shortName = str(tr[1]).split('#')[1]
            try:
                self.vocabulary.resources[res_id].add('mappings.%s' % shortName, str(tr[2]))
                n_mappings += 1
            except KeyError:
                logger.warning('Concept not found: %s', res_id)

    # Load categories
    for tr in graph.triples((None, RDF.type, UOC.Category)):
        cat_lab = graph.preferredLabel(tr[0], lang='nb')[0][1].value
        cat_id = str(tr[0])
        cat = Concept().set_type('Category')
        cat.set('id', cat_id)
        cat.set('prefLabel.nb', Label(cat_lab))
        self.vocabulary.resources.load([cat])
        for tr2 in graph.triples((tr[0], SKOS.member, None)):
            uri = str(tr2[2])
            res_id = self.vocabulary.id_from_uri(uri)
            if res_id is not None:
                try:
                    self.vocabulary.resources[res_id].add('memberOf', cat_id)
                    n_memberships += 1
                except KeyError:
                    logger.warning('Concept not found: %s', res_id)

    # Load number of ccmapper mapping candidates
    for tr in graph.triples((None, LOCAL.ccmapperCandidates, None)):
        res_id = self.vocabulary.id_from_uri(tr[0])
        if res_id is not None:
            try:
                self.vocabulary.resources[res_id].set('ccmapperCandidates', int(tr[2]))
            except KeyError:
                logger.warning('Concept not found: %s', res_id)

    # Load ccmapper mapping state
    for tr in graph.triples((None, LOCAL.ccmapperState, None)):
        res_id = self.vocabulary.id_from_uri(tr[0])
        if res_id is not None:
            try:
                self.vocabulary.resources[res_id].set('ccmapperState', tr[2])
            except KeyError:
                logger.warning('Concept not found: %s', res_id)

    logger.info('Loaded %d mappings and %d category memberships from %s',
                n_mappings, n_memberships, filename)
def convert(infile, outfile):
    logger.debug('Loading %s', infile)
    g = Graph()
    g.load(infile, format='turtle')

    # Build parent lookup hash
    logger.debug('Building parent lookup hash')
    parents = {}
    for c, p in g.subject_objects(SKOS.broader):
        c = text_type(c)  # to string
        p = text_type(p)  # to string
        if c not in parents:
            parents[c] = set()
        parents[c].add(p)

    # Build labels lookup hash using two fast passes
    logger.debug('Building labels lookup hash')
    labels = {}
    for c, p in g.subject_objects(SKOS.altLabel):
        labels[text_type(c)] = text_type(p)
    for c, p in g.subject_objects(SKOS.prefLabel):
        labels[text_type(c)] = text_type(p)  # overwrite altLabel with prefLabel if found

    logger.debug('Building documents')
    docs = []
    for uriref in g.subjects(RDF.type, SKOS.Concept):
        doc = {'id': text_type(uriref)}
        for pred, obj in g.predicate_objects(uriref):
            if pred not in schema:
                logger.error('Encountered unknown predicate with no mapping to JSON: %s', pred)
                continue
            if pred == SKOS.inScheme and schema[pred] in vocabs:
                doc['vocab'] = vocabs[schema[pred]]
                continue
            if schema[pred] is None:
                continue
            if schema[pred] not in doc:
                doc[schema[pred]] = []
            doc[schema[pred]].append(text_type(obj))

        # Add labels from broader concepts
        byLevel = [[text_type(uriref)]]  # Level 0
        level = 0
        while True:
            byLevel.append([])
            for x in byLevel[level]:
                byLevel[level + 1].extend(parents.get(x, set()))
            if len(byLevel[level + 1]) == 0:
                break
            level += 1

        for level, items in enumerate(byLevel[1:-1]):
            # We lack labels for some top-level entries,
            # e.g. 'http://data.ub.uio.no/ddc/19'
            doc['parentsLevel{}'.format(level)] = [labels[x] for x in items if x in labels]

        docs.append(doc)

    logger.debug('Generated %d documents', len(docs))
    logger.debug('Saving %s', outfile)
    json.dump(docs, open(outfile, 'w'), indent=2)
RESULT = Namespace("http://www.w3.org/2002/03owlt/resultsOntology#")
FOAF = Namespace("http://xmlns.com/foaf/0.1/")

results = Graph()

system = BNode("system")
results.add((system, FOAF["homepage"], URIRef("http://rdflib.net/")))
results.add((system, RDFS.label, Literal("RDFLib")))
results.add((system, RDFS.comment, Literal("")))

if __name__ == "__main__":
    manifest = Graph()
    manifest.load(cached_file(
        "http://www.w3.org/2000/10/rdf-tests/rdfcore/Manifest.rdf"))

    import sys
    import getopt
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'h:', ["help"])
    except getopt.GetoptError as msg:
        write(msg)
        # usage()

    try:
        argv = sys.argv
        for arg in sys.argv[1:]:
            verbose = 1
            case = URIRef(arg)
            write(u"Testing: %s" % case)
            if (case, RDF.type, TEST["PositiveParserTest"]) in manifest:
from __future__ import print_function

# Third-party
from rdflib import Literal, Namespace
from rdflib.graph import Graph

NS_FOAF = Namespace("http://xmlns.com/foaf/0.1/")
NS_EXIF = Namespace("http://www.w3.org/2003/12/exif/ns#")

index = Graph()
index.bind("cc", "http://creativecommons.org/ns#")
index.bind("dc", "http://purl.org/dc/elements/1.1/")
index.bind("dcq", "http://purl.org/dc/terms/")
index.bind("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
index.bind("foaf", "http://xmlns.com/foaf/0.1/")
index.load('./rdf/index.rdf')

output = Graph()
output.bind("foaf", "http://xmlns.com/foaf/0.1/")
output.bind("exif", "http://www.w3.org/2003/12/exif/ns#")

# Derive image dimensions from file names like '.../88x31.png'
for img in index.objects(None, NS_FOAF.logo):
    print(img)
    width, height = img[:-len('.png')].split('/')[-1].split('x')
    output.add((img, NS_EXIF.width, Literal(width)))
    output.add((img, NS_EXIF.height, Literal(height)))

with open('./rdf/images.rdf', 'wb') as handle:
    handle.write(output.serialize(format="pretty-xml", max_depth=2))
def testModel(self):
    g = Graph()
    g.load("http://www.w3.org/2000/10/rdf-tests/rdfcore/"
           "rdfms-empty-property-elements/test002.nt", format="nt")
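# Note: these snippets use Graph.load(), which older rdflib versions
# provided as a thin wrapper around Graph.parse(); newer rdflib releases
# drop it in favour of Graph.parse(). A minimal equivalent of the test
# above with the current API:
from rdflib import Graph

g = Graph()
g.parse("http://www.w3.org/2000/10/rdf-tests/rdfcore/"
        "rdfms-empty-property-elements/test002.nt", format="nt")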
import codecs

MADS = Namespace('http://www.loc.gov/mads/rdf/v1#')

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('[%(asctime)s %(levelname)s] %(message)s')

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.INFO)
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

logging.info('Loading removed.nt')
removed = Graph()
removed.load('realfagstermer/removed.nt', format='nt')

logging.info('Loading added.nt')
added = Graph()
added.load('realfagstermer/added.nt', format='nt')

logging.info('Loading realfagstermer.new.nt as <current>')
current = Graph()
current.load('realfagstermer/realfagstermer.new.nt', format='nt')
current.namespace_manager.bind('skos', SKOS)
current.namespace_manager.bind('mads', MADS)
current.namespace_manager.bind('dct', DCTERMS)

logging.info('Computing')