def identify_rdf_parser(self):
    """Return a redland parser matching ``self.rdf_format``.

    Returns:
        An RDF parser instance, or None for formats that are handled by a
        dedicated code path ('sparql', 'sitemap').

    Raises:
        NameError: for 'n3' input and for any unrecognized format.
    """
    fmt = self.rdf_format  # renamed from `format` to avoid shadowing the builtin
    if fmt == 'ttl':
        #parser = RDF.TurtleParser()
        # Turtle itself is unsupported; warn loudly and fall back to the
        # ntriples parser (input should be converted beforehand).
        logger.error(
            "Turtle is not supported by LODStats, should be converted to ntriples!"
        )
        parser = RDF.NTriplesParser()
    elif fmt == 'n3':
        # BUG FIX: removed the dead `parser = None` assignment that preceded
        # this raise in the original.
        raise NameError(
            "n3 serialization is not supported, please convert to nt")
    elif fmt == 'nt':
        parser = RDF.NTriplesParser()
    elif fmt == 'nq':
        parser = RDF.Parser(name='nquads')
    elif fmt == 'rdf':
        parser = RDF.Parser(name="rdfxml")
    elif fmt == 'sparql':
        return None
    elif fmt == 'sitemap':
        return None
    else:
        raise NameError("unsupported format")
    return parser
def parse(self, fin):
    """Stream-parse RDF from *fin* and forward every triple to self.write()."""
    log.debug('Reading RDF from %s' % fin)
    import RDF
    rdf_parser = RDF.Parser(name=self.parser)
    for stmt in rdf_parser.parse_as_stream(fin):
        self.write(stmt.subject, stmt.predicate, stmt.object, 1)
def main(specloc="file:index.rdf"): """The meat and potatoes: Everything starts here.""" m = RDF.Model() p = RDF.Parser() p.parse_into_model(m, specloc) classlist, proplist = specInformation(m) # Build HTML list of terms. azlist = buildazlist(classlist, proplist) # Generate Term HTML termlist = "<h3>Classes and Properties (full detail)</h3>" termlist += "<div class='termdetails'>" termlist += docTerms('Class', classlist, m) termlist += docTerms('Property', proplist, m) termlist += "</div>" # Generate RDF from original namespace. u = urllib.urlopen(specloc) rdfdata = u.read() rdfdata.replace("""<?xml version="1.0"?>""", "") # wip.template is a template file for the spec, python-style % escapes # for replaced sections. f = open("../0.1/template.html", "r") template = f.read() print template % (azlist.encode("utf-8"), termlist.encode("utf-8"), rdfdata)
def init_stream_from_string(self, string_rdf, base_uri, parser_name="rdfxml"):
    """Parse *string_rdf* relative to *base_uri*; return the statement stream."""
    return RDF.Parser(name=parser_name).parse_string_as_stream(string_rdf, base_uri)
def posts(self, *args):
    """Fetch this user's del.icio.us posts, optionally narrowed by Tag args.

    Returns:
        A list of RSSTagPost objects (each tagged with this user), or []
        if the feed cannot be fetched or parsed.
    """
    alltags = Set()
    for arg in args:
        if isinstance(arg, Tag):
            alltags.add(arg)
    url = "http://del.icio.us/rss/" + self.user
    if len(alltags) > 0:
        url += "/" + "+".join([str(tag) for tag in alltags])
    model = RDF.Model()
    parser = RDF.Parser()
    try:
        parser.parse_string_into_model(model, get_url_contents(url),
                                       RDF.Uri("http://foo"))
        posts = [
            RSSTagPost(model, p.subject)
            for p in model.find_statements(
                RDF.Statement(
                    None,
                    RDF.Uri(
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                    RDF.Uri("http://purl.org/rss/1.0/item")))
        ]
        for post in posts:
            post.user = self
        return posts
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed; the best-effort
        # contract (return [] on failure) is preserved.
        return []
def parse_handle_to_model(self, rooted=False, storage=None,
                          parse_format='turtle', context=None, **kwargs):
    '''Parse self.handle into RDF model self.model.

    Keyword args may include `base_uri`; otherwise a file:// URI is derived
    from the handle's filename. Returns self.parse_model(...) on the filled
    model.
    '''
    if storage is None:
        # store RDF model in memory for now
        storage = new_storage()
    if self.model is None:
        self.model = RDF.Model(storage)
    if self.model is None:
        raise CDAOError("new RDF.model failed")
    model = self.model
    self.rooted = rooted

    parser = RDF.Parser(name=parse_format)
    if parser is None:
        # BUG FIX: the original formatted this message with the undefined
        # name `mime_type`, which would raise NameError instead of the
        # intended Exception; use the actual format argument.
        raise Exception('Failed to create RDF.Parser for format %s'
                        % parse_format)

    if 'base_uri' in kwargs:
        base_uri = kwargs['base_uri']
    else:
        base_uri = RDF.Uri(string="file://" + os.path.abspath(self.handle.name))

    statements = parser.parse_string_as_stream(self.handle.read(), base_uri)
    for s in statements:
        model.append(s)

    return self.parse_model(model, context=context)
def posts(self, *args):
    """Fetch posts for this tag, optionally scoped to a User argument.

    Extra Tag arguments narrow the feed query. Returns a list of RSSTagPost
    objects, or [] if the feed cannot be fetched or parsed.
    """
    alltags = Set()
    extratags = ""
    user = None
    for arg in args:
        if isinstance(arg, Tag):
            alltags.add(arg)
        if isinstance(arg, User):
            user = arg
    if len(alltags) > 0:
        extratags = "+" + "+".join([str(tag) for tag in alltags])
    if user is not None:
        url = "http://del.icio.us/rss/" + str(user) + "/" + self.name + extratags
    else:
        url = "http://del.icio.us/rss/tag/" + self.name
    model = RDF.Model()
    parser = RDF.Parser()
    try:
        parser.parse_string_into_model(model, get_url_contents(url),
                                       RDF.Uri("http://foo"))
        posts = [RSSTagPost(model, p.subject, self)
                 for p in model.find_statements(RDF.Statement(
                     None,
                     RDF.Uri("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                     RDF.Uri("http://purl.org/rss/1.0/item")))]
        if user is not None:
            for post in posts:
                post.user = user
        return posts
    except Exception:
        # BUG FIX: narrowed from a bare `except:` so SystemExit and
        # KeyboardInterrupt are no longer swallowed.
        return []
def posts():
    """Return RSSTagPost objects for the global del.icio.us RSS feed."""
    feed_url = "http://del.icio.us/rss/"
    model = RDF.Model()
    feed_parser = RDF.Parser()
    feed_parser.parse_string_into_model(model, get_url_contents(feed_url),
                                        RDF.Uri("http://foo"))
    rdf_type = RDF.Uri("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
    rss_item = RDF.Uri("http://purl.org/rss/1.0/item")
    pattern = RDF.Statement(None, rdf_type, rss_item)
    return [RSSTagPost(model, stmt.subject)
            for stmt in model.find_statements(pattern)]
def make_query(rdf, query):
    """Load *rdf* into a fresh model and run *query* (prefixed with PREFIX)."""
    model = RDF.Model()
    RDF.Parser().parse_into_model(model, rdf)
    sparql = """
    %s
    %s""" % (PREFIX, query)
    compiled = RDF.Query(sparql, query_language="sparql")
    return compiled.execute(model)
def modeltest():
    """Load NIF-Molecule.ttl into an in-memory redland model, then drop into IPython."""
    from IPython import embed  # this hardlocks
    store = RDF.MemoryStorage('test')
    model = RDF.Model(store)
    ttl_path = Path('~/git/NIF-Ontology/ttl/NIF-Molecule.ttl').expanduser()
    turtle = RDF.Parser(name='turtle')
    turtle.parse_into_model(model, ttl_path.as_uri())
    embed()
def specgen(self, mode="spec"):
    """The meat and potatoes: Everything starts here."""
    # Parse the ontology at self.specloc into an in-memory redland model.
    # (Python 2 exception syntax.)
    m = RDF.Model()
    p = RDF.Parser()
    try:
        p.parse_into_model(m, self.specloc)
    except IOError, e:
        # Surface read failures to the CLI layer as a Usage error.
        raise Usage("Error reading from ontology: %s" % str(e))
def _initOntology(self):
    """Load the bundled DBpedia 3.9 OWL ontology into an in-memory model."""
    #storage = RDF.HashStorage('dbpedia', options="hash-type='bdb'")
    memory_store = RDF.MemoryStorage()
    ontology_model = RDF.Model(memory_store)
    xml_parser = RDF.Parser(name="rdfxml")
    owl_location = 'file://' + os.path.join(self._getCurrentDir(),
                                            'dbpedia_3.9.owl')
    xml_parser.parse_into_model(ontology_model, owl_location,
                                "http://example.org/")
    return ontology_model
def get_parser(url, format=None):
    """Pick a redland parser for *format*, guessing from *url* when omitted.

    Returns None for formats handled without a streaming parser
    ('sparql', 'sitemap'); raises NameError for anything unrecognized.
    """
    fmt = get_format(url) if format is None else format
    if fmt == 'ttl':
        return RDF.TurtleParser()
    if fmt in ('nt', 'n3'):
        # FIXME: this probably won't do for n3
        return RDF.NTriplesParser()
    if fmt == 'nq':
        return RDF.Parser(name='nquads')
    if fmt == 'rdf':
        return RDF.Parser(name="rdfxml")
    if fmt in ('sparql', 'sitemap'):
        return None
    raise NameError("unsupported format")
def make_query(self, rdf, query):
    """Make sparql query."""
    # Parse the source into a throwaway model, prepend the instance's
    # prefix block, and run the query over it.
    model = RDF.Model()
    RDF.Parser().parse_into_model(model, rdf)
    sparql = """
    %s
    %s""" % (self.rdf_prefix, query)
    compiled = RDF.Query(sparql, query_language="sparql")
    return compiled.execute(model)
def __init__(self, globalities=None, localities=None):
    """Create a raptor-backed parser and empty question/answer state.

    Args:
        globalities: optional dict of global bindings (a fresh dict when omitted).
        localities: optional dict of local bindings (a fresh dict when omitted).

    Raises:
        Exception: if the raptor RDF parser cannot be created.
    """
    self.parser = RDF.Parser('raptor')
    if self.parser is None:
        raise Exception("Failed to create RDF.Parser raptor")
    # BUG FIX: the original used mutable default arguments ({}), so every
    # instance constructed without arguments shared the SAME dicts; use
    # None sentinels and allocate per instance.
    self.localities = {} if localities is None else localities
    self.globalities = {} if globalities is None else globalities
    self.questions = {}
    self.answers = {}
    self.root = None
def _rdfxml_to_ntriples(self, data):
    # Ntriples syntax is not supported by allegro graph
    # as a result format for SPARQL Construct Queries
    # HACK workaround using redland
    # `data` is a file-like object holding RDF/XML; '-' serves as the base URI.
    # (Python 2 exception syntax.)
    import RDF
    model = RDF.Model()
    parser = RDF.Parser()
    try:
        parser.parse_string_into_model(model, data.read(), '-')
    except RDF.RedlandError, err:
        # Re-raise redland parse failures as the store's own error type.
        raise TripleStoreError(err)
def bootstrap(self, filename):
    """Parse the turtle file *filename* into self.model and record its namespaces.

    Returns:
        self, for chaining.

    Raises:
        RDF.RedlandError: if parsing the bootstrap file fails.
    """
    parser = RDF.Parser(name="turtle")
    # BUG FIX: use a context manager so the handle is closed even when
    # parsing raises (the original leaked it on error), and stop shadowing
    # the builtin `file`.
    with open(filename, "r") as handle:
        status = parser.parse_string_into_model(
            self.model, handle.read(), "http://example.com/bootstrap")
    if not status:
        raise RDF.RedlandError("Error parsing bootstrapping file.")
    namespaces = parser.namespaces_seen()
    self.addNamespaces(namespaces)
    return self
def _ntriples_to_turtle(self, data):
    # Turtle syntax is not supported by allegro graph
    # HACK workaround using redland
    # (Python 2 exception syntax.)
    import RDF
    model = RDF.Model()
    parser = RDF.Parser('ntriples')
    data = data.read()
    # Redland expects a trailing newline on ntriples input.
    data = (data.strip() + '\n')
    try:
        parser.parse_string_into_model(model, data, '-')
    except RDF.RedlandError, err:
        # Re-raise redland parse failures as the store's own error type.
        raise TripleStoreError(err)
def _parse(self, file, format, base_uri=None): if format == 'turtle': parser = RDF.TurtleParser() else: parser = RDF.Parser(format) if isinstance(base_uri, unicode): base_uri = base_uri.encode('utf8') data = file.read() file.close() try: stream = parser.parse_string_as_stream(data, base_uri) except RDF.RedlandError, err: raise TripleStoreError(err)
def specgen(specloc, template, instances=False, mode="spec"):
    """The meat and potatoes: Everything starts here."""
    # Module-level state shared with the rest of the spec generator.
    global spec_url
    global spec_ns
    global ns_list
    # Parse the ontology at specloc into an in-memory redland model.
    # (Python 2 module: print statement, old except syntax.)
    m = RDF.Model()
    p = RDF.Parser()
    try:
        p.parse_into_model(m, specloc)
    except IOError, e:
        # On read failure, report the error and show usage
        # (usage() presumably exits — TODO confirm).
        print "Error reading from ontology:", str(e)
        usage()
def getAggregatedIdentifiers(identifier): """Retrieves and parses the resource map with the specified identifier. Returns: List(str) List of identifiers. """ if type(identifier) is not str or len(identifier) < 1: raise Exception("Bad identifier string passed to method.") model = RDF.Model() parser = RDF.Parser(name="rdfxml") base_url = "https://cn.dataone.org/cn/v1/object/" query_url = base_url + urllib.quote_plus(identifier) try: parser.parse_into_model(model, query_url) except RDF.RedlandError as e: print "Exception: Failed to parse RDF/XML at `%s`: %s" % (query_url, e) query = """ SELECT ?s ?o WHERE { ?s <http://www.openarchives.org/ore/terms/aggregates> ?o } """ q = RDF.Query(query) identifiers = [] for result in q.execute(model): if 'o' not in result: continue object_node = result['o'] if object_node.is_resource(): object_node_str = str(object_node) identifier = extractIdentifierFromFullURL(object_node_str) if identifier is not None: identifiers.append(identifier) return identifiers
def load_rdf(self):
    # Reload self.model from self.filename, but only when the file on disk
    # is newer than the cached model. (Python 2 exception syntax.)
    mtime = os.path.getmtime(self.filename)
    if self.model is not None and mtime <= self.modelMtime:
        return
    self.modelMtime = mtime
    log.info("loading rdf from %r" % self.filename)

    self.model = RDF.Model(RDF.MemoryStorage())
    u = RDF.Uri("file:%s" % self.filename)
    try:
        for s in RDF.Parser('turtle').parse_as_stream(u):
            self.model.add_statement(s)
    except (Exception, ), e:
        # e.__class__.__module__ is "RDF", not the real module!
        # Match by class name: only redland parse errors are wrapped as
        # ValueError; everything else propagates unchanged.
        if e.__class__.__name__ != "RedlandError":
            raise
        raise ValueError("Error parsing %s: %s" % (u, e))
def _ntriples_to_turtle(self, data):
    # Turtle syntax is not supported by allegro graph
    # HACK workaround using redland
    import RDF
    triples_model = RDF.Model()
    nt_parser = RDF.Parser('ntriples')
    # Redland expects a trailing newline on ntriples input.
    payload = data.read().strip() + '\n'
    try:
        nt_parser.parse_string_into_model(triples_model, payload, '-')
    except RDF.RedlandError as err:
        raise TripleStoreError(err)
    turtle_writer = RDF.Serializer(name='turtle')
    for prefix, ns in self._nsmap.items():
        turtle_writer.set_namespace(prefix, ns)
    return StringIO(turtle_writer.serialize_model_to_string(triples_model))
def main(specloc, template, mode="spec"):
    """The meat and potatoes: Everything starts here."""
    m = RDF.Model()
    p = RDF.Parser()
    p.parse_into_model(m, specloc)
    classlist, proplist = specInformation(m)

    if mode == "spec":
        # Build HTML list of terms.
        azlist = buildazlist(classlist, proplist)
    elif mode == "list":
        # Build simple <ul> list of terms.
        azlist = build_simple_list(classlist, proplist)

    # Generate Term HTML
    termlist = docTerms('Class', classlist, m)
    termlist += docTerms('Property', proplist, m)

    # Generate RDF from original namespace, stripping the XML prolog.
    u = urllib.urlopen(specloc)
    rdfdata = u.read()
    rdfdata = re.sub(r"(<\?xml version.*\?>)", "", rdfdata)
    rdfdata = re.sub(r"(<!DOCTYPE[^]]*]>)", "", rdfdata)

    template = re.sub(r"^#format \w*\n", "", template)
    template = re.sub(r"\$VersionInfo\$", owlVersionInfo(m).encode("utf-8"),
                      template)
    # NOTE: This works with the assumption that all "%" in the template are
    # escaped to "%%" and it contains exactly as many "%s" placeholders as
    # the parameters below.
    template = template % (azlist.encode("utf-8"), termlist.encode("utf-8"))
    template += ("<!-- specification regenerated at "
                 + time.strftime('%X %x %Z') + " -->")
    return template
def main():
    # Import a DBpedia ntriples dump into a local Neo4j graph database.
    # (Python 2 script: reload/setdefaultencoding, print statements.)
    reload(sys)
    sys.setdefaultencoding("UTF8")
    success = checkArgs()
    if not success:
        print "Usage: python dbpediaNeo4j.py /full/path/filename.nt"
        sys.exit(1)
    # create dbpedia-graph.db
    db, index = createDB()
    counter = 0.0  # float so the percentage division below stays fractional
    # Count lines up front so progress can be reported as a percentage.
    file_lines = int((subprocess.check_output(['wc', '-l', sys.argv[1]])).split()[0])
    # RDF parses dbpedia ntriples dump
    parser = RDF.Parser("ntriples")
    stream = parser.parse_as_stream("file://" + sys.argv[1])
    print
    startTime = datetime.now()
    # start parsing
    for triple in stream:
        # extract nodes and relationship — only the last URI path segment
        # of subject/predicate/object is kept.
        a = str(triple.subject).split('/')[-1]
        r = str(triple.predicate).split('/')[-1]
        b = str(triple.object).split('/')[-1]
        createNodes(db, index, a, b, r)
        counter += 1
        # print updated percentage
        if (counter % 100) == 0:
            perc = (counter / file_lines) * 100
            sys.stdout.write("\rProgress: %d%%" % perc)
            sys.stdout.flush()
    # Shutdown db
    db.shutdown()
    endTime = datetime.now()
    print "\nFinished - %d relationships imported in %d seconds" % (counter, (
        endTime - startTime).seconds)
    print "Move %s/dbpedia-graph.db to your Neo4j data directory ;-)" % os.getcwd(
    )
    return
def read(self, path, fmt='', base_uri=None):
    """Parse the RDF file at *path* into self._model, gunzipping if needed."""
    if fmt == '':
        fmt = self._guess_fmt(path)

    gzipped = self._gzipped(path)
    if gzipped:
        # Decompress into a temp file and parse that instead of the original.
        tmp = self._mktemp()
        self._gunzip(path, tmp)
        path = tmp

    rdf_parser = RDF.Parser(name=fmt)
    logger.info('reading "%s"...' % path)
    rdf_parser.parse_into_model(self._model,
                                'file://' + os.path.abspath(path),
                                base_uri=base_uri)
    logger.info('done.')

    if gzipped:
        os.unlink(tmp)
def init_model(*filenames):
    """Input: An on-disk path (filenames) to start from.
    Output: A model with those suckers parsed.

    Raises RuntimeError if the redland storage or model cannot be created.
    """
    for filename in filenames:  # filenames, not URIs
        die_unless(
            ':/' not in filename, "You passed in something that " +
            "looks like a URI; blowing up")

    storage = RDF.Storage(
        storage_name="hashes",
        name="test",
        options_string="new='yes',hash-type='memory',dir='.'")
    if storage is None:
        # BUG FIX: `raise "..."` (a string exception) is itself a TypeError
        # on any modern Python; raise a real exception type instead.
        raise RuntimeError("new RDF.Storage failed")

    model = RDF.Model(storage)
    if model is None:
        raise RuntimeError("new RDF.Model failed")

    parser = RDF.Parser('raptor')
    for filename in filenames:
        filename_uri = RDF.Uri(string="file:" + filename)
        parser.parse_into_model(model, filename_uri)
    return model
def parse(self, source, sink, **args):
    """Close *source*, then stream-parse its public URI, feeding triples to *sink*."""
    source.close()
    file_uri = source.getPublicId()
    statements = RDF.Parser(name=self.format).parse_as_stream(file_uri)
    # Plain loop instead of a side-effect list comprehension; the built
    # list was discarded anyway.
    for statement in statements:
        sink.add(statement_to_triple(statement))
def main():
    # Benchmark the librdf-backed rdflib parser plugins against rdflib's
    # own turtle/xml parsers, then drop into an IPython shell.
    from IPython import embed
    # Timings recorded from earlier runs, kept for reference:
    """
    Python 3.6.6
    ibttl 2.605194091796875
    ttl 3.8316309452056885
    diff lt - ttl -1.2264368534088135
    librdfxml 31.267616748809814
    rdfxml 58.25124502182007
    diff lr - rl -26.983628273010254
    simple time 17.405116319656372
    """
    """
    Python 3.5.3 (pypy3)
    libttl 2.387338638305664
    ttl 1.3430471420288086
    diff lt - ttl 1.0442914962768555
    librdfxml 24.70371127128601
    rdfxml 17.85916304588318
    diff lr - rl 6.844548225402832
    simple time 18.32300615310669
    """
    # well I guess that answers that question ...
    # librdf much faster for cpython, not for pypy3
    from time import time
    # Register the librdf-backed parsers under custom rdflib format names.
    rdflib.plugin.register('librdfxml', rdflib.parser.Parser, 'librdflib',
                           'libRdfxmlParser')
    rdflib.plugin.register('libttl', rdflib.parser.Parser, 'librdflib',
                           'libTurtleParser')
    # Turtle benchmark: librdf-backed vs rdflib's pure-Python parser.
    p1 = Path('~/git/NIF-Ontology/ttl/NIF-Molecule.ttl').expanduser()
    start = time()
    graph = rdflib.Graph().parse(p1.as_posix(), format='libttl')
    stop = time()
    lttime = stop - start
    print('libttl', lttime)
    #serialize(graph)
    start = time()
    graph = rdflib.Graph().parse(p1.as_posix(), format='turtle')
    stop = time()
    ttltime = stop - start
    print('ttl', ttltime)
    print('diff lt - ttl', lttime - ttltime)
    # RDF/XML benchmark on a larger file.
    p2 = Path('~/git/NIF-Ontology/ttl/external/uberon.owl').expanduser()
    start = time()
    graph2 = rdflib.Graph().parse(p2.as_posix(), format='librdfxml')
    stop = time()
    lrtime = stop - start
    print('librdfxml', lrtime)
    if True:
        start = time()
        graph2 = rdflib.Graph().parse(p2.as_posix(), format='xml')
        stop = time()
        rltime = stop - start
        print('rdfxml', rltime)
        print('diff lr - rl', lrtime - rltime)
    if True:
        # Baseline: raw redland stream-to-tuple conversion, no rdflib graph.
        file_uri = p2.as_uri()
        parser = RDF.Parser(name='rdfxml')
        stream = parser.parse_as_stream(file_uri)
        start = time()
        # t = list(stream)
        t = tuple(statement_to_tuple(statement) for statement in stream)
        stop = time()
        stime = stop - start
        print('simple time', stime)
    embed()
def _get_statement_stream(self):
    """Return a statement stream for self._path_to_rdf using self._format."""
    return RDF.Parser(name=self._format).parse_as_stream(self._path_to_rdf)