def load_context(url):
    """
    A self-aware document loader.

    Contexts that MediaGoblin stores internally are served from the
    builtin table or the plugin hook; anything else falls back to pyld's
    network loader.  Results are memoized in ``_CONTEXT_CACHE``.
    """
    if url in _CONTEXT_CACHE:
        return _CONTEXT_CACHE[url]

    # Builtin context first, then anything a plugin registered for this URL.
    document = BUILTIN_CONTEXTS.get(url)
    if document is None:
        document = hook_handle(("context_url_data", url))

    if document is None:
        # Nothing stored locally; defer to the normal network loader.
        document = jsonld.load_document(url)
    else:
        # Wrap the raw local document in the envelope pyld loaders return.
        document = {
            'contextUrl': None,
            'documentUrl': url,
            'document': document,
        }

    _CONTEXT_CACHE[url] = document
    return document
def load_context(url):
    """
    A self-aware document loader.

    For those contexts MediaGoblin stores internally, load them from disk.
    """
    try:
        # Fast path: previously resolved contexts are memoized.
        return _CONTEXT_CACHE[url]
    except KeyError:
        pass

    # Try the builtin table, then any plugin-provided schema for this URL.
    document = BUILTIN_CONTEXTS.get(url, None)
    if document is None:
        document = hook_handle(("context_url_data", url))

    if document is not None:
        # Package the locally stored document the way pyld expects.
        document = {
            'contextUrl': None,
            'documentUrl': url,
            'document': document,
        }
    else:
        # Unknown context; let pyld fetch it over the network.
        document = jsonld.load_document(url)

    _CONTEXT_CACHE[url] = document
    return document
def ema_url_from_jsonld(jsonld_filename):
    """
    Takes a .jsonld filename and extracts the full EMA request URL.

    Returns the first ``@id`` found under a ``W3C_HAS_SRC`` key in any
    graph of the nanopub document, or ``None`` when no item carries one.
    """
    nanopub = jsonld.load_document(f"{NANOPUB_URL}/{jsonld_filename}")
    # Lazily walk every item of every @graph, yielding source URLs.
    sources = (
        item[W3C_HAS_SRC][0]['@id']
        for graph in nanopub["document"]
        for item in graph["@graph"]
        if W3C_HAS_SRC in item
    )
    return next(sources, None)
def loader(url):
    """Resolve a document for ``url``.

    Known URLs come straight from ``_url_map``; unknown ones are fetched
    via pyld when ``load_unknown_urls`` allows it, otherwise a
    ``JsonLdError`` is raised.
    """
    if url in _url_map:
        return _url_map[url]

    # Guard clause: refuse unknown URLs when network loading is disabled.
    if not load_unknown_urls:
        raise jsonld.JsonLdError(
            "url not found and loader set to not load unknown URLs.",
            {'url': url})

    doc = jsonld.load_document(url)
    # @@: Is this optimization safe in all cases?
    raw = doc["document"]
    if isinstance(raw, str):
        doc["document"] = json.loads(raw)
    if cache_externally_loaded:
        _url_map[url] = doc
    return doc
def cached_load_document(url):
    """Read local cached copy of URL if available, else fallback to network.

    :param url: document URL to resolve.
    :return: pyld loader result dict with 'contextUrl', 'documentUrl'
        and 'document' keys.
    """
    filepath = in_cache(url)
    if filepath is None:
        # No cached copy; let pyld fetch it over the network.
        # Lazy %-args avoid formatting when debug logging is off.
        logging.debug("Using default loader to get %s", url)
        return jsonld.load_document(url)

    logging.debug("Reading %s from %s", url, filepath)
    # Context manager guarantees the file handle is closed
    # (the original left it open).
    with open(filepath, 'r') as fh:
        data = fh.read()
    return {
        'contextUrl': None,
        'documentUrl': None,
        'document': data,
    }
def load_document(url):
    """Retrieves JSON-LD for the given URL from a local file if available,
    and falls back to the network.

    :param url: context URL being resolved.
    :return: pyld loader result dict.
    """
    # Contexts we ship alongside this module, keyed by canonical URL.
    files = {
        AnnotationWriter.JSONLD_CONTEXT: "anno.jsonld",
        AnnotationWriter.LDP_CONTEXT: "ldp.jsonld",
    }
    if url not in files:
        return jsonld.load_document(url)

    base_path = os.path.join(os.path.split(__file__)[0], "jsonld")
    jsonld_file = os.path.join(base_path, files[url])
    # 'with' ensures the file handle is closed (it previously leaked).
    with open(jsonld_file) as fh:
        data = fh.read()
    return {"contextUrl": None, "documentUrl": url, "document": data}
def lookup_by_identifier(self, identifier, processed_uris=None):
    """Turn an Identifier into a JSON-LD document.

    :param identifier: an Identifier whose type is OCLC_WORK or
        OCLC_NUMBER.
    :param processed_uris: set of URLs already handled; callers doing a
        multi-step lookup should pass (and reuse) their own set.
    :return: (data, success) — (None, True) when the URL was already
        processed, (None, False) when the lookup raised.
    """
    # The original used a mutable default (`processed_uris=set()`),
    # which is shared across calls; build a fresh set per call instead.
    if processed_uris is None:
        processed_uris = set()
    if identifier.type == Identifier.OCLC_WORK:
        foreign_type = 'work'
        url = self.WORK_BASE_URL
    elif identifier.type == Identifier.OCLC_NUMBER:
        foreign_type = "oclc"
        url = self.BASE_URL
    # NOTE(review): any other identifier.type leaves `url` unbound and
    # raises NameError below — presumably callers only ever pass these
    # two types; confirm upstream.
    url = url % dict(id=identifier.identifier, type=foreign_type)
    if url in processed_uris:
        self.log.debug("SKIPPING %s, already processed.", url)
        return None, True
    processed_uris.add(url)
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    # Python 2 `except Exception, e` syntax modernized to `as`.
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False
def load_document(url):
    """Retrieves JSON-LD for the given URL from a local file if available,
    and falls back to the network.

    :param url: context URL being resolved.
    :return: pyld loader result dict whose 'document' is unicode text.
    """
    # Contexts we ship alongside this module, keyed by canonical URL.
    files = {
        AnnotationWriter.JSONLD_CONTEXT: "anno.jsonld",
        AnnotationWriter.LDP_CONTEXT: "ldp.jsonld"
    }
    if url not in files:
        return jsonld.load_document(url)

    base_path = os.path.join(os.path.split(__file__)[0], 'jsonld')
    jsonld_file = os.path.join(base_path, files[url])
    # Open in binary so .decode('utf-8') works on both Python 2 and 3
    # (a text-mode read returns `str`, which has no .decode on Python 3);
    # 'with' also closes the previously-leaked file handle.
    with open(jsonld_file, 'rb') as fh:
        data = fh.read()
    return {
        "contextUrl": None,
        "documentUrl": url,
        "document": data.decode('utf-8')
    }
def get_jsonld(self, url):
    """Fetch ``url`` as JSON-LD, serving the locally cached representation.

    :return: (doc, cached) where doc is a pyld loader dict (or None on
        failure) and cached indicates whether the content came from the
        local Representation cache.
    """
    representation, cached = Representation.get(self._db, url)
    try:
        # Probe the URL with pyld; the result is deliberately discarded
        # (the original bound it to an unused `data` local) — only a
        # load failure matters here.
        jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False
    if cached and not representation.content:
        # The cached entry has no body; force a refetch with max_age=0.
        representation, cached = Representation.get(
            self._db, url, max_age=0)
    if not representation.content:
        return None, False
    doc = {
        'contextUrl': None,
        'documentUrl': url,
        'document': representation.content.decode('utf8')
    }
    return doc, cached
def get_jsonld(self, url):
    """Return (doc, cached): a pyld loader dict built from the locally
    stored representation of ``url``, or (None, False) on failure."""
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False

    # An empty body in a cached entry is useless — refetch immediately.
    if cached and not representation.content:
        representation, cached = Representation.get(
            self._db, url, max_age=0)

    content = representation.content
    if not content:
        return None, False

    return {
        'contextUrl': None,
        'documentUrl': url,
        'document': content.decode('utf8'),
    }, cached
def _cached_load_document(url):
    """Loader of pyld document from a url, which caches loaded instance
    on disk.

    :param url: schema/context URL to load.
    :return: pyld loader result dict, possibly restored from the pickle
        cache file.
    """
    doc_fname = _get_schema_url_cache_filename(url)
    doc = None
    if os.path.exists(doc_fname):
        try:
            lgr.debug("use cached request result to '%s' from %s",
                      url, doc_fname)
            # NOTE: unpickling is only safe because we wrote this cache
            # file ourselves; never point it at untrusted data.
            # 'with' closes the handle (the bare open() leaked it).
            with open(doc_fname, 'rb') as f:
                doc = pickle.load(f)
        except Exception as e:
            # it is OK to ignore any error and fall back on the true source
            lgr.warning(
                "cannot load cache from '%s', fall back on schema download: %s",
                doc_fname, exc_str(e))
    if doc is None:
        from pyld.jsonld import load_document
        doc = load_document(url)
        assure_dir(dirname(doc_fname))
        # use pickle to store the entire request result dict;
        # 'with' guarantees the write handle is flushed and closed.
        with open(doc_fname, 'wb') as f:
            pickle.dump(doc, f)
        # Lazy %-args, consistent with the other lgr calls above
        # (original eagerly called str.format here).
        lgr.debug("stored result of request to '%s' in %s", url, doc_fname)
    return doc
def _fetch_context(self, active_ctx, url, cycles):
    """Dereference a remote @context URL and normalize the result.

    :param active_ctx: active context dict; its 'processingMode' entry
        selects JSON-LD 1.0 vs 1.1 error codes.
    :param url: the @context URL to fetch.
    :param cycles: set of URLs already fetched during this resolve
        operation; mutated in place to track recursion.
    :return: (context, remote_doc) where context is a dict with a single
        '@context' key and remote_doc is the raw loader result.
    :raises jsonld.JsonLdError: on overflow, cycles, or load failure.
    """
    # check for max context URLs fetched during a resolve operation
    if len(cycles) > MAX_CONTEXT_URLS:
        raise jsonld.JsonLdError(
            'Maximum number of @context URLs exceeded.',
            'jsonld.ContextUrlError', {'max': MAX_CONTEXT_URLS},
            # JSON-LD 1.0 and 1.1 specify different error codes here.
            code=('loading remote context failed'
                  if active_ctx.get('processingMode') == 'json-ld-1.0'
                  else 'context overflow'))

    # check for context URL cycle
    # shortcut to avoid extra work that would eventually hit the max above
    if url in cycles:
        raise jsonld.JsonLdError(
            'Cyclical @context URLs detected.',
            'jsonld.ContextUrlError', {'url': url},
            code=('recursive context inclusion'
                  if active_ctx.get('processingMode') == 'json-ld-1.0'
                  else 'context overflow'))

    # track cycles
    cycles.add(url)

    try:
        # Request with the json-ld#context profile so servers can serve
        # the context representation specifically.
        remote_doc = jsonld.load_document(
            url,
            {'documentLoader': self.document_loader},
            requestProfile='http://www.w3.org/ns/json-ld#context')
        # Fall back to the URL itself if the loader returned no document.
        context = remote_doc.get('document', url)
    except Exception as cause:
        raise jsonld.JsonLdError(
            'Dereferencing a URL did not result in a valid JSON-LD object. ' +
            'Possible causes are an inaccessible URL perhaps due to ' +
            'a same-origin policy (ensure the server uses CORS if you are ' +
            'using client-side JavaScript), too many redirects, a ' +
            'non-JSON response, or more than one HTTP Link Header was ' +
            'provided for a remote context.',
            'jsonld.InvalidUrl', {'url': url, 'cause': cause},
            code='loading remote context failed')

    # ensure ctx is an object
    # (frozendict is accepted alongside dict — presumably pyld may hand
    # back immutable documents; confirm against the loader in use.)
    if not isinstance(context, dict) and not isinstance(context, frozendict):
        raise jsonld.JsonLdError(
            'Dereferencing a URL did not result in a JSON object. The ' +
            'response was valid JSON, but it was not a JSON object.',
            'jsonld.InvalidUrl', {'url': url}, code='invalid remote context')

    # use empty context if no @context key is present
    if '@context' not in context:
        context = {'@context': {}}
    else:
        context = {'@context': context['@context']}

    # append @context URL to context if given
    if remote_doc['contextUrl']:
        if not isinstance(context['@context'], list):
            context['@context'] = [context['@context']]
        context['@context'].append(remote_doc['contextUrl'])

    return (context, remote_doc)