def create(self, uri, BACKEND=None):
    """Create an Origin keyed by the slash form of ``uri``.

    ``uri`` is first converted with ``get_rdflib_uriref``. If it differs
    from its slash URL it is replaced by the slash URL and a debug
    message is logged.

    :param uri: URI of the origin to create.
    :param BACKEND: backend to attach to the origin; when falsy a new
        ``RestBackend()`` is used instead.
    :returns: whatever ``self.post_create_hook`` returns for the newly
        created origin.
    """
    uri = get_rdflib_uriref(uri)

    if not uri == get_slash_url(uri):
        logger.debug(
            "URI passed to Origin Manager was not a slash URI: %s. "
            "Fixed now." % uri
        )
        uri = get_slash_url(uri)

    if BACKEND:
        backend = BACKEND
    else:
        backend = RestBackend()

    new_origin = super(OriginManager, self).create(
        pk=uri,
        uri=uri,
        backend=backend,
    )
    return self.post_create_hook(new_origin)
def get_or_create(self, uri, **kwargs):
    """Fetch the Origin for ``uri``, creating it when it does not exist.

    ``uri`` is converted with ``get_rdflib_uriref`` and, if it is not
    already equal to its slash URL, replaced by the slash URL (a warning
    is logged in that case).

    :param uri: URI of the origin to look up.
    :param kwargs: forwarded to ``self.create`` only when the origin has
        to be created; a warning notes that they are ignored for the
        lookup itself.
    :returns: ``(origin, created)`` where ``created`` is ``False`` for an
        existing origin and ``True`` for a freshly created one.
    """
    uri = get_rdflib_uriref(uri)

    if not uri == get_slash_url(uri):
        logger.warning(
            "URI passed to Origin Manager was not a slash URI: %s. "
            "Fixed now." % uri
        )
        uri = get_slash_url(uri)

    try:
        if kwargs:
            logger.warning("kwargs are ignored for get.")
        existing_origin = self.get(uri)
    except self.model.DoesNotExist:
        return self.create(uri, **kwargs), True
    else:
        return existing_origin, False
def test_hash_or_slash_uri_exceptions(self):
    """Check ``get_slash_url`` against a table of special-case URIs.

    Each entry is an ``(input, expected)`` pair of URI strings; both are
    wrapped in ``rdflib.URIRef`` before comparison. The table is
    currently empty, so the loop body never runs.
    """
    test_cases = []

    for raw_uri, raw_expected in test_cases:
        print(urlparse.urlparse(raw_uri))
        uri_ref = rdflib.URIRef(raw_uri)
        expected_ref = rdflib.URIRef(raw_expected)
        self.assertEqual(get_slash_url(uri_ref), expected_ref, msg=uri_ref)
def is_authoritative_resource(self):
    """Tell whether this resource is authoritative with respect to its origin.

    Definition "authoritative" according to "SAOR: Authoritative
    Reasoning for the Web"
    http://www.springerlink.com/content/w47632745gm76x01/

    :returns: ``True`` when ``self._uri`` is an ``rdflib.BNode`` or when
        the slash URL of ``self._uri`` equals ``self._origin.uri``;
        ``False`` otherwise.
    :rtype: bool
    """
    if isinstance(self._uri, rdflib.BNode):
        return True
    # Always hand back a real bool: the previous version fell off the end
    # and returned None for the non-authoritative case.
    return bool(get_slash_url(self._uri) == self._origin.uri)
def populate_resources(self, graph):
    """Turn every triple of ``graph`` into a property on a Resource.

    For each ``(subject, predicate, object)`` triple:

    * if ``self.handle_owl_imports`` is set and the triple is an
      ``owl:imports`` statement pointing to a URIRef, the imported
      origin is fetched (``origin.GET()``) before continuing;
    * if the predicate is to be followed (``self.only_follow_uris`` is
      ``None`` or contains the predicate) and the object is a valid URL,
      an Origin for the object's slash URL is looked up or created;
    * the subject's Resource (within ``self.origin``) is looked up or
      created and the predicate/object pair is added to it.

    Afterwards the elapsed time is stored in
    ``self.origin.graph_handler_time`` and ``_has_changes`` is reset to
    ``False`` on every resource of ``self.origin``.

    :param graph: iterable of triples that also exposes
        ``namespace_manager.namespaces()``.
    """
    # Maps the unicode form of each namespace URL to its prefix.
    namespace_short_notation_reverse_dict = {
        unicode(rdflib_url): prefix
        for rdflib_url, prefix in reverse_dict(
            safe_dict(dict(graph.namespace_manager.namespaces()))
        ).items()
    }
    reference_time = datetime.datetime.now()

    for subject, predicate, obj_ect in graph:
        assert hasattr(subject, "n3")

        # workaround for rdflib's unicode problems
        assert predicate.encode('utf8')

        obj_is_uriref = isinstance(obj_ect, rdflib.URIRef)

        if (self.handle_owl_imports
                and predicate == rdflib.OWL.imports
                and obj_is_uriref):
            uri = get_slash_url(obj_ect)
            origin, _created = Origin.objects.get_or_create(uri=uri)

            # The space before "first" was missing, which glued the word
            # onto the URI in the emitted log line.
            logger.info("Interrupting to process owl:imports %s "
                        "first" % (origin.uri))
            origin.GET()

        follow_predicate = (
            self.only_follow_uris is None
            or predicate in self.only_follow_uris
        )
        # wrong scheme mailto, tel, callto --> should be Literal?
        if follow_predicate and obj_is_uriref and is_valid_url(obj_ect):
            obj_uriref = get_slash_url(obj_ect)
            Origin.objects.get_or_create(uri=obj_uriref)

        resource, _created = Resource.objects.get_or_create(
            uri=subject, origin=self.origin)
        resource._add_property(predicate, obj_ect,
                               namespace_short_notation_reverse_dict)

    now = datetime.datetime.now()
    self.origin.graph_handler_time = now - reference_time

    for resource in self.origin.get_resources():
        resource._has_changes = False
def test_hash_or_slash_uri_result(self):
    """Check that ``get_slash_url`` maps plain string URIs as expected.

    Inputs are passed to ``get_slash_url`` as plain strings; only the
    expected value is wrapped in ``rdflib.URIRef`` for the comparison.
    """
    test_cases = [
        (
            "http://creativecommons.org/licenses/by-nc/3.0/",
            "http://creativecommons.org/licenses/by-nc/3.0/",
        ),
        (
            "http://www.ifrade.es/#frade",
            "http://www.ifrade.es/",
        ),
    ]

    for raw_uri, raw_expected in test_cases:
        print(urlparse.urlparse(raw_uri))
        expected_ref = rdflib.URIRef(raw_expected)
        self.assertEqual(get_slash_url(raw_uri), expected_ref, msg=raw_uri)
def get_authoritative_resource(uri, create_nonexistent_origin=True):
    """Return the Resource for ``uri`` taken from its authoritative origin.

    The authoritative origin is the one whose URI equals the slash URL
    of ``uri``. If the origin reports no unsaved changes it is fetched
    (``origin.GET``) without following any URIs before the resource is
    looked up.

    :param uri: URI of the wanted resource.
    :param create_nonexistent_origin: when exactly one matching origin
        is not found, create/get it if true, otherwise raise.
    :raises Resource.DoesNotExist: when exactly one matching origin is
        not found and ``create_nonexistent_origin`` is false.
    :returns: the result of ``Resource.objects.get`` for ``uri`` within
        the authoritative origin.
    """
    uri = get_rdflib_uriref(uri)
    origin_uri = get_slash_url(uri)

    matching_origins = list(Origin.objects.filter(uri=origin_uri))

    if len(matching_origins) == 1:
        origin = matching_origins[0]
    elif create_nonexistent_origin:
        origin, _created = Origin.objects.get_or_create(uri=origin_uri)
    else:
        raise Resource.DoesNotExist(
            "No authoritative Resource found for %s" % uri)

    if not origin.has_unsaved_changes():
        origin.GET(only_follow_uris=[], raise_errors=False)

    return Resource.objects.get(uri=uri, origin=origin)
def _pprint_resource_state(resource):
    """Pretty-print ``resource.__dict__`` without its bookkeeping attributes."""
    # The attributes are removed from the object itself before dumping it.
    for attr_name in ("_has_changes", "pk"):
        if hasattr(resource, attr_name):
            delattr(resource, attr_name)
    pprint.pprint(resource.__dict__)


def execute_ldtools(
    verbosity,
    origin_urls,
    depth,
    follow_all,
    follow_uris,
    socket_timeout,
    GRAPH_SIZE_LIMIT,
    print_all_resources,
    only_print_uris,
    only_print_uri_content,
    only_negotiate
):
    """Fetch the given origins and print what was found.

    :param verbosity: passed to ``set_colored_logger``.
    :param origin_urls: URLs to retrieve; each is reduced to its slash
        URL before the Origin is looked up or created.
    :param depth: number of extra rounds in which every known Origin is
        fetched again; falsy skips this step.
    :param follow_all: when true, every URI is followed
        (``only_follow_uris=None``).
    :param follow_uris: predicates to follow when ``follow_all`` is
        false; if also falsy nothing is followed.
    :param socket_timeout: if truthy, set as the default socket timeout.
    :param GRAPH_SIZE_LIMIT: if truthy, forwarded to ``origin.GET``.
    :param print_all_resources: when true, print every known Resource
        at the end.
    :param only_print_uris: with ``print_all_resources``, print each
        resource itself instead of its ``__dict__``.
    :param only_print_uri_content: only fetch through the backend and
        print the raw data, then exit.
    :param only_negotiate: only fetch through the backend (HTTP debug
        output enabled), then exit.

    Calls ``sys.exit(0)`` after the retrieval loop when
    ``only_negotiate`` or ``only_print_uri_content`` is set.
    """
    set_colored_logger(verbosity)

    # customize Origin.objects.post_create_hook for performance reasons
    def custom_post_create_hook(origin):
        origin.timedelta = datetime.timedelta(minutes=5)
        return origin
    Origin.objects.post_create_hook = custom_post_create_hook

    url_count = len(origin_urls)
    if url_count > 1:
        logger.info("Retrieving content of %s URLs" % url_count)

    # These messages go through the module logger, like every other
    # message here, so they respect set_colored_logger() and do not make
    # the logging module auto-configure the root logger.
    if follow_all:
        only_follow_uris = None
        logger.info("Following all URIs")
    elif follow_uris:
        only_follow_uris = follow_uris
        logger.info("Following values matching: %s"
                    % ", ".join(only_follow_uris))
    else:
        only_follow_uris = []

    if socket_timeout:
        import socket
        logger.info("Setting socket timeout to %s" % socket_timeout)
        socket.setdefaulttimeout(socket_timeout)

    kw = dict(raise_errors=False)
    if GRAPH_SIZE_LIMIT:
        kw["GRAPH_SIZE_LIMIT"] = GRAPH_SIZE_LIMIT

    raw_fetch_only = only_negotiate or only_print_uri_content

    for url in origin_urls:
        url = get_slash_url(url)
        origin, _created = Origin.objects.get_or_create(url)

        logger.info("Retrieving content of %s" % origin.uri)

        if raw_fetch_only:
            try:
                data = origin.backend.GET(
                    uri=origin.uri,
                    httphandler=urllib2.HTTPHandler(debuglevel=1))
            except Exception as exc:
                # Best effort: report the failure and move to the next URL.
                print(exc)
                continue
            if only_print_uri_content:
                print('\n', data, '\n')
        else:
            origin.GET(only_follow_uris=only_follow_uris, **kw)

    if raw_fetch_only:
        sys.exit(0)

    if depth:
        for _ in range(depth):
            for origin in Origin.objects.all():
                origin.GET(only_follow_uris=only_follow_uris, **kw)

    for orig_url in origin_urls:
        url = get_slash_url(orig_url)
        origin = Origin.objects.get(url)
        for r in origin.get_resources():
            if r._uri == get_rdflib_uriref(orig_url):
                logger.info(u"Printing all available information "
                            "about {0}".format(r._uri))
                _pprint_resource_state(r)

    if print_all_resources:
        all_resources = Resource.objects.all()
        if only_print_uris:
            for resource in all_resources:
                print(resource)
        else:
            for r in all_resources:
                _pprint_resource_state(r)
def get_resource_and_connected_resources(uri):
    """Fetch the origin of ``uri`` and return the Resource for ``uri``.

    The origin for the slash URL of ``uri`` is looked up or created and
    fetched with the keyword arguments in ``GET_kw``; the Resource for
    ``uri`` within that origin is then looked up or created.

    :param uri: URI of the wanted resource.
    :returns: the Resource object for ``uri``.
    """
    origin, _origin_created = Origin.objects.get_or_create(
        uri=get_slash_url(uri))
    origin.GET(**GET_kw)

    resource, _resource_created = Resource.objects.get_or_create(
        uri=uri, origin=origin)
    return resource