Example #1
    def assure_results(self, graph):
        self.assertEqual(count(), (2, 2))
        self.assertEqual(len(graph), 2)
        self.assertIn((rdflib.term.URIRef('http://example.com/foaf#me'),
                       rdflib.term.URIRef('http://xmlns.com/foaf/0.1/name'),
                       rdflib.term.Literal(u'Max Mustermann')),
            graph.triples((None, None, None))
        )
        self.assertIn((rdflib.term.URIRef('http://example.com/foaf#me'),
                       rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
                       rdflib.term.URIRef('http://xmlns.com/foaf/0.1/Person')),
            graph.triples((None, None, None))
        )
        self.assert_(not my_graph_diff(graph, self.origin.get_graph()))
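For context, the assertions above expect exactly two FOAF triples in the crawled graph: a foaf:name and an rdf:type foaf:Person statement for the same subject. The following is a minimal standalone sketch (not part of the test suite) that builds the same graph with plain rdflib and checks membership; the URIs are taken from the assertions, everything else is illustrative.

import rdflib

# Namespaces spelled out explicitly to match the URIs used in the assertions.
FOAF = rdflib.Namespace("http://xmlns.com/foaf/0.1/")
RDF_TYPE = rdflib.URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")

graph = rdflib.Graph()
me = rdflib.URIRef("http://example.com/foaf#me")
graph.add((me, FOAF.name, rdflib.Literal(u"Max Mustermann")))
graph.add((me, RDF_TYPE, FOAF.Person))

# Same checks the test performs, expressed as plain assertions.
assert len(graph) == 2
assert (me, FOAF.name, rdflib.Literal(u"Max Mustermann")) in graph
assert (me, RDF_TYPE, FOAF.Person) in graph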
Example #2
    def GET(
        self,
        GRAPH_SIZE_LIMIT=30000,
        only_follow_uris=None,
        handle_owl_imports=False,
        raise_errors=True,
        skip_urls=None,
        httphandler=None,
    ):

        if not self.uri:
            raise Exception("Please provide URI first")

        if skip_urls is not None and self.uri.encode("utf8") in skip_urls:
            self.add_error("Skipped")
            self.processed = True
            return

        logger.info(u"GET %s..." % self.uri)

        if self.has_unsaved_changes():
            if self.processed:
                raise Exception("Please save all changes before querying "
                                "again. Merging not supported yet")
            else:
                logger.warning("There were Resource objects created before "
                               "processing the resource's origin.")

        now = datetime.datetime.now()
        # self.timedelta = datetime.timedelta(minutes=1)
        if hasattr(self, "timedelta") and hasattr(self, 'last_processed'):
            time_since_last_processed = now - self.last_processed
            if (time_since_last_processed < self.timedelta):
                logger.info(
                    "Not processing %s again because it was processed "
                    "only %s ago"
                    % (self.uri, time_since_last_processed))
                return
            self.last_processed = now

        try:
            data = self.backend.GET(self.uri, httphandler=httphandler)
        except urllib2.HTTPError as e:
            if e.code in [
                401,
                403,
                503,  # Service Temporarily Unavailable
                404,  # Not Found
            ]:
                self.add_error(e.code)
            if raise_errors:
                raise e
            else:
                return
        except urllib2.URLError as e:
            self.add_error("timeout")
            if raise_errors:
                raise e
            else:
                return
        except ContentNegotiationError as e:
            logger.error(e.message)
            if raise_errors:
                raise e
            else:
                return

        graph = rdflib.graph.ConjunctiveGraph(identifier=self.uri)

        try:
            if data:
                # Important: Do not pass data=data without publicID=uri because
                # relative URIs (#deri) won't be an absolute uri in that case!
                publicID = self.uri

                reference_time = datetime.datetime.now()

                graph.parse(data=data, publicID=publicID, format=self.backend.format)

                now = datetime.datetime.now()
                self.graph_parse_time = now - reference_time

                # Note: rdflib.compare does not work correctly with a
                # ConjunctiveGraph unless it contains only one graph.
        except SAXParseException as e:
            self.add_error("SAXParseException")
            logger.error("SAXParseException: %s" % self)
            if raise_errors:
                raise e
            else:
                return
        except rdflib.exceptions.ParserError as e:
            self.add_error("ParserError")
            logger.error("ParserError: %s" % self)
            if raise_errors:
                raise e
            else:
                return
        except IOError as e:
            self.add_error("IOError")
            logger.error("IOError: %s" % self)
            if raise_errors:
                raise e
            else:
                return

        self.processed = True

        if hasattr(self, "errors"):
            delattr(self, "errors")

        g_length = len(graph)

        if g_length > 0:
            if len(list(graph.contexts())) > 1:
                # detect problems with graph contexts: rdflib can only
                # compare graphs with one context. If a graph has more
                # contexts this might result in wrong comparisons of graphs
                # Still ignored here as ldtools is more robust by doing so.
                logger.error("The graph has more than one context. This"
                             "might cause problems comparing the graphs!")

        if g_length > GRAPH_SIZE_LIMIT:
            logger.error("Maximum graph size exceeded. Thr graph is %s "
                         "triples big. Limit is set to %s. The aquired "
                         "graph exceeds that! Pass GRAPH_SIZE_LIMIT to set it "
                         "differently." % (g_length, GRAPH_SIZE_LIMIT))
            return

        if hasattr(self, "_graph"):
            # we already assured that there are no unsaved_changes
            # --> get_graph() == _graph

            logger.info(u"Already crawled: %s. Comparing graphs..." % self.uri)

            if compare.to_isomorphic(self._graph) ==\
               compare.to_isomorphic(graph):
                return
            else:
                logger.warning("GET retrieved updates for %s!" % self.uri)
                my_graph_diff(self._graph, graph)

                for resource in self.get_resources():
                    resource.delete()
                delattr(self, "handled")

        if hasattr(self, "handled"):
            return

        self._graph = graph

        graph_handler = GraphHandler(
            only_follow_uris=only_follow_uris,
            handle_owl_imports=handle_owl_imports,
            origin=self)
        graph_handler.populate_resources(graph=graph)

        self.handled = True
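The GET method above drives the whole crawl: fetch the URI, parse the response into a ConjunctiveGraph, compare it against a previously stored graph, and hand the result to a GraphHandler. A hypothetical caller is sketched below; the import path and the objects.get_or_create manager are assumptions for illustration and are not confirmed by this snippet, while get_graph(), processed, and the errors attribute appear in the code above.

# Hypothetical usage sketch; import path and manager API are assumptions.
from ldtools.origin import Origin

uri = "http://example.com/foaf"  # any dereferenceable RDF document

origin, created = Origin.objects.get_or_create(uri)

# Fetch and process the document; don't raise on HTTP errors, and cap the
# accepted graph size via the GRAPH_SIZE_LIMIT parameter shown above.
origin.GET(raise_errors=False, GRAPH_SIZE_LIMIT=50000)

if origin.processed and not hasattr(origin, "errors"):
    print(len(origin.get_graph()))  # number of triples retrieved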
Example #3
    def assure_results(self, graph):
        self.assertEqual(count(), (3, 3))
        self.assertEqual(len(graph), 2)
        self.assert_(not my_graph_diff(graph, self.origin.get_graph()))
Example #4
    def assure_results(self, graph):
        self.assertEqual(count(), (5, 8))
        self.assertEqual(len(graph), 5)
        # assert graph == get_graph()
        self.assert_(not my_graph_diff(graph, self.origin.get_graph()))
        self.assert_(self.origin2.processed)