예제 #1
0
    def site_spider(self, root_url):
        """
        Walk the site starting at root_url and gather details for each
        page that was found.

        A SpiderResponse is created for root_url with an empty page list,
        then every url returned by link_spider (plus root_url itself) is
        turned into a Page carrying its links, its images and, when the
        request succeeds, its raw response.

        Request failures are printed and otherwise ignored; the page is
        simply left without a response in that case.
        """

        response = o.SpiderResponse(url=root_url)
        response.pages = []

        # crawl outward from the root, staying inside the domain
        # NOTE(review): meaning of the 1000 / True arguments is defined
        # by link_spider, which is not visible here -- confirm
        found = self.link_spider(root_url, 1000, True)

        # per the original author, the crawl above leaves everything
        # cached, so visiting each url again below is cheap
        targets = found + [root_url]
        for target in targets:
            page = o.Page(url=target)
            page.links = self.get_links(target)
            page.images = self.get_images(target)
            try:
                with srvs_connect(Requester) as conn:
                    fetched = conn.urlopen(ro.Request(target))
                page.response = fetched
            except o.Exception as err:
                # a failed request just means this page has no response
                report = "o.request exception: %s %s" % (target, err.msg)
                print(report)
            except Exception as err:
                report = "request exception: %s %s" % (target, err)
                print(report)
예제 #2
0
    def site_spider(self, root_url):
        """
        Walk the site starting at root_url and gather details for each
        page that was found.

        A SpiderResponse is created for root_url with an empty page list,
        then every url returned by link_spider (plus root_url itself) is
        turned into a Page carrying its links, its images and, when the
        request succeeds, its raw response.

        Request failures are printed and otherwise ignored; the page is
        simply left without a response in that case.
        """

        response = o.SpiderResponse(url=root_url)
        response.pages = []

        # crawl outward from the root, staying inside the domain
        # NOTE(review): meaning of the 1000 / True arguments is defined
        # by link_spider, which is not visible here -- confirm
        found = self.link_spider(root_url, 1000, True)

        # per the original author, the crawl above leaves everything
        # cached, so visiting each url again below is cheap
        targets = found + [root_url]
        for target in targets:
            page = o.Page(url=target)
            page.links = self.get_links(target)
            page.images = self.get_images(target)
            try:
                with srvs_connect(Requester) as conn:
                    fetched = conn.urlopen(ro.Request(target))
                page.response = fetched
            except o.Exception as err:
                # a failed request just means this page has no response
                report = 'o.request exception: %s %s' % (target, err.msg)
                print(report)
            except Exception as err:
                report = 'request exception: %s %s' % (target, err)
                print(report)
예제 #3
0
    def get_links(self, url):
        """
        Return the href of every link on the page at url.

        The url is stripped of surrounding whitespace first. An empty
        list is returned when the url ends with one of the
        self.not_html_ext suffixes or when the request yields a falsy
        response. An o.Exception raised by the request is re-raised as
        a new o.Exception naming the url.
        """

        url = url.strip()
        print("get_links: %s" % url)

        # non-html resources (e.g. images) are not parsed for links
        lowered = url.lower()
        if lowered.endswith(self.not_html_ext):
            return []

        # fetch the page
        try:
            with srvs_connect(Requester) as conn:
                fetched = conn.urlopen(ro.Request(url))
            if not fetched:
                return []
        except o.Exception as err:
            failure = "o.Could not make request: %s %s" % (url, err)
            raise o.Exception(failure)
예제 #4
0
    def get_links(self, url):
        """
        Return the href of every link on the page at url.

        The url is stripped of surrounding whitespace first. An empty
        list is returned when the url ends with one of the
        self.not_html_ext suffixes or when the request yields a falsy
        response. An o.Exception raised by the request is re-raised as
        a new o.Exception naming the url.
        """

        url = url.strip()
        print('get_links: %s' % url)

        # non-html resources (e.g. images) are not parsed for links
        lowered = url.lower()
        if lowered.endswith(self.not_html_ext):
            return []

        # fetch the page
        try:
            with srvs_connect(Requester) as conn:
                fetched = conn.urlopen(ro.Request(url))
            if not fetched:
                return []
        except o.Exception as err:
            failure = 'o.Could not make request: %s %s' % (url, err)
            raise o.Exception(failure)
예제 #5
0
    def get_images(self, url):
        """
        Return the src of every image on the page at url.

        The url is stripped of surrounding whitespace first. An empty
        list is returned when the url ends with one of the
        self.not_html_ext suffixes or when the request yields a falsy
        response. An o.Exception raised by the request is re-raised as
        a new o.Exception naming the url. Progress is traced with print
        at each step.
        """

        url = url.strip()
        print("get_images: %s" % url)

        # only html pages are worth parsing for images
        lowered = url.lower()
        if lowered.endswith(self.not_html_ext):
            return []

        # fetch the page
        try:
            print("get image making request: %s" % url)
            with srvs_connect(Requester) as conn:
                fetched = conn.urlopen(ro.Request(url))
            if not fetched:
                print("get image no response: %s" % url)
                return []
        except o.Exception as err:
            # trace marker emitted by the original code before re-raising
            print("ex")
            failure = "o.Could not make request: %s %s" % (url, err)
            raise o.Exception(failure)
예제 #6
0
    def get_images(self, url):
        """
        Return the src of every image on the page at url.

        The url is stripped of surrounding whitespace first. An empty
        list is returned when the url ends with one of the
        self.not_html_ext suffixes or when the request yields a falsy
        response. An o.Exception raised by the request is re-raised as
        a new o.Exception naming the url. Progress is traced with print
        at each step.
        """

        url = url.strip()
        print('get_images: %s' % url)

        # only html pages are worth parsing for images
        lowered = url.lower()
        if lowered.endswith(self.not_html_ext):
            return []

        # fetch the page
        try:
            print('get image making request: %s' % url)
            with srvs_connect(Requester) as conn:
                fetched = conn.urlopen(ro.Request(url))
            if not fetched:
                print('get image no response: %s' % url)
                return []
        except o.Exception as err:
            # trace marker emitted by the original code before re-raising
            print('ex')
            failure = 'o.Could not make request: %s %s' % (url, err)
            raise o.Exception(failure)