    def process_get_response(self, response):
        logger.debug(u"Called {} for {} ".format('process_get_response', self.encoded_url))

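        # On failure, keep the reason so the spider can report this URL as broken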
        if response.error:
            logger.debug(u"Error processing  get request: {} with error : {}  ( {} )"
                         % (self.encoded_url, response.error, response.reason))
            self.failure_message = response.reason
        else:
            html_source = response.body
            html_source = decode_to_unicode(html_source)
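            # Only extract outgoing links when the page belongs to the crawled site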
            if self.is_page_internal():
                dom = html.fromstring(html_source)
                # logger.debug("obtained dom object for {}".format(encoded_url))

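                # Walk every anchor's href, normalize it, and queue internal links for crawling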
                link_count = 0
                for href_value in dom.xpath('//a/@href'):
                    href_value = decode_to_unicode(href_value)
                    logger.debug(u"Entering for loop for for {} with href {}".format(self.encoded_url, href_value))
                    self._process_hardcoded_url(href_value)
                    link = self._format_link(href_value)
                    logger.debug(u"obtained link  object{} for {}".format(link, self.encoded_url))

                    if link:
                        parsed_link = obtain_domain_with_subdomain_for_page(link)

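                        # Skip links whose domain is on the configured skip list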
                        if parsed_link not in self.domains_to_skip:
                            link_page = TornadoClientPage(link, self, self.base_site, self.base_domain,
                                                          self.domains_to_skip)
                            self.links.add(link_page)
                            link_page.parent = self
                            link_count += 1
        self.finalize_process(self.spider)
    def _format_link(self, href_value):
        href_value = decode_to_unicode(href_value.strip())
        if href_value.startswith('#'):
            link = self.url
        else:
            href_value = href_value.replace("..", "") if href_value.startswith("..") else href_value
            link = urlparse.urljoin(self.url, href_value, allow_fragments=False)
            link = link if 'javascript:void' not in href_value and not href_value.startswith('mailto') else None
        return decode_to_unicode(link)
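
    # Runs under Tornado's gen machinery: the fetch is yielded and the result is handed back via raise Return(...)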
    def _make_get_request(self):
        logger.debug(u"Called {} for {} ".format('_make_get_request', self.encoded_url))

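        # Build the GET request with a browser-like User-Agent and allow up to 10 redirects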
        request = HTTPRequest(method='GET', url=self.url, request_timeout=PAGE_TIMEOUT, follow_redirects=True,
                              headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 "
                                                     "(KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1"},
                              max_redirects=10)
        try:
            response = yield AsyncHTTPClient().fetch(request)
        except Exception as ex:
            logger.debug(
                u"Error processing get request for : %s with error : %s" % (self.encoded_url, decode_to_unicode(str(ex))))
            # Not every exception carries an HTTP status code, so fall back to -1
            self.response_code = getattr(ex, 'code', -1)
            self.failure_message = decode_to_unicode(str(ex))
            self.finalize_process(self.spider)
            raise Return(None)

        raise Return(response)
    def __init__(self, url, parent, base_site, base_domain, domains_to_skip):
        self.url = decode_to_unicode(url) if url is not None else decode_to_unicode('')
        self.encoded_url = decode_to_unicode(self.url)
        self.base_domain = base_domain
        self.response_code = -1
        self.errors = []
        self.links = set()
        self.visited = False
        self.parent = parent
        self.base_site = base_site
        self.content_type = decode_to_unicode("text/html")
        self.domains_to_skip = domains_to_skip
        self.redirect_location = decode_to_unicode('')
        self.hardcoded_urls = set()
        self.failure_message = decode_to_unicode('')
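        # Route Tornado's asynchronous fetches through the libcurl-based client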
        AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
    def add_sitemap_urls(self, parent_page):
        logger.debug("Adding sitemap urls as well for processing")
        http_client = HTTPClient()
        try:
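            # Fetch the sitemap synchronously and parse its XML into an element tree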
            response = http_client.fetch(self.sitemap_url)
            val = bytes(response.body)
            root = objectify.fromstring(val)

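            # Queue every sitemap <loc> URL that has not been seen yet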
            for url_element in root.url:
                page = _get_client_page(decode_to_unicode(url_element.loc.text), parent_page, self.base_site,
                                        self.base_domain, DOMAINS_TO_BE_SKIPPED)
                if page not in self.visited_urls and page not in self.non_visited_urls \
                        and page not in self.intermediate_urls:
                    print(u"Added {}".format(url_element.loc))
                    self.non_visited_urls.add(page)
                    self.added_count += 1
                    self.page_queue.put(page)

        except Exception as e:
            logger.error(u"Error adding sitemap urls from %s : %s" % (self.sitemap_url, e))
        finally:
            http_client.close()

if __name__ == "__main__":

    # url ='http://appdynamics.com/blog/2010/09/01/application-virtualization-survey'
    #
    # link_info = extract(url)
    # parsed_link = u"{}.{}.{}".format(link_info.subdomain, link_info.domain, link_info.suffix)
    #
    # for skipped_domain in DOMAINS_TO_BE_SKIPPED:
    # if parsed_link == skipped_domain:
    # pass
    # pass

    args = process_parameters()
    base_url = decode_to_unicode(args.url)
    sitemap_url = decode_to_unicode(args.sitemap_url)
    enable_js_tests = args.testjs
    process_existing_urls = args.process_file
    url_list_file = decode_to_unicode(args.url_file)

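    # In file mode, just run the JS/resource checks over the listed URLs and exit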
    if process_existing_urls:
        if not url_list_file:
            print("Missing file containing  url list, please provide one with --url-file parameter")
            sys.exit(1)
        detect_js_and_resource_issues(url_list_file)
        sys.exit(0)

    scraper = TornadoSpider(base_url, sitemap_url)
    future = scraper.initiate_crawl()
    IOLoop.instance().start()