Example 1
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [
         url_to_lru_clean(
             "http%s://%s" %
             (https, u.replace('http://', '').replace('https://', '')))
         for u in to_list(args['discover_prefixes']) for https in ['', 's']
     ]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args[
         'phantom'] and args['phantom'].lower() != "false"
     if self.phantom:
         self.ph_timeout = int(
             args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(
             args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(
             args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
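
The constructor above merges caller-supplied keyword arguments over DEFAULT_INPUT, so any omitted option falls back to a default before the individual attributes are parsed. A minimal sketch of that merge with made-up defaults (the real DEFAULT_INPUT and its keys live outside the excerpt, so their contents here are assumptions):

    # Hypothetical defaults and override merge, mirroring the __init__ above.
    DEFAULT_INPUT = {
        'start_urls': '',
        'maxdepth': '1',
        'follow_prefixes': '',
        'nofollow_prefixes': '',
        'discover_prefixes': '',
        'user_agent': 'Mozilla/5.0 (compatible; hypothetical-crawler)',
    }

    kw = {'start_urls': 'http://example.com/', 'maxdepth': '2', 'phantom': 'false'}
    args = DEFAULT_INPUT.copy()
    args.update(kw)            # caller-supplied kwargs override the defaults
    # args now holds every knob the spider reads, e.g. args['maxdepth'] == '2';
    # a "phantom" value of "false" (any case) keeps PhantomJS disabled.
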
Example 2
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         links = self.link_extractor.extract_links(response)
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
         except ValueError as e:
             self.log("Error converting URL to LRU: %s" % e, log.ERROR)
             continue
         lrulinks.append(lrulink)
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
Example 3
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
         elif redir_url.startswith('./') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url)
         except ValueError as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lru, lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
Example 4
 def _new_page(self, url, lru=None):
     if lru is None:
         lru = url_to_lru_clean(url)
     p = Page()
     p['url'] = url
     p['lru'] = lru
     p['timestamp'] = int(time.time() * 1000)  # epoch time in milliseconds
     return p
Example 5
 def _new_page(self, url, lru=None):
     if lru is None:
         lru = url_to_lru_clean(url)
     p = Page()
     p['url'] = url
     p['lru'] = lru
     p['timestamp'] = int(time.time()*1000)  # epoch time in milliseconds
     return p
 def process_item(self, item, spider):
     lrulinks = []
     for url, lru in item["lrulinks"]:
         if self._should_resolve(lru, spider):
             try:
                 rurl = yield self.agent.resolve(url)
                 lru = url_to_lru_clean(rurl)
             except Exception as e:
                 spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
         lrulinks.append(lru)
Example 7
    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy consider them non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)
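
The scrolling/unfolding step above wraps the PhantomJS call in a SIGALRM watchdog so that a hung execute_async_script cannot block the spider indefinitely: the alarm is armed at self.ph_timeout + 30 seconds, slightly longer than the timeout handed to the script itself, so the in-page timeout normally fires first. A minimal, self-contained sketch of that watchdog pattern (the timeout_alarm handler and the SeleniumTimeout exception are defined outside the excerpt, so their shapes here are assumptions; SIGALRM is Unix-only):

    import signal

    class SeleniumTimeout(Exception):
        """Stand-in for the timeout exception raised in the excerpt."""

    def timeout_alarm(signum, frame):
        # Hypothetical handler: convert the alarm signal into an exception
        # so the surrounding try/except can treat it like any other failure.
        raise SeleniumTimeout("hard timeout reached")

    def run_with_watchdog(func, timeout):
        signal.signal(signal.SIGALRM, timeout_alarm)
        signal.alarm(timeout)        # arm the watchdog
        try:
            return func()
        finally:
            signal.alarm(0)          # always disarm, even if func() raised
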
Example 8
    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy consider them non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)
Example 9
 def parse_html(self, response, lru):
     lrulinks = []
     # handle redirects
     realdepth = response.meta['depth']
     if 300 < response.status < 400:
         redir_url = response.headers['Location']
         if redir_url.startswith('/'):
             redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                   redir_url)
         elif redir_url.startswith('../'):
             lrustart = lru[:lru.rfind('|p:')]
             while redir_url.startswith('../'):
                 lrustart = lrustart[:lrustart.rfind('|p:')]
                 redir_url = redir_url[3:]
             redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
         elif redir_url.startswith(
                 './') or not redir_url.startswith('http'):
             redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                   redir_url[1:])
         links = [{'url': redir_url}]
         response.meta['depth'] -= 1
     else:
         try:
             links = self.link_extractor.extract_links(response)
         except Exception as e:
             self.log(
                 "ERROR: links extractor crashed on %s: %s %s" %
                 (response, type(e), e), logging.ERROR)
             links = []
             self.errors += 1
     for link in links:
         try:
             url = link.url
         except AttributeError:
             url = link['url']
         try:
             lrulink = url_to_lru_clean(url, TLDS_TREE)
         except (ValueError, IndexError) as e:
             self.log("Error converting URL %s to LRU: %s" % (url, e),
                      logging.ERROR)
             continue
         lrulinks.append((url, lrulink))
         if self._should_follow(response.meta['depth'], lrulink) and \
                 not url_has_any_extension(url, self.ignored_exts):
             yield self._request(url)
     response.meta['depth'] = realdepth
     yield self._make_html_page(response, lru, lrulinks)
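
Compared with Examples 2 and 3, the version above also resolves "../" redirects by stripping "|p:" path components from the current LRU before rebuilding a URL with lru_to_url. A rough trace of that loop on a made-up LRU (the exact LRU serialization is an assumption, inferred from the "|p:" markers used in the code):

    # Hypothetical LRU for http://example.com/a/b/page.html (trailing '|' assumed)
    lru = "s:http|h:com|h:example|p:a|p:b|p:page.html|"
    redir_url = "../other.html"

    lrustart = lru[:lru.rfind('|p:')]                  # drop the page itself -> ...|p:a|p:b
    while redir_url.startswith('../'):
        lrustart = lrustart[:lrustart.rfind('|p:')]    # climb one directory per "../"
        redir_url = redir_url[3:]
    # lrustart is now "s:http|h:com|h:example|p:a", so lru_to_url(lrustart + '|')
    # should give back roughly http://example.com/a, and the redirect resolves to
    # http://example.com/a/other.html, as in the code above.
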
Example 10
 def process_item(self, item, spider):
     lrulinks = []
     for url, lru in item["lrulinks"]:
         if self._should_resolve(lru, spider):
             if url in spider.resolved_links:
                 lru = spider.resolved_links[url]
             else:
                 try:
                     agent = ResolverAgent(proxy=self.proxy)
                     rurl = yield agent.resolve(url)
                     if rurl == url and has_prefix(lru, spider.discover_prefixes):
                         rurl = yield agent.resolve(url)
                     lru = url_to_lru_clean(rurl)
                     spider.resolved_links[url] = lru
                 except Exception as e:
                     spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
         lrulinks.append(lru)
Example 11
 def process_item(self, item, spider):
     lrulinks = []
     for url, lru in item["lrulinks"]:
         if self._should_resolve(lru, spider):
             if url in spider.resolved_links:
                 lru = spider.resolved_links[url]
             else:
                 try:
                     agent = ResolverAgent(proxy=self.proxy)
                     rurl = yield agent.resolve(url)
                     if rurl == url and has_prefix(lru, spider.discover_prefixes):
                         rurl = yield agent.resolve(url)
                     lru = url_to_lru_clean(rurl, TLDS_TREE)
                     spider.resolved_links[url] = lru
                 except Exception as e:
                     spider.log("Error resolving redirects from URL %s: %s %s" % (url, type(e), e), log.INFO)
         lrulinks.append(lru)
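
The rurl = yield agent.resolve(url) lines in Examples 5, 10 and 11 only work if process_item runs as a Twisted inlineCallbacks generator, so each yielded Deferred is resolved before the loop continues; the decorator and the final return of the item sit outside the excerpts. A minimal sketch of that pattern, with a hypothetical resolver standing in for ResolverAgent:

    from twisted.internet import defer

    class FakeResolverAgent(object):
        """Hypothetical stand-in for ResolverAgent."""
        def resolve(self, url):
            # The real agent would follow redirects over the network;
            # here the URL comes back unchanged, already wrapped in a Deferred.
            return defer.succeed(url)

    @defer.inlineCallbacks
    def resolve_all(urls):
        agent = FakeResolverAgent()
        resolved = []
        for url in urls:
            rurl = yield agent.resolve(url)    # pauses until the Deferred fires
            resolved.append(rurl)
        defer.returnValue(resolved)            # generator-style return (Python 2 compatible)
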
Example 12
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['maxdepth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean(u) for u in to_list(args['discover_prefixes'])]
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
Example 13
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args:
         self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
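
The cookie handling above turns a raw Cookie-style header string into a dict; splitting on the first '=' only (split('=', 1)) keeps values that themselves contain '=' intact. A small worked example with made-up cookies:

    import re

    raw = "sessionid=abc123; consent=ok=1; theme=dark"
    cookies = dict(
        cookie.split('=', 1)
        for cookie in re.split(r'\s*;\s*', raw)
        if '=' in cookie
    )
    # -> {'sessionid': 'abc123', 'consent': 'ok=1', 'theme': 'dark'}
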
Example 14
 def __init__(self, **kw):
     args = DEFAULT_INPUT.copy()
     args.update(kw)
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', '')), TLDS_TREE) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args:
         self.cookies = dict(cookie.split('=', 1) for cookie in re.split(r'\s*;\s*', args['cookies']) if '=' in cookie)
     if self.phantom:
         self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
     dispatcher.connect(self.closed, spider_closed)
     dispatcher.connect(self.crashed, spider_error)
Example 15
 def __init__(self, **kwargs):
     mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
     job = mongo.find_one({"_id": kwargs["job_id"]})
     args = job["crawl_arguments"]
     self.args = args
     self.start_urls = to_list(args['start_urls'])
     self.maxdepth = int(args['max_depth'])
     self.follow_prefixes = to_list(args['follow_prefixes'])
     self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
     self.prefixes_trie = LRUTrie()
     for p in self.follow_prefixes:
         self.prefixes_trie.set_lru(p, True)
     for p in self.nofollow_prefixes:
         self.prefixes_trie.set_lru(p, False)
     self.discover_prefixes = [
         url_to_lru_clean(
             "http%s://%s" %
             (https, u.replace('http://', '').replace('https://', '')),
             TLDS_TREE) for u in to_list(args['discover_prefixes'])
         for https in ['', 's']
     ]
     self.resolved_links = {}
     self.user_agent = args['user_agent']
     self.phantom = 'phantom' in args and args[
         'phantom'] and args['phantom'].lower() != "false"
     self.cookies = None
     if 'cookies' in args and args["cookies"]:
         self.cookies = dict(
             cookie.split('=', 1)
             for cookie in re.split(r'\s*;\s*', args['cookies'])
             if '=' in cookie)
     if self.phantom:
         self.ph_timeout = int(
             args.get('phantom_timeout', PHANTOM['TIMEOUT']))
         self.ph_idle_timeout = int(
             args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
         self.ph_ajax_timeout = int(
             args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
     self.errors = 0
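
Example 15 replaces the flat follow/nofollow prefix lists with an LRUTrie, so the follow decision becomes a longest-prefix lookup over LRU stems. The real trie is defined outside the excerpt; the sketch below is only an illustrative stand-in built on a plain dict, not the actual LRUTrie API, and the match_lru name is invented for the example:

    class PrefixRules(object):
        """Illustrative substitute for LRUTrie: the longest matching prefix wins."""
        def __init__(self):
            self.rules = {}    # LRU prefix -> True (follow) / False (nofollow)

        def set_lru(self, prefix, follow):
            self.rules[prefix] = follow

        def match_lru(self, lru):
            # Pick the most specific (longest) prefix that matches the LRU.
            best = None
            for prefix, follow in self.rules.items():
                if lru.startswith(prefix) and (best is None or len(prefix) > len(best[0])):
                    best = (prefix, follow)
            return best[1] if best is not None else None

    rules = PrefixRules()
    rules.set_lru("s:http|h:com|h:example|", True)
    rules.set_lru("s:http|h:com|h:example|p:private|", False)
    # rules.match_lru("s:http|h:com|h:example|p:private|p:x|") -> False
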
Example 16
 def handle_response(self, response):
     lru = url_to_lru_clean(response.url)
     if 300 < response.status < 400 or isinstance(response, HtmlResponse):
         return self.parse_html(response, lru)
     else:
         return self._make_raw_page(response, lru)
Example 17
 def handle_response(self, response):
     lru = url_to_lru_clean(response.url)
     if 300 < response.status < 400 or isinstance(response, HtmlResponse):
         return self.parse_html(response, lru)
     else:
         return self._make_raw_page(response, lru)