def zeronet_bootstrap() -> None:
    """Bootstrap wrapper for ZeroNet.

    Runs :func:`~darc.proxy.zeronet._zeronet_bootstrap` until one attempt
    completes without raising, for at most
    :data:`~darc.proxy.zeronet.ZERONET_RETRY` ``+ 1`` attempts in total.
    The call is a no-op when :data:`~darc.proxy.zeronet._ZERONET_BS_FLAG`
    is already set, so the proxy is never bootstrapped twice.

    Warns:
        ZeroNetBootstrapFailed: Emitted once for every attempt that fails.

    Note:
        Exceptions raised by an attempt are logged and swallowed here; when
        every attempt fails the function still returns normally.

    See Also:
        * :func:`darc.proxy.zeronet._zeronet_bootstrap`
        * :data:`darc.proxy.zeronet.ZERONET_RETRY`
        * :data:`darc.proxy.zeronet._ZERONET_BS_FLAG`

    """
    # already bootstrapped -- nothing to do
    if _ZERONET_BS_FLAG:
        return

    logger.info('-*- ZeroNet Bootstrap -*-')
    for _attempt in range(ZERONET_RETRY+1):
        try:
            _zeronet_bootstrap()
        except Exception:
            # full traceback only in debug mode; the warning is always emitted
            if DEBUG:
                logger.ptb('[Error bootstraping ZeroNet proxy]')
            logger.pexc(LOG_WARNING, category=ZeroNetBootstrapFailed, line='zeronet_bootstrap()')
        else:
            # first successful attempt ends the retry loop
            break
    logger.pline(LOG_INFO, logger.horizon)
def have_hostname(link: 'Link') -> 'Tuple[bool, bool]':
    """Check if current link is a new host.

    Dispatches to the database backend when ``FLAG_DB`` is set, otherwise
    to the Redis backend.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values representing if such link is a
        known host and needs force refetch respectively. When the database
        lookup raises, a warning is logged and ``(False, False)`` is
        returned.

    See Also:
        * :func:`darc.db._have_hostname_db`
        * :func:`darc.db._have_hostname_redis`

    """
    # Redis backend: no connection context or error wrapping needed here
    if not FLAG_DB:
        return _have_hostname_redis(link)

    with database.connection_context():
        try:
            answer = _have_hostname_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_have_hostname_db({link.url})')
            answer = (False, False)
    return answer
def _db_operation(operation: 'Callable[..., _T]', *args: 'Any', **kwargs: 'Any') -> '_T':
    """Retry operation on database.

    The ``operation`` is called repeatedly until it returns without raising
    :exc:`peewee.PeeweeException`. Each failure is logged as a
    ``DatabaseOperaionFailed`` warning; between retries the function sleeps
    for ``RETRY_INTERVAL`` second(s) if such value is **NOT** :data:`None`.
    Any other exception type propagates to the caller unchanged.

    Args:
        operation: Callable / method to perform.
        *args: Arbitrary positional arguments.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments.

    Returns:
        Any return value from a successful ``operation`` call.

    """
    _arg_msg = None

    while True:
        try:
            return operation(*args, **kwargs)
        except peewee.PeeweeException:
            # the argument description is only built once, on first failure
            if _arg_msg is None:
                _arg_msg = _gen_arg_msg(*args, **kwargs)

            # ``operation`` may be any callable, not only a bound method:
            # plain functions have no ``__self__`` and e.g. partial objects
            # have no ``__name__``; never let the logging itself raise and
            # abort the retry loop.
            name = getattr(operation, '__name__', None) or repr(operation)
            owner = getattr(operation, '__self__', None)
            if owner is not None:
                name = f'{owner.__class__.__name__}.{name}'
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'{name}({_arg_msg})')

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
def tor_bootstrap() -> None:
    """Bootstrap wrapper for Tor.

    Runs :func:`~darc.proxy.tor._tor_bootstrap` until one attempt completes
    without raising, for at most :data:`~darc.proxy.tor.TOR_RETRY` ``+ 1``
    attempts in total. The call is a no-op when
    :data:`~darc.proxy.tor._TOR_BS_FLAG` is already set, so the proxy is
    never bootstrapped twice.

    Warns:
        TorBootstrapFailed: Emitted once for every attempt that fails.

    Note:
        Exceptions raised by an attempt are logged and swallowed here; when
        every attempt fails the function still returns normally.

    See Also:
        * :func:`darc.proxy.tor._tor_bootstrap`
        * :data:`darc.proxy.tor.TOR_RETRY`
        * :data:`darc.proxy.tor._TOR_BS_FLAG`

    """
    # already bootstrapped -- nothing to do
    if _TOR_BS_FLAG:
        return

    logger.info('-*- Tor Bootstrap -*-')
    for _attempt in range(TOR_RETRY + 1):
        try:
            _tor_bootstrap()
        except Exception:
            # full traceback only in debug mode; the warning is always emitted
            if DEBUG:
                logger.ptb('[Error bootstraping Tor proxy]')
            logger.pexc(LOG_WARNING, category=TorBootstrapFailed, line='tor_bootstrap()')
        else:
            # first successful attempt ends the retry loop
            break
    logger.pline(LOG_INFO, logger.horizon)
def _check_ng(temp_list: 'List[darc_link.Link]') -> 'List[darc_link.Link]':
    """Check content type of links through ``HEAD`` requests.

    Links whose host or proxy type is matched by
    :func:`~darc.parse.match_host` / :func:`~darc.parse.match_proxy` are
    skipped up front. For the remaining ones a ``HEAD`` request is issued
    through a futures session (one session per proxy type), and the
    responses are examined as they complete.

    Args:
        temp_list: List of links to be checked.

    Returns:
        List of links matches the requirements: links whose content type is
        not matched by :func:`~darc.parse.match_mime`, plus links whose
        request failed but still carried a response. Requests that failed
        without any response are dropped.

    See Also:
        * :func:`darc.parse.match_host`
        * :func:`darc.parse.match_proxy`
        * :func:`darc.parse.match_mime`

    """
    from darc.crawl import request_session  # pylint: disable=import-outside-toplevel

    session_map = {}  # type: Dict[str, FuturesSession]
    result_list = []
    for link in temp_list:
        if match_host(link.host):
            continue
        if match_proxy(link.proxy):
            continue

        # get session (created lazily, shared by all links of one proxy type)
        session = session_map.get(link.proxy)
        if session is None:
            session = request_session(link, futures=True)
            session_map[link.proxy] = session

        result = session.head(link.url, allow_redirects=True)
        result_list.append(result)

        logger.info('[HEAD] Checking content type from %s', link.url)

    link_list = []
    for result in concurrent.futures.as_completed(result_list):  # type: ignore
        try:
            response = result.result()  # type: Response
        except requests.RequestException as error:
            if error.response is None:
                logger.pexc(message='[HEAD] Checking failed')
                continue
            logger.pexc(message=f'[HEAD] Failed on {error.response.url}')
            # keep the link, but as a parsed link object like every other
            # entry of the returned list (not as a bare URL string)
            link_list.append(parse_link(error.response.url))
            continue

        ct_type = get_content_type(response)

        logger.info('[HEAD] Checked content type from %s (%s)', response.url, ct_type)

        if match_mime(ct_type):
            continue
        temp_link = parse_link(response.request.url)  # type: ignore
        link_list.append(temp_link)
    return link_list
def fetch_hosts(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch ``hosts.txt``.

    The content is read from the cached file reported by ``have_hosts``
    unless ``force`` is set or no cached file exists; in that case
    ``/hosts.txt`` is requested through an I2P session and stored with
    ``save_hosts``. The links read from the content (``read_hosts``) are
    then pushed to the :mod:`requests` queue.

    The function returns early, queueing nothing, when the request raises
    :exc:`requests.RequestException`, when the response is not OK, or when
    its content type is neither ``text/text`` nor ``text/plain``.

    Args:
        link: Link object to fetch for its ``hosts.txt``.
        force: Force refetch ``hosts.txt``.

    """
    if force:
        logger.warning('[HOSTS] Force refetch %s', link.url)
        cached_path = None
    else:
        cached_path = have_hosts(link)

    if cached_path is None:
        from darc.requests import i2p_session  # pylint: disable=import-outside-toplevel

        hosts_link = parse_link(urljoin(link.url, '/hosts.txt'), backref=link)
        logger.info('[HOSTS] Subscribing %s', hosts_link.url)

        with i2p_session() as session:
            try:
                response = session.get(hosts_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[HOSTS] Failed on {hosts_link.url}')
                return

        if not response.ok:
            logger.error('[HOSTS] Failed on %s [%d]', hosts_link.url, response.status_code)
            return

        content_type = get_content_type(response)
        if content_type not in ['text/text', 'text/plain']:
            logger.error('[HOSTS] Unresolved content type on %s (%s)', hosts_link.url, content_type)
            return

        hosts_text = response.text
        save_hosts(hosts_link, hosts_text)

        logger.info('[HOSTS] Subscribed %s', hosts_link.url)
    else:
        logger.warning('[HOSTS] Cached %s', link.url)  # pylint: disable=no-member
        with open(cached_path) as cached_file:
            hosts_text = cached_file.read()

    from darc.db import save_requests  # pylint: disable=import-outside-toplevel

    # add link to queue
    save_requests(read_hosts(link, hosts_text))
def renew_tor_session() -> None:
    """Renew Tor session.

    Sends the ``NEWNYM`` signal through the Tor controller stored in the
    module-level ``_TOR_CTRL``. When no controller is stored yet, one is
    created from the ``TOR_CTRL`` port and authenticated with ``TOR_PASS``
    first.

    Warns:
        TorRenewFailed: If creating, authenticating or signalling the
            controller raises; the exception is logged and swallowed.

    """
    global _TOR_CTRL  # pylint: disable=global-statement

    try:
        # Tor controller process
        controller = _TOR_CTRL
        if controller is None:
            controller = stem.control.Controller.from_port(port=int(TOR_CTRL))
            try:
                controller.authenticate(TOR_PASS)
            except Exception:
                # never cache a controller that failed to authenticate --
                # later calls would skip authentication and keep failing
                controller.close()
                raise
            _TOR_CTRL = controller
        controller.signal(stem.Signal.NEWNYM)  # pylint: disable=no-member
    except Exception:
        logger.pexc(
            LOG_WARNING,
            category=TorRenewFailed,
            line=
            '_TOR_CTRL = stem.control.Controller.from_port(port=int(TOR_CTRL))'
        )
def save_requests(entries: 'Union[Link, List[Link]]', single: bool = False,
                  score: 'Optional[float]' = None, nx: bool = False, xx: bool = False) -> None:
    """Save link to the :mod:`requests` database.

    The function updates the ``queue_requests`` database, through the
    database backend when ``FLAG_DB`` is set and through Redis otherwise.
    A failure of the database backend is logged as a warning and
    swallowed.

    Args:
        entries: Links to be added to the :mod:`requests` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` set as :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Score to for the Redis sorted set.
        nx: Only create new elements and not to
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    See Also:
        * :func:`darc.db._save_requests_db`
        * :func:`darc.db._save_requests_redis`

    """
    # Redis backend: delegate directly
    if not FLAG_DB:
        return _save_requests_redis(entries, single, score, nx, xx)

    with database.connection_context():
        try:
            outcome = _save_requests_db(entries, single, score, nx, xx)  # type: ignore[call-overload]
        except Exception:
            _arg_msg = _gen_arg_msg(entries, single, score, nx, xx)
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_save_requests_db({_arg_msg})')
            outcome = None
    return outcome
def process_crawler() -> None:
    """A worker to run the :func:`~darc.crawl.crawler` process.

    Each round loads a pool of links, crawls every link in it, then calls
    every registered hook with ``'crawler'`` and the pool. The loop stops
    after a round when a hook raised ``WorkerBreak`` or when ``REBOOT`` is
    set; otherwise the Tor session is renewed and the next round starts.
    An empty pool skips the round (after sleeping ``DARC_WAIT`` second(s)
    if such value is **NOT** :data:`None`).

    Warns:
        HookExecutionFailed: When hook function raises an error.

    """
    logger.info('[CRAWLER] Starting mainloop...')
    logger.debug('[CRAWLER] Starting first round...')

    # start mainloop
    while True:
        # requests crawler
        pool = load_requests()
        if not pool:
            if DARC_WAIT is not None:
                time.sleep(DARC_WAIT)
            continue

        for entry in pool:
            crawler(entry)

        # every hook gets to run, even after one of them asked to stop
        stop_requested = False
        for callback in _HOOK_REGISTRY:
            try:
                callback('crawler', pool)
            except WorkerBreak:
                stop_requested = True
            except Exception:
                logger.pexc(LOG_WARNING, '[CRAWLER] hook execution failed', HookExecutionFailed)

        # marked to break by hook function / quit in reboot mode
        if stop_requested or REBOOT:
            break

        # renew Tor session
        renew_tor_session()
        logger.debug('[CRAWLER] Starting next round...')

    logger.info('[CRAWLER] Stopping mainloop...')
def _load_selenium_redis() -> 'List[Link]':
    """Load link from the :mod:`selenium` database.

    The function reads the ``queue_selenium`` database while holding the
    ``queue_selenium`` Redis lock. Only entries scored no later than the
    current time minus ``TIME_CACHE`` (when set) are considered; when
    ``TIME_CACHE`` is set, the loaded links are saved back with a score of
    the current time plus ``TIME_CACHE``.

    Returns:
        List of loaded links from the :mod:`selenium` database; an empty
        list if the lock handling raised a Pottery error.

    Warns:
        LockWarning: If a Pottery error is raised while working under the
            Redis lock.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    """
    now = time.time()
    sec_delta = 0 if TIME_CACHE is None else TIME_CACHE.total_seconds()  # type: float
    max_score = now - sec_delta

    try:
        with _redis_get_lock('queue_selenium'):
            names = _redis_command('zrangebyscore', 'queue_selenium',
                                   min=0, max=max_score, start=0, num=MAX_POOL)
            raw_items = [_redis_command('get', name) for name in names]  # type: List[bytes]
            # entries are stored pickled; falsy lookups are skipped
            link_pool = [pickle.loads(item) for item in raw_items if item]  # nosec: B301
            if TIME_CACHE is not None:
                new_score = now + sec_delta
                _save_selenium_redis(link_pool, score=new_score)  # force update records
    except pottery_exceptions.PotteryError:
        logger.pexc(
            LOG_WARNING,
            f'[SELENIUM] Failed to acquire Redis lock after {LOCK_TIMEOUT} second(s)',
            LockWarning, "_redis_get_lock('queue_selenium')")
        link_pool = []
    return link_pool
def crawler(timestamp: 'datetime', session: 'Session',
            link: 'darc_link.Link') -> 'NoReturn':  # pylint: disable=unused-argument
    """Crawler hook for data URIs.

    The data URI is stored through ``save_data``; a :exc:`ValueError`
    raised while doing so is logged and swallowed. The hook never returns
    a response: it always ends by raising ``LinkNoReturn``.

    Args:
        timestamp: Timestamp of the worker node reference.
        session (:class:`requests.Session`): Session object with proxy settings.
        link: Link object to be crawled.

    Raises:
        LinkNoReturn: This link has no return response.

    """
    try:
        save_data(link)
    except ValueError:
        logger.pexc(
            message=f'[REQUESTS] Failed to save data URI from {link.url}')

    # there is no response object to hand back for a data URI
    raise LinkNoReturn(link)
def _redis_command(command: str, *args: 'Any', **kwargs: 'Any') -> 'Any':
    """Wrapper function for Redis command.

    The command is looked up by name on the ``redis`` client and called
    repeatedly until it returns without raising a Redis or Pottery error.

    Args:
        command: Command name.
        *args: Arbitrary arguments for the Redis command.

    Keyword Args:
        **kwargs: Arbitrary keyword arguments for the Redis command.

    Return:
        Values returned from the Redis command.

    Warns:
        RedisCommandFailed: Warns at each round when the command failed.

    See Also:
        Between each retry, the function sleeps for
        :data:`~darc.db.RETRY_INTERVAL` second(s) if such
        value is **NOT** :data:`None`.

    """
    handler = getattr(redis, command)
    described_args = None  # built lazily, on the first failure only

    while True:
        try:
            return handler(*args, **kwargs)
        except (redis_lib.exceptions.RedisError, pottery_exceptions.PotteryError):
            if described_args is None:
                described_args = _gen_arg_msg(*args, **kwargs)
            logger.pexc(LOG_WARNING, category=RedisCommandFailed,
                        line=f'value = redis.{command}({described_args})')

            if RETRY_INTERVAL is not None:
                time.sleep(RETRY_INTERVAL)
def drop_selenium(link: 'Link') -> None:  # pylint: disable=inconsistent-return-statements
    """Remove link from the :mod:`selenium` database.

    Uses the database backend when ``FLAG_DB`` is set and the Redis
    backend otherwise. A failure of the database backend is logged as a
    warning and swallowed.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_selenium_db`
        * :func:`darc.db._drop_selenium_redis`

    """
    # Redis backend: delegate directly
    if not FLAG_DB:
        return _drop_selenium_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_selenium_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_selenium_db({link.url})')
            outcome = None
    return outcome
def drop_hostname(link: 'Link') -> None:
    """Remove link from the hostname database.

    Uses the database backend when ``FLAG_DB`` is set and the Redis
    backend otherwise. A failure of the database backend is logged as a
    warning and swallowed.

    Args:
        link: Link to be removed.

    See Also:
        * :func:`darc.db._drop_hostname_db`
        * :func:`darc.db._drop_hostname_redis`

    """
    # Redis backend: delegate directly
    if not FLAG_DB:
        return _drop_hostname_redis(link)

    with database.connection_context():
        try:
            outcome = _drop_hostname_db(link)
        except Exception:
            logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                        line=f'_drop_hostname_db({link.url})')
            outcome = None
    return outcome
def freenet_bootstrap() -> None:
    """Bootstrap wrapper for Freenet.

    Runs :func:`~darc.proxy.freenet._freenet_bootstrap` until one attempt
    completes without raising, for at most
    :data:`~darc.proxy.freenet.FREENET_RETRY` ``+ 1`` attempts in total.
    The call is a no-op when :data:`~darc.proxy.freenet._FREENET_BS_FLAG`
    is already set, so the proxy is never bootstrapped twice.

    Warns:
        FreenetBootstrapFailed: Emitted once for every attempt that fails.

    Raises:
        :exc:`UnsupportedPlatform`: If the module-level ``_unsupported``
            flag is set; this is checked before anything else.

    Note:
        Exceptions raised by an attempt are logged and swallowed here; when
        every attempt fails the function still returns normally.

    See Also:
        * :func:`darc.proxy.freenet._freenet_bootstrap`
        * :data:`darc.proxy.freenet.FREENET_RETRY`
        * :data:`darc.proxy.freenet._FREENET_BS_FLAG`

    """
    if _unsupported:
        raise UnsupportedPlatform(f'unsupported system: {platform.system()}')

    # already bootstrapped -- nothing to do
    if _FREENET_BS_FLAG:
        return

    logger.info('-*- Freenet Bootstrap -*-')
    for _attempt in range(FREENET_RETRY + 1):
        try:
            _freenet_bootstrap()
        except Exception:
            # full traceback only in debug mode; the warning is always emitted
            if DEBUG:
                logger.ptb('[Error bootstraping Freenet proxy]')
            logger.pexc(LOG_WARNING, category=FreenetBootstrapFailed, line='freenet_bootstrap()')
        else:
            # first successful attempt ends the retry loop
            break
    logger.pline(LOG_INFO, logger.horizon)
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Load link from the :mod:`requests` database.

    Links are loaded from the database backend when ``FLAG_DB`` is set
    (an empty pool is used if that backend raises) and from Redis
    otherwise. The resulting pool is logged at verbose level.

    Args:
        check: If perform checks on loaded links,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        pool = _load_requests_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_requests_db()
            except Exception:
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='_load_requests_db()')
                pool = []

    if check:
        pool = _check(pool)

    urls = sorted(entry.url for entry in pool)
    logger.plog(LOG_VERBOSE, '-*- [REQUESTS] LINK POOL -*-', object=urls)
    return pool
def main(argv: 'Optional[List[str]]' = None) -> int:
    """Entrypoint.

    Writes the process ID to ``PATH_ID``, makes sure the database tables
    exist (retrying until creation succeeds), seeds the :mod:`requests`
    queue with the links given on the command line and in the link files,
    initialises the link file ``PATH_LN`` when missing and finally runs
    the worker process of the requested type.

    Args:
        argv: Optional command line arguments.

    Returns:
        Exit code (always ``0``; an error escaping from the worker process
        is printed as a traceback instead of being propagated).

    """
    parser = get_parser()
    args = parser.parse_args(argv)

    pid = os.getpid()
    with open(PATH_ID, 'w') as pid_file:
        print(pid, file=pid_file)

    # wait for Redis
    if _WAIT_REDIS and not FLAG_DB:
        _redis_command('set', 'darc', pid)

    if FLAG_DB:
        while True:
            try:
                with DB:
                    _db_operation(DB.create_tables, [
                        HostnameQueueModel,
                        RequestsQueueModel,
                        SeleniumQueueModel,
                    ])
            except Exception:
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='DB.create_tables([HostnameQueueModel, ...]')
                continue
            break

    if SAVE_DB:
        while True:
            try:
                with DB_WEB:
                    _db_operation(DB_WEB.create_tables, [
                        HostnameModel,
                        URLModel,
                        URLThroughModel,
                        RobotsModel,
                        SitemapModel,
                        HostsModel,
                        RequestsModel,
                        RequestsHistoryModel,
                        SeleniumModel,
                    ])
            except Exception:
                # report the database that actually failed (DB_WEB, not DB)
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed,
                            line='DB_WEB.create_tables([HostnameModel, ...]')
                continue
            break

    logger.debug('-*- Initialisation -*-')
    if DEBUG and not FLAG_DB:
        # nuke the db
        _redis_command('delete', 'queue_hostname')
        _redis_command('delete', 'queue_requests')
        _redis_command('delete', 'queue_selenium')

    link_list = []

    # links from the command line (blank entries are ignored)
    for link in filter(None, (item.strip() for item in args.link)):
        logger.pline(LOG_DEBUG, link)
        link_list.append(link)

    # links from files: one per line, skipping blank lines and ``#`` comments
    if args.file is not None:
        for path in args.file:
            with open(path) as link_file:
                for line in filter(None, (item.strip() for item in link_file)):
                    if line.startswith('#'):
                        continue
                    logger.pline(LOG_DEBUG, line)
                    link_list.append(line)

    # write to database
    link_pool = [parse_link(link, backref=None) for link in link_list]
    save_requests(link_pool, score=0, nx=True)
    logger.pline(LOG_DEBUG, logger.horizon)

    # init link file
    if not os.path.isfile(PATH_LN):
        with open(PATH_LN, 'w') as header_file:
            print('proxy,scheme,host,hash,link', file=header_file)

    try:
        process(args.type)
    except BaseException:
        traceback.print_exc()
    _exit()

    return 0
def crawler(link: 'darc_link.Link') -> None:
    """Single :mod:`requests` crawler for an entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function first checks if the link needs crawling at all: links
    whose proxy type or hostname is ignored (c.f.
    :func:`~darc.parse.match_proxy` and :func:`~darc.parse.match_host`)
    are removed from the :mod:`requests` database through
    :func:`~darc.db.drop_requests`.

    If the URL scheme is supported by :mod:`requests` and the host is new
    (or needs a forced refetch), :mod:`darc` fetches the sitemaps of the
    host (c.f. ``fetch_sitemap``; skipped for ``zeronet`` and ``freenet``
    links) and, for ``i2p`` links, its ``hosts.txt`` (c.f. ``fetch_hosts``),
    then calls :func:`~darc.submit.submit_new_host`.

    .. note::

       A host is new if the first value returned by
       :func:`~darc.db.have_hostname` is :data:`False`.

       If fetching such documents raised, the host is removed from the
       hostname database through :func:`~darc.db.drop_hostname` and the
       submission is marked as partial.

    If :data:`~darc.const.FORCE` is :data:`False` and ``check_robots``
    disallows the link, crawling stops here.

    At this point, :mod:`darc` calls the customised hook function from
    :mod:`darc.sites` to crawl and get the final response object, then
    saves the header information using :func:`~darc.save.save_headers`.

    .. note::

       If :exc:`requests.exceptions.InvalidSchema` is raised, the link
       will be saved by :func:`~darc.proxy.null.save_invalid` and removed
       from the :mod:`requests` database.

       If any other :exc:`requests.RequestException` is raised, the link
       is saved back to the :mod:`requests` database.

       If :exc:`~darc.error.LinkNoReturn` is raised, the link is removed
       from the :mod:`requests` database when the ``drop`` attribute of
       the exception is true.

    For a response that is not HTML (``text/html`` and
    ``application/xhtml+xml``), the document is submitted through
    :func:`~darc.submit.submit_requests` unless its content type is
    ignored (c.f. :func:`~darc.parse.match_mime`), in which case the link
    is dropped; plain text responses of ``i2p`` links are additionally
    read as ``hosts.txt``.

    For an HTML response, the document is submitted, its links are
    extracted (c.f. :func:`~darc.parse.extract_links`) and saved to the
    :mod:`requests` database. If the response is not OK, or its body is
    empty, the URL is saved back to the :mod:`requests` database;
    otherwise it is saved into the :mod:`selenium` database (c.f.
    :func:`~darc.db.save_selenium`).

    Any unexpected exception marks the host and URL records as not alive
    (when ``SAVE_DB`` is set) and saves the link back to the
    :mod:`requests` database.

    """
    logger.info('[REQUESTS] Requesting %s', link.url)
    try:
        if match_proxy(link.proxy):
            logger.warning('[REQUESTS] Ignored proxy type from %s (%s)', link.url, link.proxy)
            drop_requests(link)
            return

        if match_host(link.host):
            # report the hostname that triggered the match, not the proxy type
            logger.warning('[REQUESTS] Ignored hostname from %s (%s)', link.url, link.host)
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # get the session object in advance
        session = request_session(link)

        # check whether schema supported by :mod:`requests`
        try:
            session.get_adapter(link.url)  # test for adapter
            requests_supported = True
        except requests.exceptions.InvalidSchema:
            requests_supported = False

        # if need to test for new host
        if requests_supported:
            # if it's a new host
            flag_have, force_fetch = have_hostname(link)
            if not flag_have or force_fetch:
                partial = False

                if link.proxy not in ('zeronet', 'freenet'):
                    # fetch sitemap.xml
                    try:
                        fetch_sitemap(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error fetching sitemap of %s]', link.url)
                        partial = True

                if link.proxy == 'i2p':
                    # fetch hosts.txt
                    try:
                        fetch_hosts(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error subscribing hosts from %s]', link.url)
                        partial = True

                # submit data / drop hostname from db
                if partial:
                    drop_hostname(link)
                submit_new_host(timestamp, link, partial=partial, force=force_fetch)

            if not FORCE and not check_robots(link):
                logger.warning('[REQUESTS] Robots disallowed link from %s', link.url)
                return

        # reuse the session object
        with session:
            try:
                # requests session hook
                response = crawler_hook(timestamp, session, link)
            except requests.exceptions.InvalidSchema:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_requests(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING, f'[REQUESTS] Removing from database: {link.url}')
                if error.drop:
                    drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                logger.warning('[REQUESTS] Generic content type from %s (%s)', link.url, ct_type)

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in ['text/plain', 'text/text']:
                    text = response.text
                    save_requests(read_hosts(link, text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp, link, response, session, data,
                                mime_type=ct_type, html=False)
                return

            html = response.content
            if not html:
                logger.error('[REQUESTS] Empty response from %s', link.url)
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp, link, response, session, html,
                            mime_type=ct_type, html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                logger.error('[REQUESTS] Failed on %s [%d]', link.url, response.status_code)
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        if SAVE_DB:
            # best effort only: bookkeeping failures must not mask the error
            with contextlib.suppress(Exception):
                host = HostnameModel.get_or_none(
                    HostnameModel.hostname == link.host)  # type: Optional[HostnameModel]
                if host is not None:
                    host.alive = False
                    host.save()

            with contextlib.suppress(Exception):
                url = URLModel.get_or_none(
                    URLModel.hash == link.name)  # type: Optional[URLModel]
                if url is not None:
                    # mark the URL record as dead, mirroring the hostname
                    # record above (was assigned to ``alias`` by mistake)
                    url.alive = False
                    url.save()

        logger.ptb('[Error from %s]', link.url)
        save_requests(link, single=True)
    logger.info('[REQUESTS] Requested %s', link.url)
def fetch_sitemap(link: 'darc_link.Link', force: bool = False) -> None:
    """Fetch sitemap.

    The function will first fetch the ``robots.txt`` (or read it from the
    cached file reported by ``have_robots``), then fetch the sitemaps it
    announces (again preferring cached files reported by
    ``have_sitemap``). Sitemaps referenced by other sitemaps are processed
    as well; every sitemap URL is processed at most once, so sitemap
    indexes referencing each other cannot make the function loop forever.
    The links read from each sitemap are saved to the :mod:`requests`
    database.

    A ``robots.txt`` that cannot be used (response not OK, or content type
    other than ``text/text`` / ``text/plain``) is treated as empty. If
    requesting ``robots.txt`` raises :exc:`requests.RequestException`, the
    function returns without processing any sitemap. A sitemap that cannot
    be fetched or has an unresolved content type is skipped.

    Args:
        link: Link object to fetch for its sitemaps.
        force: Force refetch its sitemaps.

    See Also:
        * :func:`darc.proxy.null.read_robots`
        * :func:`darc.proxy.null.read_sitemap`
        * :func:`darc.parse.get_sitemap`

    """
    if force:
        logger.warning('[ROBOTS] Force refetch %s', link.url)

    robots_path = None if force else have_robots(link)
    if robots_path is not None:
        logger.warning('[ROBOTS] Cached %s', link.url)
        with open(robots_path) as file:
            robots_text = file.read()
    else:
        robots_link = parse_link(urljoin(link.url, '/robots.txt'), backref=link)
        logger.info('[ROBOTS] Checking %s', robots_link.url)

        with request_session(robots_link) as session:
            try:
                response = session.get(robots_link.url)
            except requests.RequestException:
                logger.pexc(message=f'[ROBOTS] Failed on {robots_link.url}')
                return

        if response.ok:
            ct_type = get_content_type(response)
            if ct_type not in ['text/text', 'text/plain']:
                logger.error('[ROBOTS] Unresolved content type on %s (%s)', robots_link.url, ct_type)
                robots_text = ''
            else:
                robots_text = response.text
            save_robots(robots_link, robots_text)
            logger.info('[ROBOTS] Checked %s', robots_link.url)
        else:
            logger.error('[ROBOTS] Failed on %s [%d]', robots_link.url, response.status_code)
            robots_text = ''

    if force:
        logger.warning('[SITEMAP] Force refetch %s', link.url)

    # explicit work list: sitemaps discovered while processing are appended
    # to ``pending``; ``visited`` guards against duplicated / cyclic entries
    pending = list(read_robots(link, robots_text, host=link.host))
    visited = set()
    position = 0
    while position < len(pending):
        sitemap_link = pending[position]
        position += 1

        if sitemap_link.url in visited:
            continue
        visited.add(sitemap_link.url)

        sitemap_path = None if force else have_sitemap(sitemap_link)
        if sitemap_path is not None:
            logger.warning('[SITEMAP] Cached %s', sitemap_link.url)
            with open(sitemap_path) as file:
                sitemap_text = file.read()
        else:
            logger.info('[SITEMAP] Fetching %s', sitemap_link.url)

            with request_session(sitemap_link) as session:
                try:
                    response = session.get(sitemap_link.url)
                except requests.RequestException:
                    logger.pexc(message=f'[SITEMAP] Failed on {sitemap_link.url}')
                    continue

            if not response.ok:
                logger.error('[SITEMAP] Failed on %s [%d]', sitemap_link.url, response.status_code)
                continue

            # check content type
            ct_type = get_content_type(response)
            if ct_type == 'application/gzip':
                # NOTE(review): gzip-decoded sitemaps are not passed to
                # ``save_sitemap`` -- confirm this is intended
                try:
                    sitemap_text = gzip.decompress(response.content).decode()
                except UnicodeDecodeError:
                    sitemap_text = response.text
            elif ct_type in ['text/xml', 'text/html']:
                sitemap_text = response.text
                save_sitemap(sitemap_link, sitemap_text)
            else:
                logger.error('[SITEMAP] Unresolved content type on %s (%s)', sitemap_link.url, ct_type)
                continue

            logger.info('[SITEMAP] Fetched %s', sitemap_link.url)

        # get more sitemaps
        pending.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host))

        # add link to queue
        save_requests(read_sitemap(link, sitemap_text))
def loader(link: 'darc_link.Link') -> None:
    """Single :mod:`selenium` loader for an entry link.

    Args:
        link: URL to be crawled by :mod:`selenium`.

    The function requests a web driver for the link and calls the
    customised hook function from :mod:`darc.sites` to load the page and
    return the web driver object to work with.

    .. note::

       If the hook raises an HTTP error from ``urllib3`` or a
       ``WebDriverException``, the link is saved back to the
       :mod:`selenium` database.

       If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
       removed from the :mod:`selenium` database through
       :func:`~darc.db.drop_selenium` when the ``drop`` attribute of the
       exception is true.

    If the page source equals :data:`~darc.const.SE_EMPTY`, the link is
    saved back to the :mod:`selenium` database. Otherwise a full-page
    screenshot is attempted.

    .. note::

       When taking the screenshot, the page height is read from
       :javascript:`document.body.scrollHeight`, raised to at least
       **1,000 pixels**, and the window is resized to **1,024 pixels** in
       width and **110%** of that height (rounded up) before the
       screenshot is taken. A failure here is logged and the document is
       processed without a screenshot.

    Finally, :func:`~darc.submit.submit_selenium` is called with the
    document just loaded, and :func:`~darc.parse.extract_links` is used to
    extract the links of the HTML document, which are saved into the
    :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    Any unexpected exception is logged and the link is saved back to the
    :mod:`selenium` database.

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    logger.info('[SELENIUM] Loading %s', link.url)
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as browser:
            try:
                # selenium driver hook
                browser = loader_hook(timestamp, browser, link)
            except (urllib3_exceptions.HTTPError, selenium_exceptions.WebDriverException):
                logger.pexc(message=f'[SELENIUM] Fail to load {link.url}')
                save_selenium(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING, f'[SELENIUM] Removing from database: {link.url}')
                if error.drop:
                    drop_selenium(link)
                return

            # get HTML source
            html = browser.page_source

            if html == SE_EMPTY:
                logger.error('[SELENIUM] Empty page from %s', link.url)
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                page_height = browser.execute_script('return document.body.scrollHeight')

                # resize window (with some magic numbers)
                window_height = math.ceil(max(page_height, 1000) * 1.1)
                browser.set_window_size(1024, window_height)

                # take a full page screenshot
                screenshot = browser.get_screenshot_as_base64()
            except Exception:
                logger.pexc(
                    message=
                    f'[SELENIUM] Fail to save screenshot from {link.url}')

        # submit data
        submit_selenium(timestamp, link, html, screenshot)

        # add link to queue
        save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        logger.ptb('[Error from %s]', link.url)
        save_selenium(link, single=True)
    logger.info('[SELENIUM] Loaded %s', link.url)