Example #1
def _have_hostname_db(link: 'Link') -> 'Tuple[bool, bool]':
    """Check if current link is a new host.

    The function checks the :class:`~darc.model.tasks.hostname.HostnameQueueModel` table.

    Args:
        link: Link to check against.

    Returns:
        A tuple of two :obj:`bool` values indicating
        whether the link is from a known host and whether it
        needs a forced refetch, respectively.

    """
    timestamp = datetime.now()
    if TIME_CACHE is None:
        threshold = math.inf
    else:
        threshold = (timestamp - TIME_CACHE).timestamp()

    model, created = cast(
        'Tuple[HostnameQueueModel, bool]',
        _db_operation(HostnameQueueModel.get_or_create,
                      hostname=link.host,
                      defaults={
                          'timestamp': timestamp,
                      }))
    if created:
        return False, False
    return True, model.timestamp.timestamp() < threshold
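
A minimal caller sketch showing how the two returned flags might be consumed; the ``have_hostname`` wrapper name and the ``fetch_sitemap(link, force=...)`` call are taken from Example #9 below, while the surrounding logic is purely illustrative:

# illustrative caller; mirrors the pattern used by crawler() in Example #9
flag_have, force_fetch = have_hostname(link)
if not flag_have or force_fetch:
    # brand-new host, or its cached record is older than TIME_CACHE:
    # (re-)fetch robots.txt / sitemaps before crawling the link itself
    fetch_sitemap(link, force=force_fetch)
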
Example #2
def save_data(link: 'darc_link.Link') -> None:
    """Save data URI.

    The function will save data URIs to the data storage
    as defined in :data:`~darc.proxy.data.PATH`.

    Args:
        link: Link object representing the data URI.

    """
    data = datauri.DataURI(link.url)
    ext = mimetypes.guess_extension(data.mimetype) or '.dat'
    ts = datetime.now().isoformat()

    path = os.path.join(PATH, f'{link.name}_{ts}{ext}')
    with open(path, 'wb') as file:
        file.write(data.data)

    with LOCK:  # type: ignore[union-attr]
        with open(PATH_MAP, 'a') as data_file:
            print(json.dumps({
                'src': backref.url if (backref := link.url_backref) is not None else None,  # pylint: disable=used-before-assignment
                'url': path,
            }), file=data_file)
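
Each record is appended to ``PATH_MAP`` as one JSON object per line, so the map file can be read back line by line. A minimal sketch (the helper name ``load_data_map`` is illustrative, not part of darc):

import json

def load_data_map(path_map: str) -> 'List[dict]':
    """Read back the saved data-URI records, one JSON object per line."""
    records = []
    with open(path_map) as map_file:
        for line in map_file:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records
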
Example #3
def _load_selenium_db() -> 'List[Link]':
    """Load link from the :mod:`selenium` database.

    The function reads the :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load at most
        :data:`~darc.db.MAX_POOL` links to limit memory usage.

    """
    now = datetime.now()
    if TIME_CACHE is None:
        sec_delta = timedelta(seconds=0)
        max_score = now
    else:
        sec_delta = TIME_CACHE
        max_score = now - sec_delta

    with database.atomic():
        query = _db_operation(
            SeleniumQueueModel.select(SeleniumQueueModel.link).where(
                SeleniumQueueModel.timestamp <= max_score).order_by(
                    SeleniumQueueModel.timestamp).limit(
                        MAX_POOL).query)  # type: List[SeleniumQueueModel]
        link_pool = [model.link for model in query]

        # force update records
        if TIME_CACHE is not None:
            new_score = (now + sec_delta).timestamp()
            _save_selenium_db(link_pool, score=new_score)
    return link_pool
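
A small sketch of the cut-off arithmetic above, assuming ``TIME_CACHE`` is a :class:`~datetime.timedelta` (the one-hour value is illustrative):

from datetime import datetime, timedelta

TIME_CACHE = timedelta(hours=1)             # illustrative value
now = datetime.now()
max_score = now - TIME_CACHE                # only links last fetched more than an hour ago are selected
new_score = (now + TIME_CACHE).timestamp()  # selected links are re-scored one hour into the future
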
Example #4
def sanitise(
        link: Link,
        time: typing.Optional[typing.Datetime] = None,  # pylint: disable=redefined-outer-name
        raw: bool = False,
        data: bool = False,
        headers: bool = False,
        screenshot: bool = False) -> str:
    """Sanitise link to path.

    Args:
        link: Link object to sanitise the path for.
        time (datetime): Timestamp for the path.
        raw: If this is a raw HTML document from :mod:`requests`.
        data: If this is a generic content type document.
        headers: If this is response headers from :mod:`requests`.
        screenshot: If this is the screenshot from :mod:`selenium`.

    Returns:
        * If ``raw`` is :data:`True`,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>_raw.html``.
        * If ``data`` is :data:`True`,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.dat``.
        * If ``headers`` is :data:`True`,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.json``.
        * If ``screenshot`` is :data:`True`,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.png``.
        * If none above,
          ``<root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.html``.

    See Also:
        * :func:`darc.crawl.crawler`
        * :func:`darc.crawl.loader`

    """
    os.makedirs(link.base, exist_ok=True)

    path = os.path.join(link.base, link.name)
    if time is None:
        time = datetime.now()
    ts = time.isoformat()

    if raw:
        return f'{path}_{ts}_raw.html'
    if headers:
        return f'{path}_{ts}.json'
    if data:
        return f'{path}_{ts}.dat'
    if screenshot:
        return f'{path}_{ts}.png'
    return f'{path}_{ts}.html'
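
Illustrative calls and the path patterns they produce (cf. the Returns section above; the concrete ``<hash>`` and ``<timestamp>`` parts come from ``link.name`` and ``time``):

sanitise(link, raw=True)         # <root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>_raw.html
sanitise(link, headers=True)     # <root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.json
sanitise(link, data=True)        # <root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.dat
sanitise(link, screenshot=True)  # <root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.png
sanitise(link)                   # <root>/<proxy>/<scheme>/<hostname>/<hash>_<timestamp>.html
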
Example #5
def save_data(link: Link):
    """Save data URI.

    The function will save data URIs to the data storage
    as defined in :data:`~darc.proxy.data.PATH`.

    Args:
        link: Link object representing the data URI.

    """
    data = datauri.DataURI(link.url)
    ext = mimetypes.guess_extension(data.mimetype) or '.dat'
    ts = datetime.now().isoformat()

    path = os.path.join(PATH, f'{link.name}_{ts}{ext}')
    with open(path, 'wb') as file:
        file.write(data.data)
Example #6
def crawler(link: Link):
    """Single :mod:`requests` crawler for a entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function will first parse the URL using
    :func:`~darc.link.parse_link`, and check if the URL needs to be
    crawled (c.f. :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if so, the URL is crawled with :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try
    to fetch and save ``robots.txt`` and sitemaps of the host
    (c.f. :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    and extract then save the links from sitemaps (c.f. :func:`~darc.proxy.null.read_sitemap`)
    into link database for future crawling (c.f. :func:`~darc.db.save_requests`).

    .. note::

       A host is new if :func:`~darc.db.have_hostname` returns :data:`False`.

       If :func:`darc.proxy.null.fetch_sitemap` and/or :func:`darc.proxy.i2p.fetch_hosts`
       failed when fetching such documents, the host will be removed from the
       hostname database through :func:`~darc.db.drop_hostname`, and considered
       as new when next encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called and submit the documents just fetched.

    If ``robots.txt`` is present, and :data:`~darc.const.FORCE` is
    :data:`False`, :mod:`darc` will check if allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to crawl and get the final response object.
    :mod:`darc` will save the session cookies and header information,
    using :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called and submit the document
    just fetched.

    If the response document is HTML (``text/html`` and ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will be called then to extract
    all possible links from the HTML document and save such links into
    the database (c.f. :func:`~darc.db.save_requests`).

    And if the response status code is between ``400`` and ``600``,
    the URL will be saved back to the link database
    (c.f. :func:`~darc.db.save_requests`). If **NOT**, the URL will
    be saved into :mod:`selenium` link database to proceed next steps
    (c.f. :func:`~darc.db.save_selenium`).

    """
    print(f'[REQUESTS] Requesting {link.url}')
    try:
        if match_proxy(link.proxy):
            print(render_error(
                f'[REQUESTS] Ignored proxy type from {link.url} ({link.proxy})',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        if match_host(link.host):
            print(render_error(
                f'[REQUESTS] Ignored hostname from {link.url} ({link.proxy})',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # if it's a new host
        if not have_hostname(link):
            partial = False

            if link.proxy not in ('zeronet', 'freenet'):
                # fetch sitemap.xml
                try:
                    fetch_sitemap(link)
                except Exception:
                    error = f'[Error fetching sitemap of {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN),
                          file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            if link.proxy == 'i2p':
                # fetch hosts.txt
                try:
                    fetch_hosts(link)
                except Exception:
                    error = f'[Error subscribing hosts from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
                    print(render_error(error, stem.util.term.Color.CYAN),
                          file=sys.stderr)  # pylint: disable=no-member
                    partial = True

            # submit data / drop hostname from db
            if partial:
                drop_hostname(link)
            submit_new_host(timestamp, link, partial=partial)

        if not FORCE and not check_robots(link):
            print(render_error(
                f'[REQUESTS] Robots disallowed link from {link.url}',
                stem.util.term.Color.YELLOW),
                  file=sys.stderr)  # pylint: disable=no-member
            return

        with request_session(link) as session:
            try:
                # requests session hook
                response = crawler_hook(link, session)
            except requests.exceptions.InvalidSchema as error:
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException as error:
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(
                    f'[REQUESTS] Removing from database: {link.url}',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member
                drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                print(render_error(
                    f'[REQUESTS] Generic content type from {link.url} ({ct_type})',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in [
                        'text/plain', 'text/text'
                ]:
                    text = response.text
                    save_requests(read_hosts(text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp,
                                link,
                                response,
                                session,
                                data,
                                mime_type=ct_type,
                                html=False)

                return

            html = response.content
            if not html:
                print(render_error(
                    f'[REQUESTS] Empty response from {link.url}',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp,
                            link,
                            response,
                            session,
                            html,
                            mime_type=ct_type,
                            html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                print(render_error(
                    f'[REQUESTS] Failed on {link.url} [{response.status_code}]',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_requests(link, single=True)
    print(f'[REQUESTS] Requested {link.url}')
Example #7
def loader(link: Link):
    """Single :mod:`selenium` loader for a entry link.

    Args:
        link: URL to be crawled by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.Chrome` object.

    .. note::

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`selenium` database through
        :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    print(f'[SELENIUM] Loading {link.url}')
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(link, driver)
            except urllib3.exceptions.HTTPError as error:
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except selenium.common.exceptions.WebDriverException as error:
                print(render_error(
                    f'[SELENIUM] Fail to load {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return
            except LinkNoReturn:
                print(render_error(
                    f'[SELENIUM] Removing from database: {link.url}',
                    stem.util.term.Color.YELLOW),
                      file=sys.stderr)  # pylint: disable=no-member
                drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                print(render_error(f'[SELENIUM] Empty page from {link.url}',
                                   stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script(
                    'return document.body.scrollHeight')

                # resize window (with some magic numbers)
                if height < 1000:
                    height = 1000
                driver.set_window_size(1024, math.ceil(height * 1.1))

                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception as error:
                print(render_error(
                    f'[SELENIUM] Fail to save screenshot from {link.url} <{error}>',
                    stem.util.term.Color.RED),
                      file=sys.stderr)  # pylint: disable=no-member

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        error = f'[Error from {link.url}]' + os.linesep + traceback.format_exc() + '-' * shutil.get_terminal_size().columns  # pylint: disable=line-too-long
        print(render_error(error, stem.util.term.Color.CYAN), file=sys.stderr)  # pylint: disable=no-member
        save_selenium(link, single=True)
    print(f'[SELENIUM] Loaded {link.url}')
Example #8
def _save_selenium_db(entries: 'Union[Link, List[Link]]',
                      single: bool = False,
                      score: 'Optional[float]' = None,
                      nx: bool = False,
                      xx: bool = False) -> None:
    """Save link to the :mod:`selenium` database.

    The function updates the :class:`~darc.model.tasks.selenium.SeleniumQueueModel` table.

    Args:
        entries: Links to be added to the :mod:`selenium` database.
            It can be either a :obj:`list` of links, or a single
            link (if ``single`` is set to :data:`True`).
        single: Indicate if ``entries`` is a :obj:`list` of links
            or a single link.
        score: Score for the Redis sorted set.
        nx: Only create new elements and not to
            update scores for elements that already exist.
        xx: Only update scores of elements that
            already exist. New elements will not be added.

    """
    if not entries:
        return None
    if score is None:
        timestamp = datetime.now()
    else:
        timestamp = datetime.fromtimestamp(score)

    if not single:
        if TYPE_CHECKING:
            entries = cast('List[Link]', entries)

        if nx:
            with database.atomic():
                insert_many = [{
                    'text': link.url,
                    'hash': link.name,
                    'link': link,
                    'timestamp': timestamp,
                } for link in entries]
                for batch in peewee.chunked(insert_many, BULK_SIZE):
                    _db_operation(
                        SeleniumQueueModel.insert_many(
                            batch).on_conflict_ignore().execute)
            return None

        if xx:
            entries_text = [link.url for link in entries]
            _db_operation(
                SeleniumQueueModel.update(timestamp=timestamp).where(
                    cast('TextField',
                         SeleniumQueueModel.text).in_(entries_text)).execute)
            return None

        with database.atomic():
            replace_many = [{
                'text': link.url,
                'hash': link.name,
                'link': link,
                'timestamp': timestamp,
            } for link in entries]
            for batch in peewee.chunked(replace_many, BULK_SIZE):
                _db_operation(SeleniumQueueModel.replace_many(batch).execute)
        return None

    if TYPE_CHECKING:
        entries = cast('Link', entries)

    if nx:
        _db_operation(SeleniumQueueModel.get_or_create,
                      text=entries.url,
                      defaults={
                          'hash': entries.name,
                          'link': entries,
                          'timestamp': timestamp,
                      })
        return None

    if xx:
        with contextlib.suppress(peewee.DoesNotExist):
            model = _db_operation(SeleniumQueueModel.get, SeleniumQueueModel.text == entries.url)  # type: SeleniumQueueModel # pylint: disable=line-too-long
            model.timestamp = timestamp
            _db_operation(model.save)
        return None

    _db_operation(
        SeleniumQueueModel.replace(
            text=entries.url,
            hash=entries.name,
            link=entries,
            timestamp=timestamp,
        ).execute)
    return None
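
The ``nx``/``xx`` flags mirror Redis ``ZADD`` semantics (``NX``: only add new members, ``XX``: only update existing ones). Illustrative calls, assuming ``links`` is a :obj:`list` of :class:`Link` objects and ``link`` a single one:

_save_selenium_db(links, nx=True)   # insert new rows only, keep timestamps of existing ones
_save_selenium_db(links, xx=True)   # refresh timestamps of rows that already exist, add nothing
_save_selenium_db(link, single=True, score=datetime.now().timestamp())  # upsert one link with an explicit score
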
Example #9
File: crawl.py  Project: JarryShaw/darc
def crawler(link: 'darc_link.Link') -> None:
    """Single :mod:`requests` crawler for an entry link.

    Args:
        link: URL to be crawled by :mod:`requests`.

    The function will first parse the URL using
    :func:`~darc.link.parse_link`, and check if the URL needs to be
    crawled (c.f. :data:`~darc.const.PROXY_WHITE_LIST`, :data:`~darc.const.PROXY_BLACK_LIST`,
    :data:`~darc.const.LINK_WHITE_LIST` and :data:`~darc.const.LINK_BLACK_LIST`);
    if so, the URL is crawled with :mod:`requests`.

    If the URL is from a brand new host, :mod:`darc` will first try
    to fetch and save ``robots.txt`` and sitemaps of the host
    (c.f. :func:`~darc.proxy.null.save_robots` and :func:`~darc.proxy.null.save_sitemap`),
    and extract then save the links from sitemaps (c.f. :func:`~darc.proxy.null.read_sitemap`)
    into link database for future crawling (c.f. :func:`~darc.db.save_requests`).

    .. note::

       A host is new if the first flag returned by :func:`~darc.db.have_hostname` is :data:`False`.

       If :func:`darc.proxy.null.fetch_sitemap` and/or :func:`darc.proxy.i2p.fetch_hosts`
       failed when fetching such documents, the host will be removed from the
       hostname database through :func:`~darc.db.drop_hostname`, and considered
       as new when next encounter.

    Also, if the submission API is provided, :func:`~darc.submit.submit_new_host`
    will be called and submit the documents just fetched.

    If ``robots.txt`` is present, and :data:`~darc.const.FORCE` is
    :data:`False`, :mod:`darc` will check if allowed to crawl the URL.

    .. note::

        The root path (e.g. ``/`` in https://www.example.com/) will always
        be crawled ignoring ``robots.txt``.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to crawl and get the final response object.
    :mod:`darc` will save the session cookies and header information,
    using :func:`~darc.save.save_headers`.

    .. note::

        If :exc:`requests.exceptions.InvalidSchema` is raised, the link
        will be saved by :func:`~darc.proxy.null.save_invalid`. Further
        processing is dropped, and the link will be removed from the
        :mod:`requests` database through :func:`~darc.db.drop_requests`.

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`requests` database through
        :func:`~darc.db.drop_requests`.

    If the content type of response document is not ignored (c.f.
    :data:`~darc.const.MIME_WHITE_LIST` and :data:`~darc.const.MIME_BLACK_LIST`),
    :func:`~darc.submit.submit_requests` will be called and submit the document
    just fetched.

    If the response document is HTML (``text/html`` and ``application/xhtml+xml``),
    :func:`~darc.parse.extract_links` will be called then to extract
    all possible links from the HTML document and save such links into
    the database (c.f. :func:`~darc.db.save_requests`).

    And if the response status code is between ``400`` and ``600``,
    the URL will be saved back to the link database
    (c.f. :func:`~darc.db.save_requests`). If **NOT**, the URL will
    be saved into :mod:`selenium` link database to proceed next steps
    (c.f. :func:`~darc.db.save_selenium`).

    """
    logger.info('[REQUESTS] Requesting %s', link.url)
    try:
        if match_proxy(link.proxy):
            logger.warning('[REQUESTS] Ignored proxy type from %s (%s)',
                           link.url, link.proxy)
            drop_requests(link)
            return

        if match_host(link.host):
            logger.warning('[REQUESTS] Ignored hostname from %s (%s)',
                           link.url, link.proxy)
            drop_requests(link)
            return

        # timestamp
        timestamp = datetime.now()

        # get the session object in advance
        session = request_session(link)

        # check whether schema supported by :mod:`requests`
        try:
            session.get_adapter(link.url)  # test for adapter
            requests_supported = True
        except requests.exceptions.InvalidSchema:
            requests_supported = False

        # if need to test for new host
        if requests_supported:
            # if it's a new host
            flag_have, force_fetch = have_hostname(link)
            if not flag_have or force_fetch:
                partial = False

                if link.proxy not in ('zeronet', 'freenet'):
                    # fetch sitemap.xml
                    try:
                        fetch_sitemap(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error fetching sitemap of %s]', link.url)
                        partial = True

                if link.proxy == 'i2p':
                    # fetch hosts.txt
                    try:
                        fetch_hosts(link, force=force_fetch)
                    except Exception:
                        logger.ptb('[Error subscribing hosts from %s]',
                                   link.url)
                        partial = True

                # submit data / drop hostname from db
                if partial:
                    drop_hostname(link)
                submit_new_host(timestamp,
                                link,
                                partial=partial,
                                force=force_fetch)

            if not FORCE and not check_robots(link):
                logger.warning('[REQUESTS] Robots disallowed link from %s',
                               link.url)
                return

        # reuse the session object
        with session:
            try:
                # requests session hook
                response = crawler_hook(timestamp, session, link)
            except requests.exceptions.InvalidSchema:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_invalid(link)
                drop_requests(link)
                return
            except requests.RequestException as error:
                logger.pexc(message=f'[REQUESTS] Fail to crawl {link.url}')
                save_requests(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING,
                            f'[REQUESTS] Removing from database: {link.url}')
                if error.drop:
                    drop_requests(link)
                return

            # save headers
            save_headers(timestamp, link, response, session)

            # check content type
            ct_type = get_content_type(response)
            if ct_type not in ['text/html', 'application/xhtml+xml']:
                logger.warning('[REQUESTS] Generic content type from %s (%s)',
                               link.url, ct_type)

                # probably hosts.txt
                if link.proxy == 'i2p' and ct_type in [
                        'text/plain', 'text/text'
                ]:
                    text = response.text
                    save_requests(read_hosts(link, text))

                if match_mime(ct_type):
                    drop_requests(link)
                    return

                # submit data
                data = response.content
                submit_requests(timestamp,
                                link,
                                response,
                                session,
                                data,
                                mime_type=ct_type,
                                html=False)

                return

            html = response.content
            if not html:
                logger.error('[REQUESTS] Empty response from %s', link.url)
                save_requests(link, single=True)
                return

            # submit data
            submit_requests(timestamp,
                            link,
                            response,
                            session,
                            html,
                            mime_type=ct_type,
                            html=True)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)

            if not response.ok:
                logger.error('[REQUESTS] Failed on %s [%d]', link.url,
                             response.status_code)
                save_requests(link, single=True)
                return

            # add link to queue
            save_selenium(link, single=True, score=0, nx=True)
    except Exception:
        if SAVE_DB:
            with contextlib.suppress(Exception):
                host = HostnameModel.get_or_none(
                    HostnameModel.hostname ==
                    link.host)  # type: Optional[HostnameModel]
                if host is not None:
                    host.alive = False
                    host.save()

            with contextlib.suppress(Exception):
                url = URLModel.get_or_none(
                    URLModel.hash == link.name)  # type: Optional[URLModel]
                if url is not None:
                    url.alias = False
                    url.save()

        logger.ptb('[Error from %s]', link.url)
        save_requests(link, single=True)

    logger.info('[REQUESTS] Requested %s', link.url)
Example #10
File: crawl.py  Project: JarryShaw/darc
def loader(link: 'darc_link.Link') -> None:
    """Single :mod:`selenium` loader for an entry link.

    Args:
        link: URL to be crawled by :mod:`selenium`.

    The function will first parse the URL using :func:`~darc.link.parse_link`
    and start loading the URL using :mod:`selenium` with Google Chrome.

    At this point, :mod:`darc` will call the customised hook function
    from :mod:`darc.sites` to load and return the original
    :class:`selenium.webdriver.chrome.webdriver.WebDriver` object.

    .. note::

        If :exc:`~darc.error.LinkNoReturn` is raised, the link will be
        removed from the :mod:`selenium` database through
        :func:`~darc.db.drop_selenium`.

    If successful, the rendered source HTML document will be saved, and a
    full-page screenshot will be taken and saved.

    .. note::

       When taking full-page screenshot, :func:`~darc.crawl.loader` will
       use :javascript:`document.body.scrollHeight` to get the total
       height of web page. If the page height is *less than* **1,000 pixels**,
       then :mod:`darc` will by default set the height as **1,000 pixels**.

       Later :mod:`darc` will tell :mod:`selenium` to resize the window (in
       *headless* mode) to **1,024 pixels** in width and **110%** of the
       page height in height, and take a *PNG* screenshot.

    If the submission API is provided, :func:`~darc.submit.submit_selenium`
    will be called and submit the document just loaded.

    Later, :func:`~darc.parse.extract_links` will be called then to
    extract all possible links from the HTML document and save such
    links into the :mod:`requests` database (c.f. :func:`~darc.db.save_requests`).

    .. seealso::

       * :data:`darc.const.SE_EMPTY`
       * :data:`darc.const.SE_WAIT`

    """
    logger.info('[SELENIUM] Loading %s', link.url)
    try:
        # timestamp
        timestamp = datetime.now()

        # retrieve source from Chrome
        with request_driver(link) as driver:
            try:
                # selenium driver hook
                driver = loader_hook(timestamp, driver, link)
            except urllib3_exceptions.HTTPError:
                logger.pexc(message=f'[SELENIUM] Fail to load {link.url}')
                save_selenium(link, single=True)
                return
            except selenium_exceptions.WebDriverException as error:
                logger.pexc(message=f'[SELENIUM] Fail to load {link.url}')
                save_selenium(link, single=True)
                return
            except LinkNoReturn as error:
                logger.pexc(LOG_WARNING,
                            f'[SELENIUM] Removing from database: {link.url}')
                if error.drop:
                    drop_selenium(link)
                return

            # get HTML source
            html = driver.page_source

            if html == SE_EMPTY:
                logger.error('[SELENIUM] Empty page from %s', link.url)
                save_selenium(link, single=True)
                return

            screenshot = None
            try:
                # get maximum height
                height = driver.execute_script(
                    'return document.body.scrollHeight')

                # resize window (with some magic numbers)
                driver.set_window_size(1024,
                                       math.ceil(max(height, 1000) * 1.1))

                # take a full page screenshot
                screenshot = driver.get_screenshot_as_base64()
            except Exception:
                logger.pexc(
                    message=
                    f'[SELENIUM] Fail to save screenshot from {link.url}')

            # submit data
            submit_selenium(timestamp, link, html, screenshot)

            # add link to queue
            save_requests(extract_links(link, html), score=0, nx=True)
    except Exception:
        logger.ptb('[Error from %s]', link.url)
        save_selenium(link, single=True)

    logger.info('[SELENIUM] Loaded %s', link.url)
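
A worked example of the window-resize rule used above (the reported page height is illustrative):

# illustrative values for the resize rule in loader()
height = 600   # document.body.scrollHeight reported by the page; below the 1,000 px floor
size = (1024, math.ceil(max(height, 1000) * 1.1))   # -> 1024 px wide, 110% of the floored height, rounded up
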