Пример #1
0
def read_hosts(text: str, check: bool = CHECK) -> typing.List[Link]:
    """Extract links from the content of a ``hosts.txt`` file.

    Every line is stripped of surrounding whitespace; blank lines and
    lines starting with ``#`` are ignored. For each remaining line the
    part before the first ``=`` is taken as the host name, and it is kept
    only when it fully matches ``I2P_REGEX``.

    Args:
        text: Content of ``hosts.txt``.
        check: Whether to pass the extracted links through ``_check``
            before returning, default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted, one ``http://<host>`` link per accepted
        host name.

    """
    # lazily normalise the lines, then drop blanks and comment lines
    stripped_lines = (raw.strip() for raw in text.splitlines())
    candidates = (entry.partition('=')[0] for entry in stripped_lines
                  if entry and not entry.startswith('#'))

    # keep only host names that are a complete match for the I2P pattern
    links = [parse_link(f'http://{candidate}') for candidate in candidates
             if I2P_REGEX.fullmatch(candidate) is not None]

    return _check(links) if check else links
Пример #2
0
def read_sitemap(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract the links listed in a sitemap.

    The sitemap is parsed with the ``html5lib`` parser and every
    ``urlset > url > loc`` element is resolved against ``link.url``.

    Args:
        link: Original link to the sitemap; used as the base URL and
            recorded as ``host`` / ``backref`` of every extracted link.
        text: Content of the sitemap.
        check: Whether to pass the extracted links through ``_check``
            before returning, default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    See Also:
        * :func:`darc.parse._check`
        * :func:`darc.parse._check_ng`

    """
    document = bs4.BeautifulSoup(text, 'html5lib')

    # https://www.sitemaps.org/protocol.html
    links = []
    for loc_tag in document.select('urlset > url > loc'):
        target = urljoin(link.url, loc_tag.text)
        links.append(parse_link(target, host=link.host, backref=link))

    # check content / proxy type
    if not check:
        return links
    return _check(links)
Пример #3
0
def read_hosts(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]':
    """Extract links from the content of a ``hosts.txt`` file.

    Lines are stripped first; empty lines and ``#`` comment lines are
    skipped. The text before the first ``=`` on a line is treated as the
    host name and accepted only if ``I2P_REGEX`` matches it in full.

    Args:
        link: Link object to fetch for its ``hosts.txt``; recorded as the
            ``backref`` of every extracted link.
        text: Content of ``hosts.txt``.
        check: Whether to pass the extracted links through ``_check``
            before returning, default to :data:`~darc.const.CHECK`.

    Returns:
        List of links extracted.

    """
    def _host_names():
        # yield the host part of every non-blank, non-comment line
        for raw in text.splitlines():
            entry = raw.strip()
            if entry and not entry.startswith('#'):
                yield entry.partition('=')[0]

    links = [parse_link(f'http://{name}', backref=link)
             for name in _host_names()
             if I2P_REGEX.fullmatch(name) is not None]

    if not check:
        return links
    return _check(links)
Пример #4
0
def load_selenium(check: bool = CHECK) -> typing.List[Link]:
    """Load links from the :mod:`selenium` database.

    When ``FLAG_DB`` is set the pool is read through
    ``_load_selenium_db`` inside a database connection context; any
    exception raised there is reported on ``stderr`` as a formatted
    warning and an empty pool is used instead. Otherwise the pool is read
    through ``_load_selenium_redis``.

    Args:
        check: Whether to pass the loaded links through ``_check``,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`selenium` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_selenium_db`
        * :func:`darc.db._load_selenium_redis`

    """
    if not FLAG_DB:
        pool = _load_selenium_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_selenium_db()
            except Exception as error:
                # best effort: report the failure and carry on with an empty pool
                # NOTE(review): 983 is a hard-coded line number passed to the
                # warning formatter -- confirm it still matches the real location
                message = warnings.formatwarning(str(error), DatabaseOperaionFailed, __file__, 983,
                                                 '_load_selenium_db()')
                print(render_error(message, stem.util.term.Color.YELLOW), end='', file=sys.stderr)  # pylint: disable=no-member
                pool = []

    if check:
        pool = _check(pool)

    if VERBOSE:
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member

        # header, sorted URL listing, then a terminal-wide rule
        print(stem.util.term.format('-*- [SELENIUM] LINK POOL -*-', magenta))
        urls = sorted(entry.url for entry in pool)
        print(render_error(pprint.pformat(urls), magenta))
        rule = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(rule, magenta))
    return pool
Пример #5
0
def load_requests(check: bool = CHECK) -> typing.List[Link]:
    """Load links from the :mod:`requests` database.

    When ``FLAG_DB`` is set the pool is read through
    ``_load_requests_db`` inside a database connection context;
    otherwise it is read through ``_load_requests_redis``. Errors raised
    by either loader are not handled here.

    Args:
        check: Whether to pass the loaded links through ``_check``,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        pool = _load_requests_redis()
    else:
        with database.connection_context():
            pool = _load_requests_db()

    if check:
        pool = _check(pool)

    if VERBOSE:
        magenta = stem.util.term.Color.MAGENTA  # pylint: disable=no-member

        # header, sorted URL listing, then a terminal-wide rule
        print(stem.util.term.format('-*- [REQUESTS] LINK POOL -*-', magenta))
        urls = sorted(entry.url for entry in pool)
        print(render_error(pprint.pformat(urls), magenta))
        rule = '-' * shutil.get_terminal_size().columns
        print(stem.util.term.format(rule, magenta))
    return pool
Пример #6
0
def load_requests(check: bool = CHECK) -> 'List[Link]':
    """Load links from the :mod:`requests` database.

    When ``FLAG_DB`` is set the pool is read through
    ``_load_requests_db`` inside a database connection context; any
    exception raised there is logged through ``logger.pexc`` and an empty
    pool is used instead. Otherwise the pool is read through
    ``_load_requests_redis``. The sorted URLs of the final pool are
    always handed to ``logger.plog``.

    Args:
        check: Whether to pass the loaded links through ``_check``,
            default to :data:`~darc.const.CHECK`.

    Returns:
        List of loaded links from the :mod:`requests` database.

    Note:
        At runtime, the function will load links with maximum number
        at :data:`~darc.db.MAX_POOL` to limit the memory usage.

    See Also:
        * :func:`darc.db._load_requests_db`
        * :func:`darc.db._load_requests_redis`

    """
    if not FLAG_DB:
        pool = _load_requests_redis()
    else:
        with database.connection_context():
            try:
                pool = _load_requests_db()
            except Exception:
                # best effort: log the failure and carry on with an empty pool
                logger.pexc(LOG_WARNING, category=DatabaseOperaionFailed, line='_load_requests_db()')
                pool = []

    if check:
        pool = _check(pool)

    logger.plog(
        LOG_VERBOSE,
        '-*- [REQUESTS] LINK POOL -*-',
        object=sorted(entry.url for entry in pool),
    )
    return pool
Пример #7
0
    def _extract_links(cls, link: Link, html: typing.Union[str, bytes],
                       check: bool = CHECK) -> typing.List[Link]:
        """Extract links from an HTML document.

        Delegates the raw extraction to ``cls.extract_links`` and turns
        every item it returns into a link object with ``parse_link``.

        Args:
            link: Original link of the HTML document.
            html: Content of the HTML document.
            check: Whether to pass the extracted links through ``_check``
                before returning, default to :data:`~darc.const.CHECK`.

        Returns:
            List of extracted links.

        """
        raw_links = cls.extract_links(link, html)

        parsed = []
        for raw in raw_links:
            parsed.append(parse_link(raw))

        # check content / proxy type
        return _check(parsed) if check else parsed