def read_robots(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]': """Read ``robots.txt`` to fetch link to sitemaps. Args: link: Original link to ``robots.txt``. text: Content of ``robots.txt``. host: Hostname of the URL to ``robots.txt``, the value may not be same as in ``link``. Returns: List of link to sitemaps. Note: If the link to sitemap is not specified in ``robots.txt`` [*]_, the fallback link ``/sitemap.xml`` will be used. .. [*] https://www.sitemaps.org/protocol.html#submit_robots """ rp = RobotFileParser() with io.StringIO(text) as file: rp.parse(file) sitemaps = rp.site_maps() if sitemaps is None: return [parse_link(urljoin(link.url, '/sitemap.xml'), backref=link)] return [parse_link(urljoin(link.url, sitemap), host=host, backref=link) for sitemap in sitemaps]
def read_sitemap(link: 'darc_link.Link', text: str, check: bool = CHECK) -> 'List[darc_link.Link]': """Read sitemap. Args: link: Original link to the sitemap. text: Content of the sitemap. check: If perform checks on extracted links, default to :data:`~darc.const.CHECK`. Returns: List of links extracted. See Also: * :func:`darc.parse._check` * :func:`darc.parse._check_ng` """ soup = bs4.BeautifulSoup(text, 'html5lib') # https://www.sitemaps.org/protocol.html temp_list = [parse_link(urljoin(link.url, loc.text), host=link.host, backref=link) for loc in soup.select('urlset > url > loc')] # check content / proxy type if check: return _check(temp_list) return temp_list
def get_sitemap(link: 'darc_link.Link', text: str, host: 'Optional[str]' = None) -> 'List[darc_link.Link]': """Fetch link to other sitemaps from a sitemap. Args: link: Original link to the sitemap. text: Content of the sitemap. host: Hostname of the URL to the sitemap, the value may not be same as in ``link``. Returns: List of link to sitemaps. Note: As specified in the sitemap protocol, it may contain links to other sitemaps. [*]_ .. [*] https://www.sitemaps.org/protocol.html#index """ sitemaps = [] soup = bs4.BeautifulSoup(text, 'html5lib') # https://www.sitemaps.org/protocol.html#index for loc in soup.select('sitemapindex > sitemap > loc'): sitemaps.append(urljoin(link.url, loc.text)) return [parse_link(sitemap, host=host, backref=link) for sitemap in sitemaps]
def fetch_sitemap(link: 'darc_link.Link', force: bool = False) -> None: """Fetch sitemap. The function will first fetch the ``robots.txt``, then fetch the sitemaps accordingly. Args: link: Link object to fetch for its sitemaps. force: Force refetch its sitemaps. Returns: Contents of ``robots.txt`` and sitemaps. See Also: * :func:`darc.proxy.null.read_robots` * :func:`darc.proxy.null.read_sitemap` * :func:`darc.parse.get_sitemap` """ if force: logger.warning('[ROBOTS] Force refetch %s', link.url) robots_path = None if force else have_robots(link) if robots_path is not None: logger.warning('[ROBOTS] Cached %s', link.url) with open(robots_path) as file: robots_text = file.read() else: robots_link = parse_link(urljoin(link.url, '/robots.txt'), backref=link) logger.info('[ROBOTS] Checking %s', robots_link.url) with request_session(robots_link) as session: try: response = session.get(robots_link.url) except requests.RequestException: logger.pexc(message=f'[ROBOTS] Failed on {robots_link.url}') return if response.ok: ct_type = get_content_type(response) if ct_type not in ['text/text', 'text/plain']: logger.error('[ROBOTS] Unresolved content type on %s (%s)', robots_link.url, ct_type) robots_text = '' else: robots_text = response.text save_robots(robots_link, robots_text) logger.info('[ROBOTS] Checked %s', robots_link.url) else: logger.error('[ROBOTS] Failed on %s [%d]', robots_link.url, response.status_code) robots_text = '' if force: logger.warning('[SITEMAP] Force refetch %s', link.url) sitemaps = read_robots(link, robots_text, host=link.host) for sitemap_link in sitemaps: sitemap_path = None if force else have_sitemap(sitemap_link) if sitemap_path is not None: logger.warning('[SITEMAP] Cached %s', sitemap_link.url) with open(sitemap_path) as file: sitemap_text = file.read() else: logger.info('[SITEMAP] Fetching %s', sitemap_link.url) with request_session(sitemap_link) as session: try: response = session.get(sitemap_link.url) except requests.RequestException: logger.pexc(message=f'[SITEMAP] Failed on {sitemap_link.url}') continue if not response.ok: logger.error('[SITEMAP] Failed on %s [%d]', sitemap_link.url, response.status_code) continue # check content type ct_type = get_content_type(response) if ct_type == 'application/gzip': try: sitemap_text = gzip.decompress(response.content).decode() except UnicodeDecodeError: sitemap_text = response.text elif ct_type in ['text/xml', 'text/html']: sitemap_text = response.text save_sitemap(sitemap_link, sitemap_text) else: logger.error('[SITEMAP] Unresolved content type on %s (%s)', sitemap_link.url, ct_type) continue logger.info('[SITEMAP] Fetched %s', sitemap_link.url) # get more sitemaps sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host)) # add link to queue save_requests(read_sitemap(link, sitemap_text))
def fetch_sitemap(link: Link, force: bool = False) -> None: """Fetch sitemap. The function will first fetch the ``robots.txt``, then fetch the sitemaps accordingly. Args: link: Link object to fetch for its sitemaps. force: Force refetch its sitemaps. Returns: Contents of ``robots.txt`` and sitemaps. See Also: * :func:`darc.proxy.null.read_robots` * :func:`darc.proxy.null.read_sitemap` * :func:`darc.parse.get_sitemap` """ if force: print(stem.util.term.format(f'[ROBOTS] Force refetch {link.url}', stem.util.term.Color.YELLOW)) # pylint: disable=no-member robots_path = None if force else have_robots(link) if robots_path is not None: print(stem.util.term.format(f'[ROBOTS] Cached {link.url}', stem.util.term.Color.YELLOW)) # pylint: disable=no-member with open(robots_path) as file: robots_text = file.read() else: robots_link = parse_link(urljoin(link.url, '/robots.txt')) print(f'[ROBOTS] Checking {robots_link.url}') with request_session(robots_link) as session: try: response = session.get(robots_link.url) except requests.RequestException as error: print(render_error(f'[ROBOTS] Failed on {robots_link.url} <{error}>', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member return if response.ok: ct_type = get_content_type(response) if ct_type not in ['text/text', 'text/plain']: print(render_error(f'[ROBOTS] Unresolved content type on {robots_link.url} ({ct_type})', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member robots_text = '' else: robots_text = response.text save_robots(robots_link, robots_text) print(f'[ROBOTS] Checked {robots_link.url}') else: print(render_error(f'[ROBOTS] Failed on {robots_link.url} [{response.status_code}]', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member robots_text = '' if force: print(stem.util.term.format(f'[SITEMAP] Force refetch {link.url}', stem.util.term.Color.YELLOW)) # pylint: disable=no-member sitemaps = read_robots(link, robots_text, host=link.host) for sitemap_link in sitemaps: sitemap_path = None if force else have_sitemap(sitemap_link) if sitemap_path is not None: print(stem.util.term.format(f'[SITEMAP] Cached {sitemap_link.url}', stem.util.term.Color.YELLOW)) # pylint: disable=no-member with open(sitemap_path) as file: sitemap_text = file.read() else: print(f'[SITEMAP] Fetching {sitemap_link.url}') with request_session(sitemap_link) as session: try: response = session.get(sitemap_link.url) except requests.RequestException as error: print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} <{error}>', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member continue if not response.ok: print(render_error(f'[SITEMAP] Failed on {sitemap_link.url} [{response.status_code}]', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member continue # check content type ct_type = get_content_type(response) if ct_type == 'application/gzip': try: sitemap_text = gzip.decompress(response.content).decode() except UnicodeDecodeError: sitemap_text = response.text elif ct_type in ['text/xml', 'text/html']: sitemap_text = response.text save_sitemap(sitemap_link, sitemap_text) else: print(render_error(f'[SITEMAP] Unresolved content type on {sitemap_link.url} ({ct_type})', stem.util.term.Color.RED), file=sys.stderr) # pylint: disable=no-member continue print(f'[SITEMAP] Fetched {sitemap_link.url}') # get more sitemaps sitemaps.extend(get_sitemap(sitemap_link, sitemap_text, host=link.host)) # add link to queue save_requests(read_sitemap(link, sitemap_text))