def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    Every URL found in the XML tree is fetched (only the first bytes are
    read); failures are reported on ``lint_ctx`` as errors, successes as
    info messages.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome 53.0.2785.143, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Fetch the start of ``url`` and report validity on ``lint_ctx``."""
        is_valid = True
        if user_agent:
            # Some documentation hosts reject clients without a browser-like
            # User-Agent header.
            req = Request(url, headers={"User-Agent": user_agent})
        else:
            req = url
        try:
            # Use the response as a context manager so the connection is
            # closed instead of leaked (the original never closed it).
            with urlopen(req) as handle:
                # Reading a small prefix is enough to prove the URL resolves.
                handle.read(100)
        except HTTPError as e:
            if e.code == 429:
                # too many requests -- not the URL's fault, do not flag it
                pass
            else:
                is_valid = False
                lint_ctx.error("HTTP Error %s accessing %s" % (e.code, url))
        except URLError as e:
            is_valid = False
            lint_ctx.error("URL Error %s accessing %s" % (str(e), url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    for url in docs:
        # Documentation links are fetched with a browser User-Agent.
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)
def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    Each URL found in the XML tree is fetched; failures are reported on
    ``lint_ctx`` as errors, successes as info messages.
    """
    urls = find_urls_for_xml(root)

    def validate_url(url, lint_ctx):
        """Fetch the start of ``url`` and report validity on ``lint_ctx``."""
        handle = None
        try:
            handle = urllib2.urlopen(url)
            # Reading a small prefix is enough to prove the URL resolves.
            handle.read(100)
            lint_ctx.info("URL OK %s" % url)
        except urllib2.HTTPError as e:
            lint_ctx.error("HTTP Error %s accessing %s" % (e.code, url))
        except urllib2.URLError as e:
            lint_ctx.error("URL Error %s accessing %s" % (str(e), url))
        finally:
            # Always release the connection; the original leaked it on both
            # the success path and on a failed read().
            if handle is not None:
                handle.close()

    for url in urls:
        validate_url(url, lint_ctx)
def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    HTTP(S) URLs are fetched with ``requests`` (streaming, only the first
    bytes are read); other schemes fall back to ``urlopen``. Failures are
    reported on ``lint_ctx`` as errors, successes as info messages.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome on macOS, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Fetch the start of ``url`` and report validity on ``lint_ctx``."""
        is_valid = True
        if url.startswith('http://') or url.startswith('https://'):
            if user_agent:
                # Some documentation hosts reject clients without a
                # browser-like User-Agent header.
                headers = {"User-Agent": user_agent, 'Accept': '*/*'}
            else:
                headers = None
            r = None
            try:
                r = requests.get(url, headers=headers, stream=True)
                r.raise_for_status()
                # Reading a small prefix is enough to prove the URL resolves.
                next(r.iter_content(1000))
            except Exception as e:
                if r is not None and r.status_code == 429:
                    # too many requests -- not the URL's fault, do not flag it
                    pass
                elif r is not None and r.status_code == 403 and 'cloudflare' in r.text:
                    # CloudFlare protection block
                    # BUGFIX: this must be `elif`, not a separate `if`.
                    # With two independent `if` statements, a 429 response
                    # `pass`ed the first check but then fell into the second
                    # one's `else` branch and was reported as an error anyway.
                    pass
                else:
                    is_valid = False
                    lint_ctx.error("Error '%s' accessing %s" % (e, url))
        else:
            try:
                with urlopen(url) as handle:
                    handle.read(100)
            except Exception as e:
                is_valid = False
                lint_ctx.error("Error '%s' accessing %s" % (e, url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    for url in docs:
        # Documentation links are fetched with a browser User-Agent.
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)
def lint_urls(root, lint_ctx):
    """Find referenced URLs and verify they are valid.

    HTTP(S) URLs are fetched with ``requests`` (streaming, only the first
    bytes are read); other schemes fall back to ``urlopen``. Failures are
    reported on ``lint_ctx`` as errors, successes as info messages.
    """
    urls, docs = find_urls_for_xml(root)

    # This is from Google Chrome 53.0.2785.143, current at time of writing:
    BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"

    def validate_url(url, lint_ctx, user_agent=None):
        """Fetch the start of ``url`` and report validity on ``lint_ctx``."""
        is_valid = True
        if url.startswith('http://') or url.startswith('https://'):
            if user_agent:
                # Some documentation hosts reject clients without a
                # browser-like User-Agent header.
                headers = {"User-Agent": user_agent, 'Accept': '*/*'}
            else:
                headers = None
            r = None
            try:
                r = requests.get(url, headers=headers, stream=True)
                r.raise_for_status()
                # Reading a small prefix is enough to prove the URL resolves.
                next(r.iter_content(1000))
            except Exception as e:
                # BUGFIX: must test `r is not None`, not truthiness.
                # requests.Response.__bool__ is False for any non-2xx/3xx
                # status, so with `if r and ...` a 429 response made `r`
                # falsy and the intended "skip 429" branch never triggered.
                if r is not None and r.status_code == 429:
                    # too many requests -- not the URL's fault, do not flag it
                    pass
                else:
                    is_valid = False
                    lint_ctx.error("Error '%s' accessing %s" % (e, url))
        else:
            try:
                # Context manager closes the connection instead of leaking it.
                with urlopen(url) as handle:
                    handle.read(100)
            except Exception as e:
                is_valid = False
                lint_ctx.error("Error '%s' accessing %s" % (e, url))
        if is_valid:
            lint_ctx.info("URL OK %s" % url)

    for url in urls:
        validate_url(url, lint_ctx)
    for url in docs:
        # Documentation links are fetched with a browser User-Agent.
        validate_url(url, lint_ctx, BROWSER_USER_AGENT)