def test_grouped_ua_ends_with_rule(self):
    robots_txt = "User-agent: *\nDisallow: /\nUser-agent: Google\nUser-agent: Bingbot\nAllow: /"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    url = "https://www.example.com/test/is/allowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(url), True, "Should be allowed")


def test_specificity_two(self):
    robots_txt = "User-agent: *\nDisallow: /test/corner/\nAllow: /test/\nDisallow: /test/is/"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    url = "https://www.example.com/test/corner/funpart.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")


def test_ua_override(self):
    robots_txt = "User-agent: *\nAllow: /test*\nUser-agent: Google\nDisallow: /"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    url = "https://www.example.com/test/is/disallowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")


def test_least_restrictive(self):
    robots_txt = "User-agent: *\nDisallow: /test*\nAllow: /test/"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    allowed_url = "https://www.example.com/test/is/allowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(allowed_url), True, "Should be allowed")


def test_broken_robots_txt(self):
    robots_txt = "User-agent: *Allow: /\n\nDisallow: /test/is/disallowed\nUser-agent: Yandex\nDisallow: /*test"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    url = "https://www.example.com/test/is/disallowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(url), True, "Should be allowed")


def test_ua_as_submatch(self):
    robots_txt = "User-agent: *\nAllow: /\nUser-agent: Google\nDisallow: /test/"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    allowed_url = "https://www.example.com/is/allowed.html"
    disallowed_url = "https://www.example.com/test/is/not/allowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(allowed_url), True, "Should be allowed")
    self.assertEqual(robot.is_allowed(disallowed_url), False, "Should be disallowed")


def test_rogue_sitemap_entry(self):
    robots_txt = "User-agent: *\nAllow: /allowed/section\nSitemap: https://www.example.com/sitemap.xml\nDisallow: /disallowed/section\nDisallow: /*section"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    allowed_url = "https://www.example.com/allowed/section"
    disallowed_url = "https://www.example.com/disallowed/section"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(allowed_url), True, "Should be allowed")
    self.assertEqual(robot.is_allowed(disallowed_url), False, "Should be disallowed")
# Imports required by this class. The GFlareRobots import is project-local;
# the exact module path is assumed here and may differ in the package layout.
from functools import wraps
from re import escape, match
from time import time
from urllib.parse import urljoin, urlunparse

from lxml.html import fromstring
from requests import status_codes
from requests.utils import requote_uri
from urllib3.util import parse_url

from greenflare.core.gflarerobots import GFlareRobots


class GFlareResponse:

    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = ''
        self.base_url = ''
        self.host = ''
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))
        self.spider_links = "Spider" in self.settings.get("MODE", "")
        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR', '; ')
        self.xpath_mapping = {
            'href_all': '//a/@href',
            'href_respect_nofollow': '//a[not(contains(@rel, "nofollow"))]/@href',
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang': '/html/head/link[@rel="alternate" and @hreflang]/@href',
            'pagination': '/html/head/link[@rel="next"]/@href|/html/head/link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description': '/html/head/meta[@name="description"]/@content',
            'base_url': '/html/head/base/@href'
        }
        self.xpath_link_extraction = self.get_link_extraction_xpath()
        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))
        self.crawlable_schemes = ('http', 'https', '')

    def timing(f):
        @wraps(f)
        def wrap(*args, **kw):
            ts = time()
            result = f(*args, **kw)
            te = time()
            print(f'func:{f.__name__} took: {te - ts}')
            return result
        return wrap

    def set_response(self, response):
        self.response = response
        self.url = self.response.url
        self.host = self.get_domain(self.url)
        self.spider_links = "Spider" in self.settings.get("MODE", "")
        # Bug fix: get_robots_txt_status is a method and must be called with
        # the current URL; comparing the bound method to 'blocked' was
        # always False.
        if self.get_robots_txt_status(self.url) == 'blocked' and \
                'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', ''):
            self.spider_links = False
        if self.is_robots_txt():
            self.response_to_robots_txt()

    def response_to_robots_txt(self):
        if self.response.status_code == 200:
            self.robots_txt = self.response.text
            self.gfrobots.set_robots_txt(
                self.robots_txt,
                user_agent=self.settings.get("USER_AGENT", ''))
            self.robots_txt_ua = self.gfrobots.get_short_ua(
                self.settings.get("USER_AGENT", ''))

    def get_initial_url(self):
        if len(self.response.history) == 0:
            return str(self.response.url).strip()
        return str(self.response.history[0].url).strip()

    def get_link_extraction_xpath(self):
        xpaths = []
        crawl_items = self.settings['CRAWL_ITEMS']

        if 'respect_nofollow' not in crawl_items:
            xpaths.append(self.xpath_mapping['href_all'])
        else:
            xpaths.append(self.xpath_mapping['href_respect_nofollow'])
        if 'canonical_tag' in crawl_items:
            xpaths.append(self.xpath_mapping['canonical_tag'])
        if 'hreflang' in crawl_items:
            xpaths.append(self.xpath_mapping['hreflang'])
        if 'pagination' in crawl_items:
            xpaths.append(self.xpath_mapping['pagination'])
        if 'images' in crawl_items:
            xpaths.append(self.xpath_mapping['images'])
        if 'stylesheets' in crawl_items:
            xpaths.append(self.xpath_mapping['stylesheets'])
        if 'javascript' in crawl_items:
            xpaths.append(self.xpath_mapping['javascript'])

        return '|'.join(xpaths)

    def get_data(self):
        d = {'url': self.url}
        d['data'] = self.get_header_info()

        if len(self.response.content) > 0:
            self.tree = self.get_tree()
            self.base_url = self.get_base_url()
            if self.spider_links:
                d['links'] = self.extract_links()
            d['data'] = {**d['data'], **self.get_crawl_data()}

        d['data'] = {
            **d['data'],
            **{'crawl_status': self.get_full_status(self.url, d['data'])}
        }
        d['data'] = [self.dict_to_row(d['data'])]

        if self.has_redirected():
            d['data'] += self.get_redirects()

        return d

    def get_tree(self):
        try:
            # We need to use page.content rather than page.text because
            # html.fromstring implicitly expects bytes as input.
            return fromstring(self.response.content)
        except Exception as e:
            print('Error parsing', self.url, 'with lxml')
            print(e)

    def get_domain(self, url):
        try:
            _, _, domain, _, _, _, _ = parse_url(url)
        except Exception:
            return ''
        if not domain:
            return ''
        if domain.startswith('www.'):
            return domain.replace('www.', '')
        return domain

    def get_robots_txt_url(self, url):
        comps = parse_url(url)
        url = requote_uri(
            urlunparse([
                comps.scheme, comps.host, 'robots.txt', None, comps.query,
                comps.fragment
            ]))
        return url

    def is_external(self, url):
        if self.settings.get('MODE') == 'List':
            return False
        domain = self.get_domain(url)
        if not domain:
            return False
        return domain != self.settings.get('ROOT_DOMAIN', '')

    def is_excluded(self, url):
        if self.exclusions_regex:
            return bool(match(self.exclusions_regex, url))
        return False

    def get_base_url(self) -> str:
        extraction = self.extract_xpath(self.xpath_mapping['base_url'])
        if extraction:
            return self.sanitise_url(extraction[0], base_url=self.url)
        return self.url

    def exclusions_to_regex(self, exclusions):
        rules = []
        for exclusion in exclusions:
            operator, value = exclusion
            if operator == 'Equal to (=)':
                value = escape(value)
                rules.append(f"^{value}$")
            elif operator == 'Contain':
                value = escape(value)
                rules.append(f".*{value}.*")
            elif operator == 'Start with':
                value = escape(value)
                rules.append(f"^{value}.*")
            elif operator == 'End with':
                value = escape(value)
                rules.append(f".*{value}$")
            elif operator == 'Regex match':
                rules.append(value)
        return '|'.join(rules)

    def is_robots_txt(self, url=None):
        if not url:
            url = self.url
        if self.is_external(url):
            return False
        return parse_url(url).path == '/robots.txt'

    def get_final_url(self):
        return self.response.url

    def get_text(self):
        return self.response.text

    def get_canonical_http_header(self):
        header = self.response.headers.get("Link", "")
        if "rel=" in header:
            return header.split(";")[0].replace("<", "").replace(">", "")
        return ""

    def get_header_info(self):
        header = {
            'url': self.url,
            'status_code': self.response.status_code,
            'content_type': self.response.headers.get('content-type', ''),
            'robots_txt': self.get_robots_txt_status(self.url),
            'x_robots_tag': self.response.headers.get('x-robots-tag', ''),
            'canonical_http': self.get_canonical_http_header()
        }
        return header

    def valid_url(self, url):
        try:
            cmps = parse_url(url)
            if cmps.scheme and not cmps.host:
                return False
            if not cmps.scheme and cmps.host:
                return False
        except Exception:
            return False
        if cmps.scheme and cmps.scheme not in self.crawlable_schemes:
            return False
        # Filter out external links if needed
        if self.settings.get('MODE') != 'List':
            if "external_links" not in self.settings.get(
                    "CRAWL_ITEMS", "") and self.is_external(url):
                return False
        if self.is_excluded(url):
            return False
        # Do not check and report on on-page links
        if "check_blocked_urls" not in self.settings.get(
                "CRAWL_ITEMS", "") and not self.allowed_by_robots_txt(url):
            return False
        return True

    def sanitise_url(self, url: str, base_url='') -> str:
        """Clean a given input URL and return an RFC compliant URL as a string."""
        if isinstance(url, bytes):
            url = url.decode('utf8')
        else:
            url = str(url)

        # Remove leading whitespace from the URL
        url = url.lstrip()

        if base_url:
            url = urljoin(base_url, url)

        try:
            scheme, auth, host, port, path, query, fragment = parse_url(url)
        except Exception:
            return None

        # Carefully reconstruct the network location
        netloc = auth or ''
        if netloc:
            netloc += '@'
        netloc += host
        if port:
            # Only report on ports if they are used in a non-standard way
            if scheme == 'http' and port == 80:
                pass
            elif scheme == 'https' and port == 443:
                pass
            else:
                netloc += ':' + str(port)

        # Bare domains aren't valid URLs.
        if not path:
            path = '/'

        url = requote_uri(
            urlunparse([scheme, netloc, path, None, query, fragment]))

        # Search engines ignore hash fragments, hence we remove them from URLs
        url = url.split('#')[0]
        return url

    def extract_links(self):
        links = [
            self.sanitise_url(url, base_url=self.base_url)
            for url in self.extract_xpath(self.xpath_link_extraction)
            if self.valid_url(url)
        ]
        return list(set(links))

    def get_txt_by_selector(self, selector, method="css", get="txt"):
        try:
            if method == "css":
                tree_result = self.tree.cssselect(selector)
            elif method == "xpath":
                tree_result = self.tree.xpath(selector)
            else:
                # Bug fix: an unknown method previously left tree_result
                # undefined and fell through to the except clause below.
                tree_result = []
            txt = ""
            if len(tree_result) > 0:
                if get == "href":
                    txt = tree_result[0].attrib['href']
                elif get != "txt":
                    txt = tree_result[0].get(get)
                else:
                    txt = tree_result[0].text_content()
            if txt is None:
                return ""
            return ' '.join(txt.split())
        except Exception:
            print(f"{selector} failed")
            return ""

    def extract_onpage_elements(self):
        d = {}
        if 'h1' in self.all_items:
            d['h1'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h1'])))
        if 'h2' in self.all_items:
            d['h2'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h2'])))
        if 'page_title' in self.all_items:
            d['page_title'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(self.xpath_mapping['page_title'])))
        if 'meta_description' in self.all_items:
            d['meta_description'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(
                        self.xpath_mapping['meta_description'])))
        return d

    def extract_directives(self):
        d = {}
        if 'canonical_tag' in self.all_items:
            canonicals = self.extract_xpath(
                self.xpath_mapping['canonical_tag'])
            if len(canonicals) > 0:
                d['canonical_tag'] = self.sanitise_url(
                    canonicals[0], base_url=self.base_url)
            else:
                d['canonical_tag'] = ''
        if 'canonical_http_header' in self.all_items:
            d['canonical_http_header'] = self.get_canonical_http_header()
        if 'meta_robots' in self.all_items:
            all_fields = self.get_meta_name_fields()
            matching_ua = [
                f for f in all_fields
                if f.lower() in self.robots_txt_ua.lower()
            ]
            rules = []
            if len(matching_ua) > 0:
                ua = matching_ua[0]
                rules = self.extract_xpath(f'//meta[@name="{ua}"]/@content')
            rules += self.extract_xpath('//meta[@name="robots"]/@content')
            d['meta_robots'] = ', '.join(rules)
        return d

    def custom_extractions(self):
        # Note: as written, this returns after the first configured
        # extraction.
        for extraction_name, selector, value in self.settings.get(
                'EXTRACTIONS', []):
            if selector == 'CSS Selector':
                return {
                    extraction_name:
                    self.get_txt_by_selector(value, method='css', get='txt')
                }
            elif selector == 'XPath':
                return {
                    extraction_name:
                    self.extraction_separator.join(
                        self.clean_list(self.extract_xpath(value)))
                }
            else:
                print('WARNING: regex extraction is not implemented yet')
                return {extraction_name: ''}
        return {}

    def get_crawl_data(self):
        return {
            **self.extract_onpage_elements(),
            **self.extract_directives(),
            **self.custom_extractions()
        }

    def is_canonicalised(self, url, canonical):
        if not canonical:
            return False
        if canonical != url:
            return True
        return False

    def get_full_status(self, url, seo_items):
        status = []

        # Evaluate status code
        try:
            code_description = status_codes._codes[
                seo_items['status_code']][0].replace('_', ' ')
        except KeyError:
            code_description = 'non-standard response'
        status.append(code_description)

        # Check against X-Robots-Tag.
        # No checking against user agents is done, as the following setup
        # cannot be evaluated:
        #   X-Robots-Tag: bingbot: noindex
        #   X-Robots-Tag: nofollow, nosnippet
        # response.headers['X-Robots-Tag'] would return a combined result:
        #   'X-Robots-Tag': 'bingbot: noindex, nofun, norisk, nofollow, nosnippet'
        # which CANNOT be deconstructed again. This is actually compliant
        # with RFC 2616:
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2
        if 'noindex' in seo_items.get('x_robots_tag', ''):
            status.append('blocked by x-robots-tag')

        # Check against robots.txt
        if 'blocked' in seo_items.get('robots_txt', ''):
            status.append('blocked by robots.txt')

        # Check against the meta robots tag
        if 'noindex' in seo_items.get('meta_robots', ''):
            status.append('noindex')

        # Canonical tag
        if self.is_canonicalised(url, seo_items.get('canonical_tag', '')):
            status.append('canonicalised')

        # Canonical header
        if self.is_canonicalised(url,
                                 seo_items.get('canonical_http_header', '')):
            status.append('header canonicalised')

        # Avoid 'ok, blocked by robots.txt' and show 'blocked by robots.txt'
        # instead
        if len(status) != 1 and status[0] == 'ok':
            status.pop(0)

        return ', '.join(status)

    def get_meta_name_fields(self):
        fields = []
        try:
            fields = self.tree.xpath('//meta/@name')
        except Exception:
            pass
        return fields

    def dict_to_row(self, data):
        out = tuple(data.get(item, "") for item in self.all_items)
        return out

    def has_redirected(self):
        return len(self.response.history) > 0

    def get_redirects(self):
        data = []
        hist = self.response.history
        for i in range(len(hist)):
            hob_url = self.sanitise_url(hist[i].url)
            if 'external_links' not in self.settings.get('CRAWL_ITEMS', ''):
                if self.is_external(hob_url):
                    break
            robots_status = self.get_robots_txt_status(hob_url)
            if 'respect_robots_txt' in self.settings.get(
                    'CRAWL_ITEMS', ''
            ) and 'follow_blocked_redirects' not in self.settings.get(
                    'CRAWL_ITEMS', '') and robots_status == 'blocked':
                continue
            if i + 1 < len(hist):
                redirect_to_url = self.sanitise_url(str(hist[i + 1].url))
            else:
                redirect_to_url = self.get_final_url()
            hob_data = {
                'url': hob_url,
                'content_type': hist[i].headers.get('Content-Type', ''),
                'status_code': hist[i].status_code,
                'x_robots_tag': hist[i].headers.get('X-Robots-Tag', ''),
                'redirect_url': redirect_to_url,
                'robots_txt': robots_status
            }
            hob_data['crawl_status'] = self.get_full_status(hob_url, hob_data)
            hob_row = self.dict_to_row(hob_data)
            data.append(hob_row)
        return data

    def allowed_by_robots_txt(self, url):
        return self.gfrobots.is_allowed(url)

    def get_robots_txt_status(self, url):
        if self.allowed_by_robots_txt(url):
            return "allowed"
        return "blocked"

    def extract_xpath(self, path):
        try:
            return self.tree.xpath(path)
        except Exception:
            return []

    def clean_list(self, inp):
        try:
            return [' '.join(i.split()) for i in inp if i.strip()]
        except Exception:
            print(f'ERROR: cleaning list {inp} failed!')
            return inp

    def get_hreflang_links(self):
        return self.extract_xpath(self.xpath_mapping['hreflang'])

    def get_canonical_links(self):
        return self.extract_xpath(self.xpath_mapping['canonical_tag'])

    def get_pagination_links(self):
        return self.extract_xpath(self.xpath_mapping['pagination'])
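# Usage sketch (not from the original source): how the GFlareResponse class
# above is driven. The settings dict and columns list below are hypothetical
# examples shaped after the lookups the class performs (MODE, ROOT_DOMAIN,
# USER_AGENT, CRAWL_ITEMS, EXCLUSIONS); requests is assumed to be available
# since set_response() consumes a requests response object.
import requests

settings = {
    'MODE': 'Spider',
    'ROOT_DOMAIN': 'example.com',
    'USER_AGENT': 'Greenflare SEO Spider/1.0',
    'CRAWL_ITEMS': ['canonical_tag', 'meta_robots'],
    'EXCLUSIONS': [],
}
columns = ['url', 'status_code', 'content_type', 'page_title', 'crawl_status']

gf = GFlareResponse(settings, columns)
response = requests.get('https://example.com/',
                        headers={'User-Agent': settings['USER_AGENT']})
gf.set_response(response)

result = gf.get_data()
print(result['data'])           # list of row tuples, ordered like `columns`
print(result.get('links', []))  # discovered links when spidering is enabled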
# Imports required by this second variant of GFlareResponse (URL handling is
# done with urllib.parse here). The GFlareRobots import is project-local; the
# exact module path is assumed and may differ in the package layout.
import urllib.parse
from functools import wraps
from re import escape, match
from time import time

from lxml.html import fromstring
from requests import status_codes

from greenflare.core.gflarerobots import GFlareRobots


class GFlareResponse:

    def __init__(self, settings, columns):
        self.settings = settings
        self.all_items = columns
        self.response = None
        self.url = None
        self.url_components = None
        self.robots_txt_ua = "Googlebot"
        self.gfrobots = GFlareRobots('', self.settings.get("USER_AGENT", ''))
        self.robots_txt_status = None
        self.spider_links = "Spider" in self.settings.get("MODE", "")
        # Note: robots_txt_status was set to None just above, so this
        # condition can never be True at construction time.
        if self.robots_txt_status == "BLOCKED" and \
                'respect_robots_txt' in self.settings.get('CRAWL_ITEMS', ''):
            self.spider_links = False
        self.extraction_separator = self.settings.get('EXTRACTION_SEPARATOR', '; ')
        self.xpath_mapping = {
            'canonical_tag': '/html/head/link[@rel="canonical"]/@href',
            'hreflang': '/html/head/link[@rel="alternate"]/@href',
            'pagination': '/html/head/link[@rel="next"]/@href|//link[@rel="prev"]/@href',
            'images': '//img/@src',
            'stylesheets': '//link[@rel="stylesheet"]/@href',
            'javascript': '//script/@src',
            'h1': '//h1/text()',
            'h2': '//h2/text()',
            'page_title': '/html/head/title/text()',
            'meta_description': '/html/head/meta[@name="description"]/@content'
        }
        self.xpath_link_extraction = self.get_link_extraction_xpath()
        self.exclusions_regex = self.exclusions_to_regex(
            self.settings.get('EXCLUSIONS', []))

    def timing(f):
        @wraps(f)
        def wrap(*args, **kw):
            ts = time()
            result = f(*args, **kw)
            te = time()
            print(f'func:{f.__name__} took: {te - ts}')
            return result
        return wrap

    def set_response(self, response):
        self.response = response
        # requests.get() encodes spaces within the path with %25.
        # If we encode the path beforehand, requests.get() will double encode
        # the path again, resulting in the generation of endless new URLs, so
        # we need to decode the path back to what it was before requests.get()
        # encoded it.
        self.url = self.url_components_to_str(
            self.parse_url(self.unencode_url(self.response.url)))
        if self.is_robots_txt():
            self.response_to_robots_txt()

    def response_to_robots_txt(self):
        if self.response.status_code == 200:
            self.robots_txt = self.response.text
            self.gfrobots.set_robots_txt(
                self.robots_txt,
                user_agent=self.settings.get("USER_AGENT", ''))
            self.robots_txt_ua = self.gfrobots.get_short_ua(
                self.settings.get("USER_AGENT", ''))

    def get_initial_url(self):
        if len(self.response.history) == 0:
            return str(self.response.url).strip()
        return str(self.response.history[0].url).strip()

    def get_link_extraction_xpath(self):
        xpaths = []
        xpaths.append('//a/@href')
        crawl_items = self.settings['CRAWL_ITEMS']
        if 'canonical_tag' in crawl_items:
            xpaths.append(self.xpath_mapping['canonical_tag'])
        if 'hreflang' in crawl_items:
            xpaths.append(self.xpath_mapping['hreflang'])
        if 'pagination' in crawl_items:
            xpaths.append(self.xpath_mapping['pagination'])
        if 'images' in crawl_items:
            xpaths.append(self.xpath_mapping['images'])
        if 'stylesheets' in crawl_items:
            xpaths.append(self.xpath_mapping['stylesheets'])
        if 'javascript' in crawl_items:
            xpaths.append(self.xpath_mapping['javascript'])
        return '|'.join(xpaths)

    # @timing
    def get_data(self):
        self.url_components = urllib.parse.urlsplit(self.url)
        d = {'url': self.url}
        d['data'] = self.get_header_info()

        if len(self.response.content) > 0:
            self.tree = self.get_tree()
            if self.spider_links:
                d['links'] = self.extract_links()
            d['data'] = {**d['data'], **self.get_crawl_data()}

        d['data'] = {
            **d['data'],
            **{'crawl_status': self.get_full_status(self.url, d['data'])}
        }
        d['data'] = [self.dict_to_row(d['data'])]

        if self.has_redirected():
            d['data'] += self.get_redirects()

        return d

    # @timing
    def get_tree(self):
        try:
            # We need to use page.content rather than page.text because
            # html.fromstring implicitly expects bytes as input.
            return fromstring(self.response.content)
        except Exception as e:
            print("Error parsing", self.url, "with lxml")
            print(e)

    def parse_url(self, url):
        try:
            scheme, netloc, path, query, frag = urllib.parse.urlsplit(
                url.strip())
        except Exception:
            print(f'Error parsing {url}')
            return {
                "scheme": '',
                "netloc": '',
                "path": '',
                "query": '',
                "frag": ''
            }
        if not scheme and not netloc:
            # Hack needed as non-RFC tel references are not detected by
            # urlsplit
            if path.startswith("tel:"):
                # Bug fix: str.replace returns a new string; the result was
                # previously discarded.
                path = path.replace("tel:", "")
                scheme = "tel"
            else:
                absolute_url = urllib.parse.urljoin(self.url, url)
                scheme, netloc, path, query, frag = urllib.parse.urlsplit(
                    absolute_url)
        if ':' in netloc:
            if scheme == 'https' and ':443' in netloc:
                netloc = netloc.replace(':443', '')
            elif scheme == 'http' and ':80' in netloc:
                netloc = netloc.replace(':80', '')
        return {
            "scheme": scheme,
            "netloc": netloc,
            "path": path.strip(),
            "query": query,
            "frag": frag
        }

    def url_components_to_str(self, comp):
        url = str(
            urllib.parse.urlunsplit((comp["scheme"], comp["netloc"],
                                     comp["path"], comp["query"], "")))
        if comp['path'] == '':
            url += '/'
        return url

    def unencode_url(self, url):
        parsed = self.parse_url(url)
        parsed["path"] = urllib.parse.unquote(parsed["path"])
        return self.url_components_to_str(parsed)

    def get_domain(self, url):
        domain = self.parse_url(url)["netloc"]
        if "www." in domain:
            return domain.replace("www.", "")
        return domain

    def get_robots_txt_url(self, url):
        comps = self.parse_url(url)
        comps["path"] = "robots.txt"
        return self.url_components_to_str(comps)

    def is_external(self, url):
        if self.settings.get("ROOT_DOMAIN", "") == "":
            return False
        return self.get_domain(url) != self.settings.get("ROOT_DOMAIN", "")

    def is_excluded(self, url):
        if self.exclusions_regex:
            return bool(match(self.exclusions_regex, url))
        return False

    def exclusions_to_regex(self, exclusions):
        rules = []
        for exclusion in exclusions:
            operator, value = exclusion
            if operator == 'Equal to (=)':
                value = escape(value)
                rules.append(f"^{value}$")
            elif operator == 'Contain':
                value = escape(value)
                rules.append(f".*{value}.*")
            elif operator == 'Start with':
                value = escape(value)
                rules.append(f"^{value}.*")
            elif operator == 'End with':
                value = escape(value)
                rules.append(f".*{value}$")
            elif operator == 'Regex match':
                rules.append(value)
        return '|'.join(rules)

    def is_robots_txt(self, url=None):
        if not url:
            url = self.url
        if self.is_external(url):
            return False
        return self.parse_url(url)["path"] == "/robots.txt"

    def get_final_url(self):
        return self.url_components_to_str(self.parse_url(self.response.url))

    def get_text(self):
        return self.response.text

    def get_canonical_http_header(self):
        header = self.response.headers.get("Link", "")
        if "rel=" in header:
            return header.split(";")[0].replace("<", "").replace(">", "")
        return ""

    def get_header_info(self):
        header = {
            'url': self.url,
            'status_code': self.response.status_code,
            'content_type': self.response.headers.get('content-type', ''),
            'robots_txt': self.get_robots_txt_status(self.url),
            'x_robots_tag': self.response.headers.get('x-robots-tag', ''),
            'canonical_http': self.get_canonical_http_header()
        }
        return header

    def valid_url(self, components):
        if "http" not in components['scheme']:
            return False
        url = self.url_components_to_str(components)
        if ' ' in url:
            return False
        # Filter out external links if needed
        if "external_links" not in self.settings.get(
                "CRAWL_ITEMS", "") and self.is_external(url):
            return False
        if self.is_excluded(url):
            return False
        # Do not check and report on on-page links
        if "check_blocked_urls" not in self.settings.get(
                "CRAWL_ITEMS", "") and not self.allowed_by_robots_txt(url):
            return False
        return True

    # @timing
    def extract_links(self):
        parsed_links = [
            self.parse_url(l)
            for l in self.extract_xpath(self.xpath_link_extraction)
        ]
        links = list(
            set([
                self.url_components_to_str(l)
                for l in parsed_links if self.valid_url(l)
            ]))
        return links

    def get_txt_by_selector(self, selector, method="css", get="txt"):
        try:
            if method == "css":
                tree_result = self.tree.cssselect(selector)
            elif method == "xpath":
                tree_result = self.tree.xpath(selector)
            else:
                # Bug fix: an unknown method previously left tree_result
                # undefined and fell through to the except clause below.
                tree_result = []
            txt = ""
            if len(tree_result) > 0:
                if get == "href":
                    txt = tree_result[0].attrib['href']
                elif get != "txt":
                    txt = tree_result[0].get(get)
                else:
                    txt = tree_result[0].text_content()
            if txt is None:
                return ""
            return ' '.join(txt.split())
        except Exception:
            print(f"{selector} failed")
            return ""

    def extract_onpage_elements(self):
        d = {}
        if 'h1' in self.all_items:
            d['h1'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h1'])))
        if 'h2' in self.all_items:
            d['h2'] = self.extraction_separator.join(
                self.clean_list(self.extract_xpath(self.xpath_mapping['h2'])))
        if 'page_title' in self.all_items:
            d['page_title'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(self.xpath_mapping['page_title'])))
        if 'meta_description' in self.all_items:
            d['meta_description'] = self.extraction_separator.join(
                self.clean_list(
                    self.extract_xpath(
                        self.xpath_mapping['meta_description'])))
        return d

    def extract_directives(self):
        d = {}
        if 'canonical_tag' in self.all_items:
            canonicals = self.extract_xpath(
                self.xpath_mapping['canonical_tag'])
            if len(canonicals) > 0:
                d['canonical_tag'] = canonicals[0]
            else:
                d['canonical_tag'] = ''
        if 'canonical_http_header' in self.all_items:
            d['canonical_http_header'] = self.get_canonical_http_header()
        if 'meta_robots' in self.all_items:
            all_fields = self.get_meta_name_fields()
            matching_ua = [
                f for f in all_fields
                if f.lower() in self.robots_txt_ua.lower()
            ]
            rules = []
            if len(matching_ua) > 0:
                ua = matching_ua[0]
                rules = self.extract_xpath(f'//meta[@name="{ua}"]/@content')
            rules += self.extract_xpath('//meta[@name="robots"]/@content')
            d['meta_robots'] = ', '.join(rules)
        return d

    def custom_extractions(self):
        # Note: as written, this returns after the first configured
        # extraction.
        for extraction_name, selector, value in self.settings.get(
                'EXTRACTIONS', []):
            if selector == 'CSS Selector':
                return {
                    extraction_name:
                    self.get_txt_by_selector(value, method='css', get='txt')
                }
            elif selector == 'XPath':
                return {
                    extraction_name:
                    self.extraction_separator.join(
                        self.clean_list(self.extract_xpath(value)))
                }
            else:
                print('WARNING: regex extraction is not implemented yet')
                return {extraction_name: ''}
        return {}

    def get_crawl_data(self):
        return {
            **self.extract_onpage_elements(),
            **self.extract_directives(),
            **self.custom_extractions()
        }

    def is_canonicalised(self, url, canonical):
        if not canonical:
            return False
        if self.url_components_to_str(
                self.parse_url(canonical)) != self.url_components_to_str(
                    self.parse_url(url)):
            return True
        return False

    def get_full_status(self, url, seo_items):
        status = []

        # Evaluate status code
        try:
            code_description = status_codes._codes[
                seo_items['status_code']][0].replace('_', ' ')
        except KeyError:
            code_description = 'non-standard response'
        status.append(code_description)

        # Check against X-Robots-Tag.
        # No checking against user agents is done, as the following setup
        # cannot be evaluated:
        #   X-Robots-Tag: bingbot: noindex
        #   X-Robots-Tag: nofollow, nosnippet
        # response.headers['X-Robots-Tag'] would return a combined result:
        #   'X-Robots-Tag': 'bingbot: noindex, nofun, norisk, nofollow, nosnippet'
        # which CANNOT be deconstructed again. This is actually compliant
        # with RFC 2616:
        # https://www.w3.org/Protocols/rfc2616/rfc2616-sec4.html#sec4.2
        if 'noindex' in seo_items.get('x_robots_tag', ''):
            status.append('blocked by x-robots-tag')

        # Check against robots.txt
        if 'blocked' in seo_items.get('robots_txt', ''):
            status.append('blocked by robots.txt')

        # Check against the meta robots tag
        if 'noindex' in seo_items.get('meta_robots', ''):
            status.append('noindex')

        # Canonical tag
        if self.is_canonicalised(url, seo_items.get('canonical_tag', '')):
            status.append('canonicalised')

        # Canonical header
        if self.is_canonicalised(url,
                                 seo_items.get('canonical_http_header', '')):
            status.append('header canonicalised')

        # Avoid 'ok, blocked by robots.txt' and show 'blocked by robots.txt'
        # instead
        if len(status) != 1 and status[0] == 'ok':
            status.pop(0)

        return ', '.join(status)

    def get_meta_name_fields(self):
        fields = []
        try:
            fields = self.tree.xpath('//meta/@name')
        except Exception:
            pass
        return fields

    def dict_to_row(self, data):
        out = tuple(data.get(item, "") for item in self.all_items)
        return out

    def has_redirected(self):
        return len(self.response.history) > 0

    # @timing
    def get_redirects(self):
        data = []
        hist = self.response.history
        if len(hist) > 0:
            for i in range(len(hist)):
                hob_url = self.url_components_to_str(
                    self.parse_url(hist[i].url))
                if 'external_links' not in self.settings.get(
                        'CRAWL_ITEMS', ''):
                    if self.is_external(hob_url):
                        break
                robots_status = self.get_robots_txt_status(hob_url)
                if 'respect_robots_txt' in self.settings.get(
                        'CRAWL_ITEMS', ''
                ) and 'follow_blocked_redirects' not in self.settings.get(
                        'CRAWL_ITEMS', '') and robots_status == 'blocked':
                    continue
                if i + 1 < len(hist):
                    redirect_to_url = self.url_components_to_str(
                        self.parse_url(str(hist[i + 1].url).strip()))
                else:
                    redirect_to_url = self.get_final_url()
                hob_data = {
                    'url': hob_url,
                    'content_type': hist[i].headers.get('Content-Type', ''),
                    'status_code': hist[i].status_code,
                    'x_robots_tag': hist[i].headers.get('X-Robots-Tag', ''),
                    'redirect_url': redirect_to_url,
                    'robots_txt': robots_status
                }
                hob_data['crawl_status'] = self.get_full_status(
                    hob_url, hob_data)
                hob_row = self.dict_to_row(hob_data)
                data.append(hob_row)
        return data

    def allowed_by_robots_txt(self, url):
        return self.gfrobots.is_allowed(url)

    def get_robots_txt_status(self, url):
        if self.allowed_by_robots_txt(url):
            return "allowed"
        return "blocked"

    def attrib_to_list(self, xpath, attrib):
        try:
            return [
                self.url_components_to_str(self.parse_url(l.attrib[attrib]))
                for l in self.tree.xpath(xpath)
            ]
        except Exception:
            return []

    def extract_xpath(self, path):
        try:
            return self.tree.xpath(path)
        except Exception:
            return []

    def clean_list(self, inp):
        try:
            return [' '.join(i.split()) for i in inp if i.strip()]
        except Exception:
            print(f'ERROR: cleaning list {inp} failed!')
            return inp

    def get_hreflang_links(self):
        return self.extract_xpath(self.xpath_mapping['hreflang'])

    def get_canonical_links(self):
        return self.extract_xpath(self.xpath_mapping['canonical_tag'])

    def get_pagination_links(self):
        return self.extract_xpath(self.xpath_mapping['pagination'])
def test_grouped_ua_ends_with_additional_uas_two(self):
    robots_txt = "User-agent: *\nAllow: /\nUser-agent: Google\n\n\n\nUser-agent: Bingbot\nUser-agent: Greenflare # My own crawler\nDisallow: /test/is/disallowed\nUser-agent: Yandex\nDisallow: /*test"
    ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    ua_greenflare = "Greenflare SEO Spider/1.0"
    ua_bingbot = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"
    ua_yandex = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"
    ua_firefox = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0"
    url = "https://www.example.com/test/is/disallowed.html"

    robot = GFlareRobots(robots_txt, user_agent=ua)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

    robot = GFlareRobots(robots_txt, user_agent=ua_greenflare)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

    robot = GFlareRobots(robots_txt, user_agent=ua_bingbot)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

    robot = GFlareRobots(robots_txt, user_agent=ua_yandex)
    self.assertEqual(robot.is_allowed(url), False, "Should be disallowed")

    robot = GFlareRobots(robots_txt, user_agent=ua_firefox)
    self.assertEqual(robot.is_allowed(url), True, "Should be allowed")
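# Harness sketch (not from the original source): the assertEqual calls above
# imply these test methods live on a unittest.TestCase subclass. The
# GFlareRobots import path is assumed and may differ in the actual package
# layout; test_simple_disallow is a hypothetical extra case mirroring the
# path-prefix semantics exercised in test_ua_as_submatch.
import unittest

from greenflare.core.gflarerobots import GFlareRobots


class TestGFlareRobots(unittest.TestCase):

    # ... the test_* methods above belong here ...

    def test_simple_disallow(self):
        # A plain path-prefix Disallow in the * group blocks everything
        # beneath it for user agents without their own group.
        robots_txt = "User-agent: *\nDisallow: /private/"
        robot = GFlareRobots(robots_txt, user_agent="Greenflare SEO Spider/1.0")
        self.assertEqual(robot.is_allowed("https://www.example.com/"), True)
        self.assertEqual(
            robot.is_allowed("https://www.example.com/private/page.html"),
            False)


if __name__ == '__main__':
    unittest.main()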