# Stdlib/third-party imports needed by this module; the project-internal
# classes used below (Configuration, Connection, ConfigurationMissingError,
# Check, RobotstxtAggregationModule) come from the surrounding package, whose
# paths are not part of this excerpt.
import urllib.robotparser

import requests


class Robotstxt:
    REQUEST_HEADERS = {
        'User-agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/78.0.3904.87 Safari/537.36'
    }

    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)
        self.robotsparser = urllib.robotparser.RobotFileParser()

    def run(self):
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation robotstxt:', "\n")

        if not self.mongodb.has_collection(
                RobotstxtAggregationModule.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                # Only documents that have not been processed yet.
                robotstxts = self.mongodb.find(
                    RobotstxtAggregationModule.COLLECTION_NAME, {
                        'urlset': urlset_name,
                        'processed_robotstxt': {
                            '$exists': False
                        }
                    })

                urlset_config = urlset['checks']

                for url in self.configuration.urlsets.urlset_urls(urlset_name):
                    # Normalize every URL of the set to its /robots.txt location.
                    if not str(url).endswith('/robots.txt'):
                        url = url.protocol + '://' + url.domain + \
                            url.path.rstrip('/') + '/robots.txt'

                    for robotstxt in robotstxts:
                        if str(robotstxt['url']) == str(url):
                            print(' + ' + str(robotstxt['url']))
                            self.check_status_code(robotstxt, urlset_config)
                            self.check_has_sitemap_xml(robotstxt, urlset_config)
                            self.mongodb.update_one(
                                RobotstxtAggregationModule.COLLECTION_NAME,
                                robotstxt['_id'],
                                {'processed_robotstxt': True})

        print("\n")

    def request_url_statuscode(self, url):
        # Fetch a URL and return its HTTP status code, or None on a
        # transport-level error.
        try:
            response = requests.get(url, headers=self.REQUEST_HEADERS)
            status_code = response.status_code
        except requests.RequestException:
            status_code = None

        return status_code

    def check_status_code(self, robotstxt: dict, urlset_config: dict):
        if 'status_code' not in urlset_config:
            return

        assert_val = urlset_config['status_code']
        print(' -> check_status_code "' + str(assert_val) + '"', end='')

        valid = 'status_code' in robotstxt and \
            robotstxt['status_code'] == assert_val

        url = robotstxt['url']
        self.check_service.add_check(
            self.module_configuration.database,
            robotstxt['urlset'],
            'robotstxt-status_code',
            robotstxt['body'],
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )
        print(' ... ' + str(valid))

    def check_has_sitemap_xml(self, robotstxt: dict, urlset_config: dict):
        if 'has_sitemap_xml' not in urlset_config:
            return

        assert_val_has_sitemap = urlset_config['has_sitemap_xml']
        has_sitemap = False
        sitemaps = None

        if 'body' in robotstxt:
            # Let the stdlib parser extract the Sitemap: lines from the body.
            self.robotsparser.parse(robotstxt['body'].splitlines())
            sitemaps = self.robotsparser.site_maps()
            if sitemaps:
                has_sitemap = True

        valid = has_sitemap == assert_val_has_sitemap

        url = robotstxt['url']
        self.check_service.add_check(
            self.module_configuration.database,
            robotstxt['urlset'],
            'robotstxt-has_sitemap_xml',
            str(url),
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

        if sitemaps:
            # Every listed sitemap must itself be reachable (HTTP 200).
            for sitemap in sitemaps:
                error = ''
                sitemap_200 = self.request_url_statuscode(sitemap) == 200
                if not sitemap_200:
                    error = 'No access to sitemap'

                self.check_service.add_check(
                    self.module_configuration.database,
                    robotstxt['urlset'],
                    'robotstxt-sitemap_access',
                    sitemap,
                    sitemap_200,
                    '',
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )
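
# A minimal sketch of the urlset configuration Robotstxt.run() reads. Only the
# key names looked up above ('checks', 'status_code', 'has_sitemap_xml') are
# taken from the code; the urlset name entry and the values are illustrative
# assumptions, not the project's actual schema.
EXAMPLE_ROBOTSTXT_URLSET = {
    'example-set': 'example-set',  # hypothetical urlset name entry
    'checks': {
        'status_code': 200,        # /robots.txt must answer with HTTP 200
        'has_sitemap_xml': True,   # robots.txt must reference a sitemap
    },
}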
# Stdlib import needed by this module; the project-internal classes used below
# (Configuration, Connection, ConfigurationMissingError, Check, URL,
# PagespeedAggregationModule) come from the surrounding package.
import json


class Pagespeed:
    # One row per check: configuration key, Lighthouse section, metric key,
    # result field, comparison mode, and the suffix used in the stored check
    # name. 'min' means the measured value must be at least the configured
    # threshold (scores); 'max' means at most (numeric display values).
    CHECKS = [
        ('fcp_score', 'audits', 'first-contentful-paint', 'score',
         'min', 'fcp_score'),
        ('fcp_display', 'audits', 'first-contentful-paint', 'numericValue',
         'max', 'fcp_display'),
        ('tti_score', 'audits', 'interactive', 'score',
         'min', 'time_to_interactive_score'),
        ('tti_display', 'audits', 'interactive', 'numericValue',
         'max', 'time_to_interactive_display'),
        ('ttfb_score', 'audits', 'time-to-first-byte', 'score',
         'min', 'ttfb_score'),
        ('ttfb_display', 'audits', 'time-to-first-byte', 'numericValue',
         'max', 'ttfb_display'),
        ('performance_score', 'categories', 'performance', 'score',
         'min', 'performance_score'),
        ('uses_optimized_images', 'audits', 'uses-optimized-images', 'score',
         'min', 'uses_optimized_images'),
        ('render_blocking_resources', 'audits', 'render-blocking-resources',
         'score', 'min', 'render_blocking_resources'),
        ('uses_text_compression', 'audits', 'uses-text-compression', 'score',
         'min', 'uses_text_compression'),
        ('uses_long_cache_ttl', 'audits', 'uses-long-cache-ttl', 'score',
         'min', 'uses_long_cache_ttl'),
        ('unminified_css', 'audits', 'unminified-css', 'score',
         'min', 'unminified_css'),
        ('unminified_js', 'audits', 'unminified-javascript', 'score',
         'min', 'unminified_javascript'),
    ]

    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        if len(self.module_configuration.checks) == 0:
            return

        print('Running operation pagespeed:', "\n")

        if not self.mongodb.has_collection(
                PagespeedAggregationModule.COLLECTION_NAME):
            return

        pagespeed_tests = self.mongodb.find(
            PagespeedAggregationModule.COLLECTION_NAME,
            {'processed_pagespeed': {
                '$exists': False
            }})

        for pagespeed_test in pagespeed_tests:
            print(' + ' + str(pagespeed_test['url']))

            results = {
                'desktop': json.loads(pagespeed_test['desktop']['body']),
                'mobile': json.loads(pagespeed_test['mobile']['body']),
            }
            urlset_name = pagespeed_test['urlset']
            url = pagespeed_test['url']

            # Run every configured check against both strategies.
            for check, section, metric, field, mode, name in self.CHECKS:
                for strategy in ('desktop', 'mobile'):
                    self.check_metric(urlset_name, url, check,
                                      results[strategy], strategy, section,
                                      metric, field, mode, name)

            self.mongodb.update_one(PagespeedAggregationModule.COLLECTION_NAME,
                                    pagespeed_test['_id'],
                                    {'processed_pagespeed': True})

        print("\n")

    def check_metric(self, urlset_name: str, url: URL, check: str, j: dict,
                     strategy: str, section: str, metric: str, field: str,
                     mode: str, name: str):
        if check not in self.module_configuration.checks:
            return

        assert_val = self.module_configuration.checks[check][strategy]
        print(' -> check_' + check + ' "' + str(assert_val) + '"', end='')
        valid = False
        result = ''

        # Only compare when the API actually returned a Lighthouse result;
        # otherwise the check fails with an empty result value.
        if 'lighthouseResult' in j:
            result = float(j['lighthouseResult'][section][metric][field])
            if mode == 'min':
                valid = result >= assert_val
            else:
                valid = result <= assert_val

        self.check_service.add_check(
            self.module_configuration.database,
            urlset_name,
            'pagespeed-' + name + '_' + strategy,
            str(result),
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )
        print(' ... ' + str(valid))
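
# A minimal sketch of the checks configuration Pagespeed.run() reads: one
# threshold per strategy, looked up as module_configuration.checks[check][strategy].
# The check keys come from the CHECKS table above; the threshold values are
# illustrative assumptions, not recommendations.
EXAMPLE_PAGESPEED_CHECKS = {
    'fcp_score': {'desktop': 0.9, 'mobile': 0.75},      # score: at least
    'fcp_display': {'desktop': 1500, 'mobile': 2500},   # numericValue: at most
    'performance_score': {'desktop': 0.9, 'mobile': 0.8},
    'unminified_js': {'desktop': 1.0, 'mobile': 1.0},
}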
# Third-party import needed by this module; the project-internal classes used
# below (Configuration, Connection, ConfigurationMissingError, Check,
# HtmlParser) come from the surrounding package.
from bs4 import BeautifulSoup


class Htmlheadings:
    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation htmlheadings:', "\n")

        if not self.mongodb.has_collection(HtmlParser.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            print(' - "' + str(urlset['url']) + '":')

            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                parsed_data = self.mongodb.find(
                    HtmlParser.COLLECTION_NAME, {
                        'urlset': urlset_name,
                        'processed_htmlheadings': {
                            '$exists': False
                        }
                    })

                urlset_config = urlset['checks']

                for data in parsed_data:
                    print(' + ' + str(data['url']))
                    self.check_count_headline_h1(data, urlset_config)
                    self.mongodb.update_one(HtmlParser.COLLECTION_NAME,
                                            data['_id'],
                                            {'processed_htmlheadings': True})

        print("\n")

    def check_count_headline_h1(self, data: dict, urlset_config: dict):
        if 'count_headline_h1' not in urlset_config:
            return

        assert_val = urlset_config['count_headline_h1']
        print(' -> check_count_headline_h1 "' + str(assert_val) + '"', end='')

        doc = BeautifulSoup(data['body'], "html.parser")
        count_headline = len(doc.select("h1"))

        valid = count_headline == assert_val
        error = ''
        if count_headline > 1 and not valid:
            error = 'more than one headline detected'

        url = data['url']
        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'htmlheadings-count_headline_h1',
            str(count_headline),
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )
        print(' ... ' + str(valid))
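
# A minimal sketch of a urlset entry for the htmlheadings operation, assuming
# the same shape the run() loop reads ('url' and 'checks' keys); the values
# are illustrative.
EXAMPLE_HTMLHEADINGS_URLSET = {
    'url': 'example-set',
    'checks': {
        'count_headline_h1': 1,  # exactly one <h1> per page
    },
}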
# Third-party imports needed by this module; the project-internal classes used
# below (Configuration, Connection, ConfigurationMissingError, Check,
# HtmlParser, URL) come from the surrounding package.
import requests
from bs4 import BeautifulSoup


class Metatags:
    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation metatags:', "\n")

        if not self.mongodb.has_collection(HtmlParser.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            print(' - "' + str(urlset['url']) + '":')

            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                parsed_data = self.mongodb.find(
                    HtmlParser.COLLECTION_NAME, {
                        'urlset': urlset_name,
                        'processed_metatags': {
                            '$exists': False
                        }
                    })

                urlset_config = urlset['checks']

                # Duplicate checks need the whole url-set at once.
                self.check_has_title_duplicates(parsed_data, urlset_name,
                                                urlset_config)
                self.check_has_description_duplicates(parsed_data, urlset_name,
                                                      urlset_config)

                for data in parsed_data:
                    print(' + ' + str(data['url']))
                    self.check_has_title(data, urlset_name, urlset_config)
                    self.check_is_title_empty(data, urlset_name, urlset_config)
                    self.check_has_title_changed(data, urlset_name,
                                                 urlset_config)
                    self.check_has_description(data, urlset_name,
                                               urlset_config)
                    self.check_is_description_empty(data, urlset_name,
                                                    urlset_config)
                    self.check_has_description_changed(data, urlset_name,
                                                       urlset_config)
                    self.check_has_canonical(data, urlset_name, urlset_config)
                    self.check_canonical_is_self_referencing(
                        data, urlset_name, urlset_config)
                    self.check_canonical_href_200(data, urlset_name,
                                                  urlset_config)
                    self.mongodb.update_one(HtmlParser.COLLECTION_NAME,
                                            data['_id'],
                                            {'processed_metatags': True})

        print("\n")

    # METATAG TITLE

    def get_metatitle(self, data: dict, urlset_name: str, urlset_config: dict):
        # Returns the list of <title> tags, or a problem dict when the page
        # has no title at all or more than one.
        if 'title' in urlset_config:
            doc = BeautifulSoup(data['body'], "html.parser")
            titles = doc.find_all("title")
            problem_detected = {'multi': False, 'empty': False}

            if titles:
                if len(titles) > 1:
                    problem_detected['multi'] = True
                else:
                    return titles
            else:
                problem_detected['empty'] = True

            return problem_detected

    def save_problem_multi_title(self, multi: bool, data):
        url = data['url']
        error = ''
        valid = False

        if multi:
            error = 'several titletags on page detected'
        else:
            valid = True

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'metatags-has_multiple_titles',
            '',
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

    def check_has_title(self, data: dict, urlset_name: str,
                        urlset_config: dict):
        if 'title' in urlset_config and 'has_title' in urlset_config['title']:
            assert_val = urlset_config['title']['has_title']
            print(' -> check_has_title "' + str(assert_val) + '"', end='')
            valid = False
            multi = False
            exists = False
            value = ''

            titles = self.get_metatitle(data, urlset_name, urlset_config)

            if 'multi' in titles:
                if titles['multi']:
                    multi = True
            else:
                for title in titles:
                    if title != '':
                        value = str(title)
                        exists = True

            if exists == assert_val:
                valid = True

            url = data['url']
            error = ''
            if not exists and not valid:
                error = 'title missing'

            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-has_title',
                value,
                valid,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... has title ' + str(valid))
            self.save_problem_multi_title(multi, data)

    def check_is_title_empty(self, data: dict, urlset_name: str,
                             urlset_config: dict):
        if 'title' in urlset_config and \
                'is_title_empty' in urlset_config['title']:
            assert_val = urlset_config['title']['is_title_empty']
            print(' -> check_is_title_empty "' + str(assert_val) + '"', end='')
            valid = False
            value = ''
            empty = False

            titles = self.get_metatitle(data, urlset_name, urlset_config)

            for title in titles:
                value = str(title)
                if title == '':
                    empty = True

            if empty == assert_val:
                valid = True

            url = data['url']
            error = ''
            if empty and not valid:
                error = 'titletag is empty'

            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-is_title_empty',
                value,
                valid,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... is title empty ' + str(valid))

    def check_has_title_changed(self, data: dict, urlset_name: str,
                                urlset_config: dict):
        if 'title' in urlset_config and \
                'has_title_changed' in urlset_config['title']:
            assert_val = urlset_config['title']['has_title_changed']
            valid = False

            titles_new = self.get_metatitle(data, urlset_name, urlset_config)
            value_new = ''
            if len(titles_new) == 1:
                for title in titles_new:
                    if title != '':
                        value_new = str(title)

            # Compare against the most recent already-processed snapshot of
            # the same URL.
            last_parsed_data = self.mongodb.find_last_sorted(
                HtmlParser.COLLECTION_NAME, {
                    'url.protocol': data['url'].protocol,
                    'url.domain': data['url'].domain,
                    'url.path': data['url'].path,
                    'url.query': data['url'].query,
                    'processed_metatags': {
                        '$exists': True
                    }
                }, [('date', -1)])

            value_last = ''
            for last_data in last_parsed_data:
                titles_last = self.get_metatitle(last_data, urlset_name,
                                                 urlset_config)
                if len(titles_last) == 1:
                    for title in titles_last:
                        if title != '':
                            value_last = str(title)

            check_result = True
            if value_new == value_last:
                check_result = False  # title has not changed

            if check_result == assert_val:
                valid = True

            diff = ''
            error = ''
            if not valid and check_result:
                diff = str(value_last)
                error = 'title has changed'

            url = data['url']
            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-has_title_changed',
                value_new,
                valid,
                diff,
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... has title changed ' + str(valid))

    def _check_has_duplicates(self, parsed_data, assert_val, values_by_url,
                              check_name, error_text):
        # Shared duplicate detection for titles and descriptions: invert the
        # url -> value mapping and flag every value used by more than one URL.
        # geeksforgeeks.org/python-find-keys-with-duplicate-values-in-dictionary/
        value_sorted = {}
        for key, value in values_by_url.items():
            if value not in value_sorted:
                value_sorted[value] = [key]
            else:
                value_sorted[value].append(key)

        urlset = ''
        for data in parsed_data:
            urlset = data['urlset']

        duplicates = {}
        for key_value, value_urls in value_sorted.items():
            if len(value_urls) > 1:
                duplicates[key_value] = value_urls
            elif len(value_urls) == 1:
                # Unique value: record one check per URL.
                for url_str in value_urls:
                    url = URL(url_str)
                    dup = False
                    valid = dup == assert_val
                    self.check_service.add_check(
                        self.module_configuration.database,
                        urlset,
                        check_name,
                        str(key_value),
                        valid,
                        '',
                        '',
                        url.protocol,
                        url.domain,
                        url.path,
                        url.query,
                    )

        for dup_value in duplicates:
            for problem_url in duplicates[dup_value]:
                url = URL(problem_url)
                dup = True
                valid = dup == assert_val

                # All other URLs sharing the same value go into the diff.
                diff = ''
                for other_url in duplicates[dup_value]:
                    if other_url is not problem_url:
                        if diff == '':
                            diff += other_url
                        else:
                            diff += ', ' + other_url

                error = ''
                if dup and not valid:
                    error = error_text

                self.check_service.add_check(
                    self.module_configuration.database,
                    urlset,
                    check_name,
                    str(dup_value),
                    valid,
                    diff,
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

    def check_has_title_duplicates(self, parsed_data: dict, urlset_name: str,
                                   urlset_config: dict):
        if 'title' in urlset_config and \
                'has_title_duplicates' in urlset_config['title']:
            assert_val = urlset_config['title']['has_title_duplicates']

            # Collect the single <title> of every page, keyed by URL.
            titles_dict = {}
            for data in parsed_data:
                doc = BeautifulSoup(data['body'], "html.parser")
                titles = doc.find_all("title")
                if len(titles) == 1:
                    for title in titles:
                        if title != '':
                            titles_dict[str(data['url'])] = title

            self._check_has_duplicates(
                parsed_data, assert_val, titles_dict,
                'metatags-has_title_duplicates',
                'title duplicates in url-set detected')

    # METATAG DESCRIPTION

    def get_metadescription(self, data: dict, urlset_name: str,
                            urlset_config: dict):
        # Returns the list of description meta tags, or a problem dict when
        # the page has none at all or more than one.
        if 'description' in urlset_config:
            doc = BeautifulSoup(data['body'], "html.parser")
            metas = doc.find_all("meta", attrs={'name': 'description'})
            problem_detected = {'multi': False, 'empty': False}

            if metas:
                if len(metas) > 1:
                    problem_detected['multi'] = True
                else:
                    return metas
            else:
                problem_detected['empty'] = True

            return problem_detected

    def save_problem_multi_description(self, multi: bool, data):
        url = data['url']
        error = ''
        valid = False

        if multi:
            error = 'several descriptiontags on page detected'
        else:
            valid = True

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'metatags-has_multiple_descriptions',
            '',
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

    def check_has_description(self, data: dict, urlset_name: str,
                              urlset_config: dict):
        if 'description' in urlset_config and \
                'has_description' in urlset_config['description']:
            assert_val = urlset_config['description']['has_description']
            print(' -> check_has_description "' + str(assert_val) + '"',
                  end='')
            valid = False
            multi = False
            exists = False
            value = ''

            metas = self.get_metadescription(data, urlset_name, urlset_config)

            if 'multi' in metas:
                if metas['multi']:
                    multi = True
            else:
                for meta in metas:
                    metadescription = meta.get('content')
                    if metadescription != '':
                        value = metadescription
                        exists = True

            if exists == assert_val:
                valid = True

            url = data['url']
            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-has_description',
                value,
                valid,
                '',
                '',
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... has description ' + str(valid))
            self.save_problem_multi_description(multi, data)

    def check_is_description_empty(self, data: dict, urlset_name: str,
                                   urlset_config: dict):
        if 'description' in urlset_config and \
                'is_description_empty' in urlset_config['description']:
            assert_val = urlset_config['description']['is_description_empty']
            print(' -> check_is_description_empty "' + str(assert_val) + '"',
                  end='')
            valid = False
            empty = False
            value = ''

            metas = self.get_metadescription(data, urlset_name, urlset_config)

            if 'multi' in metas:
                if metas['multi']:
                    return
            else:
                for meta in metas:
                    metadescription = meta.get('content')
                    value = metadescription
                    if metadescription == '':
                        empty = True

            if empty == assert_val:
                valid = True

            error = ''
            if empty and not valid:
                error = 'description is empty'

            url = data['url']
            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-is_description_empty',
                value,
                valid,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... ' + str(valid))

    def check_has_description_changed(self, data: dict, urlset_name: str,
                                      urlset_config: dict):
        if 'description' in urlset_config and \
                'has_description_changed' in urlset_config['description']:
            assert_val = urlset_config['description'][
                'has_description_changed']
            valid = False

            descriptions_new = self.get_metadescription(
                data, urlset_name, urlset_config)
            value_new = ''
            if len(descriptions_new) == 1:
                for description in descriptions_new:
                    if description != '':
                        value_new = str(description)

            # Compare against the most recent already-processed snapshot of
            # the same URL.
            last_parsed_data = self.mongodb.find_last_sorted(
                HtmlParser.COLLECTION_NAME, {
                    'url.protocol': data['url'].protocol,
                    'url.domain': data['url'].domain,
                    'url.path': data['url'].path,
                    'url.query': data['url'].query,
                    'processed_metatags': {
                        '$exists': True
                    }
                }, [('date', -1)])

            value_last = ''
            for last_data in last_parsed_data:
                descriptions_last = self.get_metadescription(
                    last_data, urlset_name, urlset_config)
                if len(descriptions_last) == 1:
                    for description in descriptions_last:
                        if description != '':
                            value_last = str(description)

            check_result = True
            if value_new == value_last:
                check_result = False  # description has not changed

            if check_result == assert_val:
                valid = True

            diff = ''
            if not valid:
                diff = str(value_last)

            error = ''
            if not valid and check_result:
                error = 'description has changed'

            url = data['url']
            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-has_description_changed',
                value_new,
                valid,
                diff,
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )

    def check_has_description_duplicates(self, parsed_data: dict,
                                         urlset_name: str,
                                         urlset_config: dict):
        if 'description' in urlset_config and \
                'has_description_duplicates' in urlset_config['description']:
            assert_val = urlset_config['description'][
                'has_description_duplicates']

            # Collect the single meta description of every page, keyed by URL.
            descriptions_dict = {}
            for data in parsed_data:
                doc = BeautifulSoup(data['body'], "html.parser")
                descriptions = doc.find_all("meta",
                                            attrs={'name': 'description'})
                if len(descriptions) == 1:
                    for description in descriptions:
                        if description.get('content') != '':
                            descriptions_dict[str(
                                data['url'])] = description.get('content')

            self._check_has_duplicates(
                parsed_data, assert_val, descriptions_dict,
                'metatags-has_description_duplicates',
                'description duplicates in url-set detected')

    # METATAG CANONICAL

    def get_canonical_href(self, data: dict, urlset_name: str,
                           urlset_config: dict):
        if 'canonical' in urlset_config:
            doc = BeautifulSoup(data['body'], "html.parser")
            links = doc.find_all("link", rel='canonical')
            href = ''
            for link in links:
                href = link['href']
            return href

    def check_has_canonical(self, data: dict, urlset_name: str,
                            urlset_config: dict):
        if 'canonical' in urlset_config and \
                'has_canonical' in urlset_config['canonical']:
            assert_val = urlset_config['canonical']['has_canonical']
            print(' -> check_has_canonical "' + str(assert_val) + '"', end='')
            valid = False
            exists = False

            canonical_href = self.get_canonical_href(data, urlset_name,
                                                     urlset_config)
            value = str(canonical_href)
            if canonical_href != '':
                exists = True

            if exists == assert_val:
                valid = True

            url = data['url']
            error = ''
            if not exists and not valid:
                error = 'no canonical'

            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-has_canonical',
                value,
                valid,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... ' + str(valid))

    def check_canonical_is_self_referencing(self, data: dict, urlset_name: str,
                                            urlset_config: dict):
        if 'canonical' in urlset_config and \
                'canonical_is_self_referencing' in urlset_config['canonical']:
            assert_val = urlset_config['canonical'][
                'canonical_is_self_referencing']
            valid = False
            self_referencing = False
            url = data['url']

            canonical_href = self.get_canonical_href(data, urlset_name,
                                                     urlset_config)
            value = str(canonical_href)
            if canonical_href != '' and canonical_href == str(url):
                self_referencing = True

            if self_referencing == assert_val:
                valid = True

            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-canonical_is_self_referencing',
                value,
                valid,
                '',
                '',
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... self_referencing ' + str(valid))

    def check_canonical_href_200(self, data: dict, urlset_name: str,
                                 urlset_config: dict):
        if 'canonical' in urlset_config and \
                'canonical_href_200' in urlset_config['canonical']:
            assert_val = urlset_config['canonical']['canonical_href_200']
            valid = False
            url = data['url']
            response_200 = False
            error = ''

            canonical_href = self.get_canonical_href(data, urlset_name,
                                                     urlset_config)
            value = str(canonical_href)
            if canonical_href != '':
                try:
                    response = requests.get(canonical_href)
                    if response.status_code == 200:
                        response_200 = True
                except requests.RequestException:
                    pass
                if not response_200:
                    error = 'href in canonical not valid'

            if response_200 == assert_val:
                valid = True

            self.check_service.add_check(
                self.module_configuration.database,
                data['urlset'],
                'metatags-canonical_href_200',
                value,
                valid,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
            print(' ... ' + str(valid))
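
# A minimal sketch of the per-urlset checks the Metatags methods look up,
# grouped by tag as in the code above; the values are illustrative assumptions.
EXAMPLE_METATAGS_CHECKS = {
    'title': {
        'has_title': True,              # every page needs a <title>
        'is_title_empty': False,        # ... and it must not be empty
        'has_title_changed': False,     # ... and should match the last crawl
        'has_title_duplicates': False,  # ... and be unique within the url-set
    },
    'description': {
        'has_description': True,
        'is_description_empty': False,
        'has_description_changed': False,
        'has_description_duplicates': False,
    },
    'canonical': {
        'has_canonical': True,
        'canonical_is_self_referencing': True,
        'canonical_href_200': True,  # canonical target must answer HTTP 200
    },
}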
# The project-internal classes used below (Configuration, Connection,
# ConfigurationMissingError, Check, HtmlParser) come from the surrounding
# package.


class Responseheader:
    # Configuration key -> HTTP header name; each check asserts exact equality
    # of the (lower-cased) header value with the configured 'assert' value.
    HEADER_CHECKS = [
        ('content_encoding', 'content-encoding'),
        ('cache_control', 'cache-control'),
        ('expires', 'expires'),
        ('x_canonical', 'x-canonical'),
        ('no_index', 'no-index'),
    ]

    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation responseheader:', "\n")

        if not self.mongodb.has_collection(HtmlParser.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            print(' - "' + str(urlset['url']) + '":')

            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                parsed_data = self.mongodb.find(
                    HtmlParser.COLLECTION_NAME, {
                        'urlset': urlset_name,
                        'processed_responseheader': {
                            '$exists': False
                        }
                    })

                urlset_config = urlset['checks']

                for data in parsed_data:
                    print(' + ' + str(data['url']))
                    self.check_status_code(data, urlset_config)
                    for check, header in self.HEADER_CHECKS:
                        self.check_header(data, urlset_config, check, header)
                    self.mongodb.update_one(
                        HtmlParser.COLLECTION_NAME, data['_id'],
                        {'processed_responseheader': True})

        print("\n")

    def check_status_code(self, data: dict, urlset_config: dict):
        if 'status_code' not in urlset_config:
            return

        assert_val = urlset_config['status_code']['assert']
        print(' -> check_status_code "' + str(assert_val) + '"', end='')
        valid = data['status_code'] == assert_val

        url = data['url']
        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'responseheader-status_code',
            '',
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )
        print(' ... ' + str(valid))

    def check_header(self, data: dict, urlset_config: dict, check: str,
                     header: str):
        if check not in urlset_config:
            return

        assert_val = urlset_config[check]['assert']
        # Transform all headers (keys and values) to lowercase.
        headers = dict(
            (k.lower(), v.lower()) for k, v in data['headers'].items())
        print(' -> check_' + check + ' "' + str(assert_val) + '"', end='')
        valid = header in headers and headers[header] == assert_val

        url = data['url']
        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'responseheader-' + check,
            '',
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )
        print(' ... ' + str(valid))
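
# A minimal sketch of the responseheader checks configuration: every entry
# carries its expected value under 'assert', matching the lookups above. The
# header values themselves are illustrative assumptions.
EXAMPLE_RESPONSEHEADER_CHECKS = {
    'status_code': {'assert': 200},
    'content_encoding': {'assert': 'gzip'},
    'cache_control': {'assert': 'max-age=3600'},
    'expires': {'assert': '0'},
}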