예제 #1
0
class Robotstxt:
    """Run the configured checks against aggregated robots.txt documents.

    The operation reads robots.txt documents from MongoDB that are not yet
    flagged ``processed_robotstxt``, records every check result through the
    ``Check`` service and flags each handled document as processed.
    """

    # Browser-like user agent sent with every outgoing HTTP request.
    USER_AGENT = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/78.0.3904.87 Safari/537.36')

    # Seconds to wait for a remote server before a request counts as failed.
    # Without a timeout a single unresponsive host blocks the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, configuration: 'Configuration', configuration_key: str,
                 connection: 'Connection'):
        """Store the collaborators needed by the operation.

        Args:
            configuration: Global configuration; also used to resolve the
                URLs of a urlset.
            configuration_key: Key of this operation's custom configuration.
            connection: Provides the MongoDB handle and is passed on to the
                ``Check`` service.

        Raises:
            ConfigurationMissingError: If the connection offers neither
                BigQuery nor an ORM database.
        """
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)
        # Kept as an attribute for backward compatibility; it is replaced by
        # a fresh parser for every document in check_has_sitemap_xml().
        self.robotsparser = urllib.robotparser.RobotFileParser()

    def run(self):
        """Check every unprocessed robots.txt document of the urlsets."""
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation robotstxt:', "\n")

        # NOTE(review): the collection is addressed through
        # ``AggregationRobotstxt`` here but through
        # ``RobotstxtAggregationModule`` everywhere else in this class -
        # confirm that both names are imported and share COLLECTION_NAME.
        if not self.mongodb.has_collection(
                AggregationRobotstxt.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            # NOTE(review): this walks over every key of the urlset entry,
            # including 'checks'; presumably only the entry holding the
            # urlset name is meant - confirm against the configuration.
            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                # Materialise the result once: it is scanned again for every
                # URL below, which a one-shot cursor would not survive.
                robotstxts = list(
                    self.mongodb.find(
                        RobotstxtAggregationModule.COLLECTION_NAME, {
                            'urlset': urlset_name,
                            'processed_robotstxt': {
                                '$exists': False
                            }
                        }))

                urlset_config = urlset['checks']

                for url in self.configuration.urlsets.urlset_urls(
                        urlset_name):
                    if not str(url).endswith('/robots.txt'):
                        # Derive the robots.txt location from the page URL.
                        url = url.protocol + '://' + url.domain + str.rstrip(
                            url.path, '/') + '/robots.txt'
                    wanted = str(url)

                    for robotstxt in robotstxts:
                        if str(robotstxt['url']) != wanted:
                            continue

                        print(' + ' + str(robotstxt['url']))

                        self.check_status_code(robotstxt, urlset_config)
                        self.check_has_sitemap_xml(robotstxt, urlset_config)

                        self.mongodb.update_one(
                            RobotstxtAggregationModule.COLLECTION_NAME,
                            robotstxt['_id'], {'processed_robotstxt': True})

            # One separator per configured urlset instead of one per
            # compared document.
            print("\n")

    def request_url_statuscode(self, url, timeout=None):
        """Return the HTTP status code of a GET request to ``url``.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server; defaults to
                ``REQUEST_TIMEOUT``.

        Returns:
            The status code, or ``None`` if the request failed (connection
            error, timeout, invalid URL, ...).
        """
        if timeout is None:
            timeout = self.REQUEST_TIMEOUT

        try:
            response = requests.get(url,
                                    headers={'User-agent': self.USER_AGENT},
                                    timeout=timeout)
        except requests.RequestException:
            return None

        return response.status_code

    def check_status_code(self, robotstxt: dict, urlset_config: dict):
        """Record whether the stored status code equals the configured one.

        Does nothing when ``status_code`` is not configured for the urlset.

        Args:
            robotstxt: Aggregated document; reads ``url``, ``urlset``,
                ``body`` and, if present, ``status_code``.
            urlset_config: The ``checks`` configuration of the urlset.
        """
        if 'status_code' not in urlset_config:
            return

        assert_val = urlset_config['status_code']

        print('      -> check_status_code "' + str(assert_val) + '"', end='')

        valid = ('status_code' in robotstxt
                 and robotstxt['status_code'] == assert_val)

        url = robotstxt['url']

        # NOTE(review): the recorded value is the document body, not the
        # status code - confirm that this is intended.
        self.check_service.add_check(
            self.module_configuration.database,
            robotstxt['urlset'],
            'robotstxt-status_code',
            robotstxt['body'],
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

        # Finish the line opened by the header print above.
        print(' ... ' + str(valid))

    def check_has_sitemap_xml(self, robotstxt: dict, urlset_config: dict):
        """Record sitemap presence and the reachability of each sitemap.

        Does nothing when ``has_sitemap_xml`` is not configured or when the
        document has no ``body``. Otherwise one ``robotstxt-has_sitemap_xml``
        check is recorded, followed by one ``robotstxt-sitemap_access`` check
        per sitemap listed in the body.

        Args:
            robotstxt: Aggregated document; reads ``url``, ``urlset`` and
                ``body``.
            urlset_config: The ``checks`` configuration of the urlset.
        """
        if 'has_sitemap_xml' not in urlset_config:
            return
        if 'body' not in robotstxt:
            return

        assert_val_has_sitemap = urlset_config['has_sitemap_xml']

        # RobotFileParser.parse() appends to the parser's sitemap list, so a
        # shared parser would report the sitemaps of earlier documents for
        # this one as well. Start from a clean parser for each document.
        self.robotsparser = urllib.robotparser.RobotFileParser()
        self.robotsparser.parse(robotstxt['body'].splitlines())

        # site_maps() returns None when no Sitemap entry was found.
        sitemaps = self.robotsparser.site_maps() or []
        has_sitemap = bool(sitemaps)
        valid = has_sitemap == assert_val_has_sitemap

        url = robotstxt['url']

        self.check_service.add_check(
            self.module_configuration.database,
            robotstxt['urlset'],
            'robotstxt-has_sitemap_xml',
            str(url),
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

        for sitemap in sitemaps:
            sitemap_200 = self.request_url_statuscode(sitemap) == 200
            error = '' if sitemap_200 else 'No access to sitemap'

            self.check_service.add_check(
                self.module_configuration.database,
                robotstxt['urlset'],
                'robotstxt-sitemap_access',
                sitemap,
                sitemap_200,
                '',
                error,
                url.protocol,
                url.domain,
                url.path,
                url.query,
            )
예제 #2
0
class Pagespeed:
    """Compare aggregated PageSpeed reports with configured thresholds.

    The operation reads reports from MongoDB that are not yet flagged
    ``processed_pagespeed``, runs every ``check_*`` method for the desktop
    and the mobile report, records the results through the ``Check`` service
    and flags each handled report as processed.
    """

    # Order in which run() applies the checks. Each entry is the key inside
    # the ``checks`` configuration and the suffix of the matching
    # ``check_<key>`` method.
    _CHECK_KEYS = (
        'fcp_score',
        'fcp_display',
        'tti_score',
        'tti_display',
        'ttfb_score',
        'ttfb_display',
        'performance_score',
        'uses_optimized_images',
        'render_blocking_resources',
        'uses_text_compression',
        'uses_long_cache_ttl',
        'unminified_css',
        'unminified_js',
    )

    # Report variants stored per document; desktop is evaluated first.
    _STRATEGIES = ('desktop', 'mobile')

    def __init__(self, configuration: 'Configuration', configuration_key: str,
                 connection: 'Connection'):
        """Store the collaborators needed by the operation.

        Args:
            configuration: Global configuration object.
            configuration_key: Key of this operation's custom configuration.
            connection: Provides the MongoDB handle and is passed on to the
                ``Check`` service.

        Raises:
            ConfigurationMissingError: If the connection offers neither
                BigQuery nor an ORM database.
        """
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        """Evaluate every unprocessed report and mark it as processed."""
        if len(self.module_configuration.checks) == 0:
            return

        print('Running operation pagespeed:', "\n")

        if not self.mongodb.has_collection(
                PagespeedAggregationModule.COLLECTION_NAME):
            return

        pagespeed_tests = self.mongodb.find(
            PagespeedAggregationModule.COLLECTION_NAME,
            {'processed_pagespeed': {
                '$exists': False
            }})

        for pagespeed_test in pagespeed_tests:
            print(' + ' + str(pagespeed_test['url']))

            # Both reports are decoded before any check is recorded.
            reports = {
                strategy: json.loads(pagespeed_test[strategy]['body'])
                for strategy in self._STRATEGIES
            }
            urlset_name = pagespeed_test['urlset']
            url = pagespeed_test['url']

            for key in self._CHECK_KEYS:
                # Resolved through getattr so that overridden check methods
                # are still honoured.
                check_method = getattr(self, 'check_' + key)
                for strategy in self._STRATEGIES:
                    check_method(urlset_name, url, key, reports[strategy],
                                 strategy)

            self.mongodb.update_one(
                PagespeedAggregationModule.COLLECTION_NAME,
                pagespeed_test['_id'], {'processed_pagespeed': True})

        print("\n")

    def _check_metric(self, urlset_name, url, check, j, strategy, label,
                      path, higher_is_better):
        """Compare one report value with its threshold and record the result.

        Args:
            urlset_name: Name of the urlset the report belongs to.
            url: URL object of the tested page.
            check: Key inside the ``checks`` configuration; nothing happens
                if it is not configured.
            j: Decoded report.
            strategy: ``'desktop'`` or ``'mobile'``; selects the threshold
                and is appended to the recorded check name.
            label: Middle part of the recorded check name.
            path: Keys leading from ``lighthouseResult`` to the value.
            higher_is_better: ``True`` if the value must be at least the
                threshold, ``False`` if it must be at most the threshold.
        """
        if check not in self.module_configuration.checks:
            return

        assert_val = self.module_configuration.checks[check][strategy]

        print('      -> check_' + check + ' "' + str(assert_val) + '"',
              end='')

        valid = False
        result = ''
        error = ''

        if 'lighthouseResult' in j:
            node = j['lighthouseResult']
            try:
                for key in path:
                    node = node[key]
                result = float(node)
            except (KeyError, TypeError, ValueError):
                # The audit is absent from the report or carries no numeric
                # value (for example a null score). Record a failed check
                # instead of aborting the whole run.
                result = ''
                error = 'metric "' + '.'.join(path) + '" not available'
            else:
                if higher_is_better:
                    valid = bool(result >= assert_val)
                else:
                    valid = bool(result <= assert_val)

        self.check_service.add_check(
            self.module_configuration.database,
            urlset_name,
            'pagespeed-' + label + '_' + strategy,
            str(result),
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

        print(' ... ' + str(valid))

    def check_fcp_score(self, urlset_name: str, url: 'URL', check: str,
                        j: dict, strategy: str):
        """Require the first-contentful-paint score to reach the threshold."""
        self._check_metric(urlset_name, url, check, j, strategy, 'fcp_score',
                           ('audits', 'first-contentful-paint', 'score'),
                           True)

    def check_fcp_display(self, urlset_name: str, url: 'URL', check: str,
                          j: dict, strategy: str):
        """Require first-contentful-paint ``numericValue`` <= threshold."""
        self._check_metric(
            urlset_name, url, check, j, strategy, 'fcp_display',
            ('audits', 'first-contentful-paint', 'numericValue'), False)

    def check_tti_score(self, urlset_name: str, url: 'URL', check: str,
                        j: dict, strategy: str):
        """Require the ``interactive`` audit score to reach the threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'time_to_interactive_score',
                           ('audits', 'interactive', 'score'), True)

    def check_tti_display(self, urlset_name: str, url: 'URL', check: str,
                          j: dict, strategy: str):
        """Require the ``interactive`` ``numericValue`` <= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'time_to_interactive_display',
                           ('audits', 'interactive', 'numericValue'), False)

    def check_ttfb_score(self, urlset_name: str, url: 'URL', check: str,
                         j: dict, strategy: str):
        """Require the time-to-first-byte score to reach the threshold."""
        self._check_metric(urlset_name, url, check, j, strategy, 'ttfb_score',
                           ('audits', 'time-to-first-byte', 'score'), True)

    def check_ttfb_display(self, urlset_name: str, url: 'URL', check: str,
                           j: dict, strategy: str):
        """Require time-to-first-byte ``numericValue`` <= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'ttfb_display',
                           ('audits', 'time-to-first-byte', 'numericValue'),
                           False)

    def check_performance_score(self, urlset_name: str, url: 'URL',
                                check: str, j: dict, strategy: str):
        """Require the performance category score to reach the threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'performance_score',
                           ('categories', 'performance', 'score'), True)

    def check_render_blocking_resources(self, urlset_name: str, url: 'URL',
                                        check: str, j: dict, strategy: str):
        """Require the render-blocking-resources score >= threshold."""
        self._check_metric(
            urlset_name, url, check, j, strategy,
            'render_blocking_resources',
            ('audits', 'render-blocking-resources', 'score'), True)

    def check_uses_optimized_images(self, urlset_name: str, url: 'URL',
                                    check: str, j: dict, strategy: str):
        """Require the uses-optimized-images score >= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'uses_optimized_images',
                           ('audits', 'uses-optimized-images', 'score'), True)

    def check_uses_text_compression(self, urlset_name: str, url: 'URL',
                                    check: str, j: dict, strategy: str):
        """Require the uses-text-compression score >= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'uses_text_compression',
                           ('audits', 'uses-text-compression', 'score'), True)

    def check_uses_long_cache_ttl(self, urlset_name: str, url: 'URL',
                                  check: str, j: dict, strategy: str):
        """Require the uses-long-cache-ttl score >= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'uses_long_cache_ttl',
                           ('audits', 'uses-long-cache-ttl', 'score'), True)

    def check_unminified_css(self, urlset_name: str, url: 'URL', check: str,
                             j: dict, strategy: str):
        """Require the unminified-css score >= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'unminified_css',
                           ('audits', 'unminified-css', 'score'), True)

    def check_unminified_js(self, urlset_name: str, url: 'URL', check: str,
                            j: dict, strategy: str):
        """Require the unminified-javascript score >= threshold."""
        self._check_metric(urlset_name, url, check, j, strategy,
                           'unminified_javascript',
                           ('audits', 'unminified-javascript', 'score'), True)
예제 #3
0
class Htmlheadings:
    """Operation that checks the headings of parsed HTML documents.

    Documents are read from the ``HtmlParser`` collection in MongoDB,
    restricted to those not yet flagged ``processed_htmlheadings``; every
    handled document receives that flag afterwards.
    """

    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        """Keep the configuration, the MongoDB handle and a check service.

        Raises ``ConfigurationMissingError`` when the connection offers
        neither BigQuery nor an ORM database.
        """
        database_available = connection.has_bigquery() or connection.has_orm()
        if not database_available:
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        operations = configuration.operations
        self.module_configuration = operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        """Run the heading checks for all configured urlsets."""
        if len(self.module_configuration.urlsets) == 0:
            return

        print('Running operation htmlheadings:', "\n")

        collection = HtmlParser.COLLECTION_NAME
        if not self.mongodb.has_collection(collection):
            return

        unprocessed = {'$exists': False}

        for urlset in self.module_configuration.urlsets:
            print(' - "' + str(urlset['url']) + '":')

            # NOTE(review): every key of the urlset entry is visited here,
            # not only 'url' - confirm this is intended.
            for entry_key in urlset:
                urlset_name = urlset[entry_key]

                documents = self.mongodb.find(
                    collection, {
                        'urlset': urlset_name,
                        'processed_htmlheadings': unprocessed,
                    })

                checks = urlset['checks']

                for document in documents:
                    print('   + ' + str(document['url']))

                    self.check_count_headline_h1(document, checks)

                    self.mongodb.update_one(collection, document['_id'],
                                            {'processed_htmlheadings': True})

            print("\n")

    def check_count_headline_h1(self, data: dict, urlset_config: dict):
        """Record whether the number of ``h1`` elements equals the target.

        Nothing happens when ``count_headline_h1`` is not part of
        ``urlset_config``. The recorded value is the number of ``h1``
        elements found in ``data['body']``; an error text is added when the
        check fails and more than one ``h1`` was found.
        """
        if 'count_headline_h1' not in urlset_config:
            return

        assert_val = urlset_config['count_headline_h1']

        print('      -> check_count_headline_h1 "' + str(assert_val) + '"',
              end='')

        soup = BeautifulSoup(data['body'], "html.parser")
        h1_total = len(soup.select("h1"))

        valid = True if h1_total == assert_val else False

        if h1_total > 1 and not valid:
            error = 'more than one headline detected'
        else:
            error = ''

        page = data['url']

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'htmlheadings-count_headline_h1',
            str(h1_total),
            valid,
            '',
            error,
            page.protocol,
            page.domain,
            page.path,
            page.query,
        )

        print(' ... ' + str(valid))
예제 #4
0
class Metatags:
    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        """Prepare the metatags operation.

        :param configuration: global configuration; its ``operations`` entry
            supplies the settings stored under ``configuration_key``
        :param configuration_key: key of this operation's configuration
        :param connection: provides ``mongodb`` and is passed on to ``Check``
        :raises ConfigurationMissingError: when the connection reports
            neither BigQuery nor an ORM database
        """
        database_available = (connection.has_bigquery()
                              or connection.has_orm())
        if not database_available:
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        operations = configuration.operations
        self.module_configuration = (
            operations.get_custom_configuration_operation(configuration_key))
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        if len(self.module_configuration.urlsets) > 0:
            print('Running operation metatags:', "\n")

            if not self.mongodb.has_collection(HtmlParser.COLLECTION_NAME):
                return

            for urlset in self.module_configuration.urlsets:
                print(' - "' + str(urlset['url']) + '":')

                for single_urlset in urlset:

                    urlset_name = urlset[single_urlset]

                    parsed_data = self.mongodb.find(
                        HtmlParser.COLLECTION_NAME, {
                            'urlset': urlset_name,
                            'processed_metatags': {
                                '$exists': False
                            }
                        })

                    urlset_config = urlset['checks']

                    self.check_has_title_duplicates(parsed_data, urlset_name,
                                                    urlset_config)
                    self.check_has_description_duplicates(
                        parsed_data, urlset_name, urlset_config)

                    for data in parsed_data:
                        print('   + ' + str(data['url']))

                        self.check_has_title(data, urlset_name, urlset_config)
                        self.check_is_title_empty(data, urlset_name,
                                                  urlset_config)
                        self.check_has_title_changed(data, urlset_name,
                                                     urlset_config)

                        self.check_has_description(data, urlset_name,
                                                   urlset_config)
                        self.check_is_description_empty(
                            data, urlset_name, urlset_config)
                        self.check_has_description_changed(
                            data, urlset_name, urlset_config)

                        self.check_has_canonical(data, urlset_name,
                                                 urlset_config)
                        self.check_canonical_is_self_referencing(
                            data, urlset_name, urlset_config)
                        self.check_canonical_href_200(data, urlset_name,
                                                      urlset_config)

                        self.mongodb.update_one(HtmlParser.COLLECTION_NAME,
                                                data['_id'],
                                                {'processed_metatags': True})

                print("\n")

    # METATAG TITLE

    def get_metatitle(self, data: dict, urlset_name: str, urlset_config: dict):
        if 'title' in urlset_config:
            doc = BeautifulSoup(data['body'], "html.parser")
            titles = doc.find_all("title")

            problem_detected = {'multi': False, 'empty': False}
            if titles:
                if len(titles) > 1:
                    problem_detected['multi'] = True
                else:
                    return titles
            else:
                problem_detected['empty'] = True

            return problem_detected

    def save_problem_multi_title(self, multi: bool, data):
        url = data['url']

        value = ''
        error = ''
        valid = False

        if multi:
            error = 'several titletags on page detected'
        else:
            valid = True

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'metatags-has_multiple_titles',
            value,
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

    def check_has_title(self, data: dict, urlset_name: str,
                        urlset_config: dict):
        if 'title' in urlset_config:
            if 'has_title' in urlset_config['title']:
                assert_val = urlset_config['title']['has_title']

                print('      -> check_has_title "' + str(assert_val) + '"',
                      end='')

                valid = False
                multi = False
                empty = False
                titles = {}

                titles = self.get_metatitle(data, urlset_name, urlset_config)
                if 'multi' in titles:
                    if titles['multi']:
                        multi = True
                else:
                    value = ''

                    if titles:
                        for title in titles:
                            if title != '':
                                value = str(title)
                                exists = True
                                if exists == assert_val:
                                    valid = True

                    url = data['url']

                    error = ''
                    if len(titles) == 0 and not valid:
                        error = 'title missing'

                    self.check_service.add_check(
                        self.module_configuration.database,
                        data['urlset'],
                        'metatags-has_title',
                        value,
                        valid,
                        '',
                        error,
                        url.protocol,
                        url.domain,
                        url.path,
                        url.query,
                    )

                    print(' ... has title ' + str(valid))

                self.save_problem_multi_title(multi, data)

    def check_is_title_empty(self, data: dict, urlset_name: str,
                             urlset_config: dict):
        if 'title' in urlset_config:
            if 'is_title_empty' in urlset_config['title']:
                assert_val = urlset_config['title']['is_title_empty']

                print('      -> check_has_title "' + str(assert_val) + '"',
                      end='')

                valid = False

                titles = self.get_metatitle(data, urlset_name, urlset_config)
                value = ''

                empty = False

                for title in titles:
                    value = str(title)
                    if title == '':
                        empty = True
                    if empty == assert_val:
                        valid = True

                url = data['url']

                error = ''
                if empty and valid:
                    error = 'titletag is empty'

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-is_title_empty',
                    value,
                    valid,
                    '',
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

                print(' ... is title empty ' + str(valid))

    def check_has_title_changed(self, data: dict, urlset_name: str,
                                urlset_config: dict):
        if 'title' in urlset_config:
            if 'has_title_changed' in urlset_config['title']:
                assert_val = urlset_config['title']['has_title_changed']

                valid = False

                titles_new = self.get_metatitle(data, urlset_name,
                                                urlset_config)
                value_new = ''

                if len(titles_new) == 1:
                    for title in titles_new:
                        if title != '':
                            value_new = str(title)

                last_parsed_data = self.mongodb.find_last_sorted(
                    HtmlParser.COLLECTION_NAME, {
                        'url.protocol': data['url'].protocol,
                        'url.domain': data['url'].domain,
                        'url.path': data['url'].path,
                        'url.query': data['url'].query,
                        'processed_metatags': {
                            '$exists': True
                        }
                    }, [('date', -1)])

                value_last = ''
                for last_data in last_parsed_data:

                    titles_last = self.get_metatitle(last_data, urlset_name,
                                                     urlset_config)

                    if len(titles_last) == 1:
                        for title in titles_last:
                            if title != '':
                                value_last = str(title)

                check_result = True
                if value_new == value_last:
                    check_result = False  # title has not changed

                if check_result == assert_val:
                    valid = True

                diff = ''
                error = ''
                if not valid and check_result:
                    diff = str(value_last)
                    error = 'title has changed'

                url = data['url']

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-has_title_changed',
                    value_new,
                    valid,
                    diff,
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

                print(' ... has title changed ' + str(valid))

    def check_has_title_duplicates(self, parsed_data: dict, urlset_name: str,
                                   urlset_config: dict):
        if 'title' in urlset_config:
            if 'has_title_duplicates' in urlset_config['title']:
                assert_val = urlset_config['title']['has_title_duplicates']

                valid = True

                titles_dict = {}

                for data in parsed_data:

                    # dict_key = str(data['url'])

                    doc = BeautifulSoup(data['body'], "html.parser")
                    titles = doc.find_all("title")

                    if len(titles) == 1:
                        for title in titles:
                            if title != '':
                                titles_dict[str(data['url'])] = title

                title_sorted = {}

                # geeksforgeeks.org/python-find-keys-with-duplicate-values-in-dictionary/

                for key, value in titles_dict.items():
                    if value not in title_sorted:
                        title_sorted[value] = [key]
                    else:
                        title_sorted[value].append(key)

                title_duplicates = {}

                for key_title, value_urls in title_sorted.items():
                    if len(title_sorted[key_title]) > 1:
                        title_duplicates[key_title] = value_urls
                    elif len(title_sorted[key_title]) == 1:
                        url = ''
                        for url_str in value_urls:
                            url = URL(url_str)
                        valid = False
                        dup = False
                        if dup == assert_val:
                            valid = True
                        value = str(key_title)
                        urlset = ''
                        for data in parsed_data:
                            urlset = data['urlset']

                        self.check_service.add_check(
                            self.module_configuration.database,
                            urlset,
                            'metatags-has_title_duplicates',
                            value,
                            valid,
                            '',
                            '',
                            url.protocol,
                            url.domain,
                            url.path,
                            url.query,
                        )

                for dup_title in title_duplicates:
                    url = ''
                    for problem_url in title_duplicates[dup_title]:
                        url = URL(problem_url)
                        valid = False
                        dup = True
                        if dup == assert_val:
                            valid = True
                        value = str(dup_title)
                        diff = ''
                        for other_url in title_duplicates[dup_title]:
                            if other_url is not problem_url:
                                if diff == '':
                                    diff += other_url
                                else:
                                    diff += ', ' + other_url

                        urlset = ''
                        for data in parsed_data:
                            urlset = data['urlset']

                        error = ''
                        if dup and not valid:
                            error = 'title duplicates in url-set detected'

                        self.check_service.add_check(
                            self.module_configuration.database,
                            urlset,
                            'metatags-has_title_duplicates',
                            value,
                            valid,
                            diff,
                            error,
                            url.protocol,
                            url.domain,
                            url.path,
                            url.query,
                        )

    # METATAG DESCRIPTION

    def get_metadescription(self, data: dict, urlset_name: str,
                            urlset_config: dict):
        if 'description' in urlset_config:
            doc = BeautifulSoup(data['body'], "html.parser")
            metas = doc.find_all("meta", attrs={'name': 'description'})

            problem_detected = {'multi': False, 'empty': False}
            if metas:
                if len(metas) > 1:
                    problem_detected['multi'] = True
                else:
                    return metas
            else:
                problem_detected['empty'] = True

            return problem_detected

    def save_problem_multi_description(self, multi: bool, data):
        url = data['url']

        value = ''
        error = ''
        valid = False

        if multi:
            error = 'several descriptiontags on page detected'
        else:
            valid = True

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            'metatags-has_multiple_descriptions',
            value,
            valid,
            '',
            error,
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

    def check_has_description(self, data: dict, urlset_name: str,
                              urlset_config: dict):
        if 'description' in urlset_config:
            if 'has_description' in urlset_config['description']:
                assert_val = urlset_config['description']['has_description']

                print('      -> check_has_title "' + str(assert_val) + '"',
                      end='')

                valid = False
                multi = False
                empty = False
                titles = {}

                metas = self.get_metadescription(data, urlset_name,
                                                 urlset_config)

                if 'multi' in metas:
                    if metas['multi']:
                        multi = True
                else:
                    value = ''
                    for meta in metas:
                        metadescription = meta.get('content')
                        if metadescription != '':
                            value = metadescription
                            exists = True
                            if exists == assert_val:
                                valid = True

                        url = data['url']

                        self.check_service.add_check(
                            self.module_configuration.database,
                            data['urlset'],
                            'metatags-has_description',
                            value,
                            valid,
                            '',
                            '',
                            url.protocol,
                            url.domain,
                            url.path,
                            url.query,
                        )

                        print(' ... has title ' + str(valid))

                self.save_problem_multi_description(multi, data)

    def check_is_description_empty(self, data: dict, urlset_name: str,
                                   urlset_config: dict):
        if 'description' in urlset_config:
            if 'is_description_empty' in urlset_config['description']:
                assert_val = urlset_config['description'][
                    'is_description_empty']

                print('      -> check_has_title "' + str(assert_val) + '"',
                      end='')

                valid = False

                metas = self.get_metadescription(data, urlset_name,
                                                 urlset_config)
                empty = False

                if 'multi' in metas:
                    if metas['multi']:
                        return
                else:
                    value = ''
                    for meta in metas:
                        metadescription = meta.get('content')
                        value = metadescription
                        if metadescription == '':
                            empty = True
                        if empty == assert_val:
                            valid = True

                    error = ''
                    if empty and not valid:
                        error = 'description is empty'

                    url = data['url']

                    self.check_service.add_check(
                        self.module_configuration.database,
                        data['urlset'],
                        'metatags-is_description_empty',
                        value,
                        valid,
                        '',
                        error,
                        url.protocol,
                        url.domain,
                        url.path,
                        url.query,
                    )

                    print(' ... ' + str(valid))

    def check_has_description_changed(self, data: dict, urlset_name: str,
                                      urlset_config: dict):
        if 'description' in urlset_config:
            if 'has_description_changed' in urlset_config['description']:
                assert_val = urlset_config['description'][
                    'has_description_changed']

                valid = False

                descriptions_new = self.get_metadescription(
                    data, urlset_name, urlset_config)
                value_new = ''

                if len(descriptions_new) == 1:
                    for description in descriptions_new:
                        if description != '':
                            value_new = str(description)

                last_parsed_data = self.mongodb.find_last_sorted(
                    HtmlParser.COLLECTION_NAME, {
                        'url.protocol': data['url'].protocol,
                        'url.domain': data['url'].domain,
                        'url.path': data['url'].path,
                        'url.query': data['url'].query,
                        'processed_metatags': {
                            '$exists': True
                        }
                    }, [('date', -1)])

                value_last = ''
                for last_data in last_parsed_data:

                    descriptions_last = self.get_metadescription(
                        last_data, urlset_name, urlset_config)

                    if len(descriptions_last) == 1:
                        for description in descriptions_last:
                            if description != '':
                                value_last = str(description)

                check_result = True
                if value_new == value_last:
                    check_result = False  # description has not changed

                if check_result == assert_val:
                    valid = True

                diff = ''
                if not valid:
                    diff = str(value_last)

                error = ''
                if not valid and check_result:
                    error = 'description has changed'

                url = data['url']

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-has_description_changed',
                    value_new,
                    valid,
                    diff,
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

    def check_has_description_duplicates(self, parsed_data: dict,
                                         urlset_name: str,
                                         urlset_config: dict):
        if 'description' in urlset_config:
            if 'has_description_duplicates' in urlset_config['description']:
                assert_val = urlset_config['description'][
                    'has_description_duplicates']

                valid = True

                descriptions_dict = {}

                for data in parsed_data:

                    # dict_key = str(data['url'])

                    doc = BeautifulSoup(data['body'], "html.parser")
                    descriptions = doc.find_all("meta",
                                                attrs={'name': 'description'})

                    if len(descriptions) == 1:
                        for description in descriptions:
                            if description.get('content') != '':
                                descriptions_dict[str(
                                    data['url'])] = description.get('content')

                description_sorted = {}

                # geeksforgeeks.org/python-find-keys-with-duplicate-values-in-dictionary/

                for key, value in descriptions_dict.items():
                    if value not in description_sorted:
                        description_sorted[value] = [key]
                    else:
                        description_sorted[value].append(key)

                description_duplicates = {}

                for key_description, value_urls in description_sorted.items():
                    if len(description_sorted[key_description]) > 1:
                        description_duplicates[key_description] = value_urls
                    elif len(description_sorted[key_description]) == 1:
                        url = ''
                        for url_str in value_urls:
                            url = URL(url_str)
                        valid = False
                        dup = False
                        if dup == assert_val:
                            valid = True
                        value = str(key_description)
                        urlset = ''
                        for data in parsed_data:
                            urlset = data['urlset']

                        self.check_service.add_check(
                            self.module_configuration.database,
                            urlset,
                            'metatags-has_description_duplicates',
                            value,
                            valid,
                            '',
                            '',
                            url.protocol,
                            url.domain,
                            url.path,
                            url.query,
                        )

                for dup_description in description_duplicates:
                    url = ''
                    for problem_url in description_duplicates[dup_description]:
                        url = URL(problem_url)
                        valid = False
                        dup = True
                        if dup == assert_val:
                            valid = True
                        value = str(dup_description)
                        diff = ''
                        for other_url in description_duplicates[
                                dup_description]:
                            if other_url is not problem_url:
                                if diff == '':
                                    diff += other_url
                                else:
                                    diff += ', ' + other_url

                        urlset = ''
                        for data in parsed_data:
                            urlset = data['urlset']

                        error = ''
                        if dup and not valid:
                            error = 'description duplicates in url-set detected'

                        self.check_service.add_check(
                            self.module_configuration.database,
                            urlset,
                            'metatags-has_description_duplicates',
                            value,
                            valid,
                            diff,
                            error,
                            url.protocol,
                            url.domain,
                            url.path,
                            url.query,
                        )

    # METATAG CANONICAL

    def get_canonical_href(self, data: dict, urlset_name: str,
                           urlset_config: dict):
        if 'canonical' in urlset_config:

            doc = BeautifulSoup(data['body'], "html.parser")
            links = doc.find_all("link", rel='canonical')
            href = ''

            for link in links:
                href = link['href']

            return href

    def check_has_canonical(self, data: dict, urlset_name: str,
                            urlset_config: dict):
        if 'canonical' in urlset_config:
            if 'has_canonical' in urlset_config['canonical']:
                assert_val = urlset_config['canonical']['has_canonical']

                print('      -> check_has_canonical "' + str(assert_val) + '"',
                      end='')

                valid = False
                exists = False

                canonical_href = self.get_canonical_href(
                    data, urlset_name, urlset_config)
                value = str(canonical_href)
                if canonical_href != '':
                    exists = True
                    if exists == assert_val:
                        valid = True

                url = data['url']

                error = ''
                if not exists and not valid:
                    error = 'no canonical'

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-has_canonical',
                    value,
                    valid,
                    '',
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

                print(' ... ' + str(valid))

    def check_canonical_is_self_referencing(self, data: dict, urlset_name: str,
                                            urlset_config: dict):
        if 'canonical' in urlset_config:
            if 'canonical_is_self_referencing' in urlset_config['canonical']:
                assert_val = urlset_config['canonical'][
                    'canonical_is_self_referencing']

                valid = False
                url = data['url']

                canonical_href = self.get_canonical_href(
                    data, urlset_name, urlset_config)
                value = str(canonical_href)
                if canonical_href != '':
                    if canonical_href == str(url):
                        self_referencing = True
                        if self_referencing == assert_val:
                            valid = True

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-canonical_is_self_referencing',
                    value,
                    valid,
                    '',
                    '',
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

                print(' ... ' + 'self_referencing' + str(valid))

    def check_canonical_href_200(self, data: dict, urlset_name: str,
                                 urlset_config: dict):
        if 'canonical' in urlset_config:
            if 'canonical_href_200' in urlset_config['canonical']:
                assert_val = urlset_config['canonical']['canonical_href_200']

                valid = False
                url = data['url']

                response_200 = False
                error = ''
                canonical_href = self.get_canonical_href(
                    data, urlset_name, urlset_config)
                value = str(canonical_href)
                if canonical_href != '':
                    response = requests.get(canonical_href)
                    if response.status_code == 200:
                        response_200 = True
                    else:
                        error = 'href in canonical not valid'
                if response_200 == assert_val:
                    valid = True

                self.check_service.add_check(
                    self.module_configuration.database,
                    data['urlset'],
                    'metatags-canonical_href_200',
                    value,
                    valid,
                    '',
                    error,
                    url.protocol,
                    url.domain,
                    url.path,
                    url.query,
                )

                print(' ... ' + str(valid))
예제 #5
0
class Responseheader:
    """Operation that checks stored HTTP responses against configured assertions.

    Documents are read from the ``HtmlParser`` collection. For each document
    the configured status-code and header checks are evaluated and every
    result is recorded through the ``Check`` service. Processed documents are
    flagged with ``processed_responseheader`` so they are not checked again.
    """

    def __init__(self, configuration: Configuration, configuration_key: str,
                 connection: Connection):
        """Bind configuration and connections for this operation.

        Raises:
            ConfigurationMissingError: If the connection offers neither
                BigQuery nor an ORM backend.
        """
        if not connection.has_bigquery() and not connection.has_orm():
            raise ConfigurationMissingError(
                'Missing a database configuration for this operation')

        self.configuration = configuration
        self.module_configuration = configuration.operations.get_custom_configuration_operation(
            configuration_key)
        self.mongodb = connection.mongodb
        self.check_service = Check(connection)

    def run(self):
        """Run every configured check on all not-yet-processed documents.

        Does nothing when no urlsets are configured or when the HtmlParser
        collection does not exist.
        """
        if not self.module_configuration.urlsets:
            return

        print('Running operation responseheader:', "\n")

        if not self.mongodb.has_collection(HtmlParser.COLLECTION_NAME):
            return

        for urlset in self.module_configuration.urlsets:
            print(' - "' + str(urlset['url']) + '":')

            # NOTE(review): every value of the urlset entry is used as a
            # urlset name here, including the value stored under 'checks' --
            # confirm this is intended.
            for single_urlset in urlset:
                urlset_name = urlset[single_urlset]

                # Select only documents this operation has not flagged yet.
                # The flag must be the same one written by update_one below,
                # otherwise documents are re-checked on every run.
                parsed_data = self.mongodb.find(
                    HtmlParser.COLLECTION_NAME, {
                        'urlset': urlset_name,
                        'processed_responseheader': {
                            '$exists': False
                        }
                    })

                urlset_config = urlset['checks']

                for data in parsed_data:
                    print('   + ' + str(data['url']))

                    self.check_status_code(data, urlset_config)
                    self.check_content_encoding(data, urlset_config)
                    self.check_cache_control(data, urlset_config)
                    self.check_expires(data, urlset_config)
                    self.check_x_canonical(data, urlset_config)
                    self.check_no_index(data, urlset_config)

                    self.mongodb.update_one(
                        HtmlParser.COLLECTION_NAME, data['_id'],
                        {'processed_responseheader': True})

            print("\n")

    def _record_check(self, data: dict, check_name: str, valid: bool):
        """Store one check result for the document's URL and print the outcome.

        The three free-text arguments of ``add_check`` are passed as empty
        strings; only the validity flag and the URL parts carry information.
        """
        url = data['url']

        self.check_service.add_check(
            self.module_configuration.database,
            data['urlset'],
            check_name,
            '',
            valid,
            '',
            '',
            url.protocol,
            url.domain,
            url.path,
            url.query,
        )

        print(' ... ' + str(valid))

    def _check_header(self, data: dict, urlset_config: dict, config_key: str,
                      header_name: str):
        """Compare one response header with its configured assertion.

        Skips silently when ``config_key`` is not configured. Header names
        and values are lower-cased before the comparison, so the configured
        ``assert`` value has to be lower-case to be able to match. A missing
        header always yields an invalid result.

        Args:
            data: Parsed document providing ``headers``, ``url``, ``urlset``.
            urlset_config: Check configuration of the current urlset.
            config_key: Key of the check inside ``urlset_config``; also used
                to build the printed label and the stored check name.
            header_name: Lower-case name of the header to inspect.
        """
        if config_key not in urlset_config:
            return

        assert_val = urlset_config[config_key]['assert']
        # transform all headers (key,values) to lowercase
        headers = {k.lower(): v.lower() for k, v in data['headers'].items()}

        print('      -> check_' + config_key + ' "' + str(assert_val) + '"',
              end='')

        valid = header_name in headers and headers[header_name] == assert_val

        self._record_check(data, 'responseheader-' + config_key, valid)

    def check_status_code(self, data: dict, urlset_config: dict):
        """Check that ``data['status_code']`` equals the configured value."""
        if 'status_code' not in urlset_config:
            return

        assert_val = urlset_config['status_code']['assert']

        print('      -> check_status_code "' + str(assert_val) + '"',
              end='')

        valid = data['status_code'] == assert_val

        self._record_check(data, 'responseheader-status_code', valid)

    def check_content_encoding(self, data: dict, urlset_config: dict):
        """Check the ``Content-Encoding`` header against its assertion."""
        self._check_header(data, urlset_config, 'content_encoding',
                           'content-encoding')

    def check_cache_control(self, data: dict, urlset_config: dict):
        """Check the ``Cache-Control`` header against its assertion."""
        self._check_header(data, urlset_config, 'cache_control',
                           'cache-control')

    def check_expires(self, data: dict, urlset_config: dict):
        """Check the ``Expires`` header against its assertion."""
        self._check_header(data, urlset_config, 'expires', 'expires')

    def check_x_canonical(self, data: dict, urlset_config: dict):
        """Check the ``X-Canonical`` header against its assertion."""
        self._check_header(data, urlset_config, 'x_canonical', 'x-canonical')

    def check_no_index(self, data: dict, urlset_config: dict):
        """Check the ``No-Index`` header against its assertion.

        NOTE(review): this looks up a header literally named "no-index";
        confirm this is the header the crawled sites actually send.
        """
        self._check_header(data, urlset_config, 'no_index', 'no-index')