# Shared imports assumed by the functions below; helper, privatekeys and test
# are project-local modules providing HTTP helpers, API keys and test routines.
import json
import subprocess
import sys
from datetime import datetime
from time import sleep

from bs4 import BeautifulSoup

import helper
import privatekeys
import test


def find_string(to_find, check_url):
    """Checks whether the string to_find occurs in the raw content of check_url."""
    check_url = check_url.strip()

    try:
        get_content = helper.httpRequestGetContent(check_url)
        # soup = BeautifulSoup(get_content, "html.parser")  # use if you do not want to check the raw HTML code

        return to_find in get_content
    except Exception:
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'
            .format(check_url,
                    sys.exc_info()[0]))
        return False  # treat a failed request as "string not found"


# For Swedish (readability score bands):
# Under 25: children's books.
# 25 to 30: simple texts.
# 30 to 40: normal text / fiction.
# 40 to 50: factual information, for example Wikipedia.
# 50 to 60: non-fiction / specialist texts.
# Over 60: difficult specialist texts / research / dissertations.

# For English:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
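
# The bands above appear to correspond to the Swedish LIX (läsbarhetsindex)
# scale. A minimal, illustrative sketch of how such a score could be computed
# locally, assuming LIX = (words / sentences) + 100 * (long words / words),
# where a long word has more than six characters. Not part of the original module.
import re


def lix_score(text):
    words = re.findall(r'\w+', text)
    sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
    if not words or not sentences:
        return 0.0
    long_words = [w for w in words if len(w) > 6]
    return len(words) / len(sentences) + 100.0 * len(long_words) / len(words)

# Example: lix_score('Detta är en enkel mening. Den innehåller korta ord.')

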
def check_lighthouse(url, strategy='mobile', category='performance'):
    """
    perf = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=performance&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    a11y = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=accessibility&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    practices = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=best-practices&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    pwa = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=pwa&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    seo = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=seo&url=YOUR-SITE&key=YOUR-KEY
    """
    check_url = url.strip()

    pagespeed_api_request = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category={0}&strategy={1}&url={2}&key={3}'.format(
        category, strategy, check_url, privatekeys.googlePageSpeedApiKey)

    get_content = ''

    try:
        get_content = helper.httpRequestGetContent(pagespeed_api_request)
    except Exception:  # giving up and hoping for more luck with the next URL
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'.format(
                check_url, sys.exc_info()[0]))

    # print('Checked \'{0}\' successfully against Google\'s API!'.format(pagespeed_api_request))
    json_content = ''

    try:
        json_content = json.loads(get_content)
    except Exception:  # might crash if the checked resource is not a webpage
        print('Error! JSON failed parsing for the URL "{0}"\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))
        return {}

    # print(json_content)

    # start from the metrics audit, then add every audit that carries a numeric value
    return_dict = json_content['lighthouseResult']['audits']['metrics']['details']['items'][0]

    for item in json_content['lighthouseResult']['audits'].keys():
        try:
            return_dict[item] = json_content['lighthouseResult']['audits'][item]['numericValue']
        except KeyError:
            # the audit has no 'numericValue'
            pass

    return return_dict
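

# Usage sketch, not part of the original module: prints every numeric value
# returned by check_lighthouse for one URL. Requires a valid
# privatekeys.googlePageSpeedApiKey; the metric key names (for example
# 'firstContentfulPaint') come from the Lighthouse response and may vary
# between Lighthouse versions.
def print_lighthouse_summary(url):
    result = check_lighthouse(url, strategy='mobile', category='performance')
    for metric, value in result.items():
        print('{0}: {1}'.format(metric, value))

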
def thirdPartiesCheck(url):
    """Checks which third-party resources are referenced on the URL.

    Attributes: string url
    """
    get_content = helper.httpRequestGetContent(url)
    get_content = BeautifulSoup(get_content, "html.parser")

    for findings in get_content.select('img[src*="//"]'):
        print(findings)

    for findings in get_content.select('iframe[src*="//"]'):
        print(findings)
    for findings in get_content.select('link[href*="//"]'):
        print(findings)
    for findings in get_content.select('script[src*="//"]'):
        print(findings)
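

# A minimal sketch, not part of the original module: collects the unique
# third-party hostnames referenced by the same selectors instead of printing
# raw tags, so they can be compared against the page's own hostname.
from urllib.parse import urlparse


def thirdPartyHosts(url):
    soup = BeautifulSoup(helper.httpRequestGetContent(url), "html.parser")
    own_host = urlparse(url).hostname
    hosts = set()
    for selector, attribute in (('img[src*="//"]', 'src'),
                                ('iframe[src*="//"]', 'src'),
                                ('link[href*="//"]', 'href'),
                                ('script[src*="//"]', 'src')):
        for tag in soup.select(selector):
            host = urlparse(tag.get(attribute, '')).hostname
            if host and host != own_host:
                hosts.add(host)
    return hosts

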
def google_pagespeed_check(check_url, strategy='mobile'):
    """Checks the URL against Google's PageSpeed Insights API (v4).
    In addition to the 'mobile' strategy there is also 'desktop', aimed at desktop users.
    Returns a dictionary of the results.

    attributes: check_url, strategy
    """
    check_url = check_url.strip()

    # urlEncodedURL = parse.quote_plus(check_url)  # making sure no spaces or other special characters break the request (HTTP 400)
    pagespeed_api_request = 'https://www.googleapis.com/pagespeedonline/v4/runPagespeed?url={}&strategy={}&key={}'.format(
        check_url, strategy, privatekeys.googlePageSpeedApiKey)
    # print('HTTP request towards GPS API: {}'.format(pagespeed_api_request))

    get_content = ""

    try:
        get_content = helper.httpRequestGetContent(pagespeed_api_request)
        get_content = BeautifulSoup(get_content, "html.parser")
        get_content = str(get_content.encode("ascii"))
    except Exception:  # giving up and hoping for more luck with the next URL
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'.format(
                check_url, sys.exc_info()[0]))

    # strip the leading "b'" and trailing "'" left by str(bytes), then unescape
    get_content = get_content[2:][:-1]
    get_content = get_content.replace('\\n', '\n').replace("\\'", "\'")
    get_content = get_content.replace('\\\\"', '\\"').replace('""', '"')

    json_content = ''
    try:
        json_content = json.loads(get_content)
    except Exception:  # might crash if the checked resource is not a webpage
        print('Error! JSON failed parsing for the URL "{0}"\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))

    return_dict = {}
    try:
        # overall score
        for key in json_content['ruleGroups'].keys():
            # print('Key: {0}, value {1}'.format(key, json_content['ruleGroups'][key]['score']))
            return_dict[key] = json_content['ruleGroups'][key]['score']

        # page statistics
        for key in json_content['pageStats'].keys():
            # print('Key: {0}, value {1}'.format(key, json_content['pageStats'][key]))
            return_dict[key] = json_content['pageStats'][key]

        # page potential
        for key in json_content['formattedResults']['ruleResults'].keys():
            # print('Key: {0}, value {1}'.format(key, json_content['formattedResults']['ruleResults'][key]['ruleImpact']))
            return_dict[key] = json_content['formattedResults']['ruleResults'][key]['ruleImpact']
        return return_dict
    except Exception:
        print('Error! Request for URL "{0}" failed.\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))
        return return_dict
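

# Usage sketch, not part of the original module: compares the two strategies
# the v4 API supports; 'https://example.com' is a placeholder and a valid
# privatekeys.googlePageSpeedApiKey is required.
def compare_pagespeed_strategies(url):
    for strategy in ('mobile', 'desktop'):
        result = google_pagespeed_check(url, strategy) or {}
        print('{0} returned {1} values for {2}'.format(strategy, len(result), url))

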
def content_check(check_url, strategy='mobile'):
    """
    Checks the content of the page: title, number of links, number of images,
    images without alt text, meta description and readability of the visible text.
    Returns a dictionary of the results.

    attributes: check_url, strategy
    """
    check_url = check_url.strip()
    return_dict = {}

    try:
        get_content = helper.httpRequestGetContent(check_url)
        soup = BeautifulSoup(get_content, "html.parser")
        # soup = soup.encode("ascii")

        pagetitle = soup.title.string
        return_dict['pagetitle'] = '"{0}"'.format(pagetitle)
        pagetitle_length = len(pagetitle)
        return_dict['pagetitle_length'] = pagetitle_length
        num_links = len(soup.find_all('a'))
        return_dict['num_links'] = num_links

        # checking images
        num_images = len(soup.find_all('img'))
        return_dict['num_images'] = num_images

        images = soup.find_all('img')
        i = 0
        for image in images:
            if image.get('alt') is not None:
                i = i + 1
            # print(image.get('alt')) # for debugging

        num_images_without_alt = num_images - i
        return_dict['num_images_without_alt'] = num_images_without_alt

        try:
            meta_desc = soup.find_all(
                attrs={"name": "description"})[0]['content']
            return_dict['meta_desc'] = '"{0}"'.format(meta_desc)
            meta_desc_length = len(meta_desc)
            return_dict['meta_desc_length'] = meta_desc_length
        except IndexError:
            # no meta description tag on the page
            return_dict['meta_desc'] = ''
            return_dict['meta_desc_length'] = 0
        except Exception:
            print('Meta desc check for URL \'{0}\' failed, reason: {1}'.format(
                check_url,
                sys.exc_info()[0]))

        # checking readability
        [
            s.extract()
            for s in soup(['style', 'script', '[document]', 'head', 'title'])
        ]

        check_whole_page = True  # set to False to measure readability of a specific element only, see below
        if check_whole_page:
            visible_text = soup.getText()
        else:
            # "main" and the class "main-wrapper" are examples; change them to whatever
            # element and class wrap the content you want to measure.
            visible_text = soup.find("main", class_="main-wrapper").getText()

            visible_text = "?\n".join(visible_text.split("?"))
            visible_text = "!\n".join(visible_text.split("!"))
            visible_text = ".\n".join(visible_text.split("."))

        file_name = 'tmp/{0}_{1}_{2}.txt'.format(
            str(datetime.today())[:10], 'contentCheck', helper.getUniqueId())
        helper.writeFile(file_name, visible_text)
        # readability = os.system('readability {0}'.format(file_name))
        readability = subprocess.check_output(['readability', file_name])
        readability = readability.decode("utf-8")

        helper.delete_file(file_name)  # comment this out if you'd like to keep the text files that are used
        # helper.writeFile('tmp/readability-output.txt', readability)  # uncomment if you'd like to see the readability output

        # the readability command prints "key: value" lines; turn them into dict entries
        for line in readability.split('\n'):
            try:
                return_dict[line.split(':')[0].strip()] = line.split(
                    ':')[1].strip()
            except IndexError:
                pass

            # print(meta_desc)

    except Exception:  # giving up and hoping for more luck with the next URL
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'
            .format(check_url,
                    sys.exc_info()[0]))
        pass

    return return_dict
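

# Usage sketch, not part of the original module: prints every value that
# content_check collects for a single page; 'https://example.com' is a placeholder.
def print_content_check(url):
    for key, value in content_check(url).items():
        print('{0}: {1}'.format(key, value))

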
def oneOffProcess(file, test_regime='httpStatusCodeCheck'):
    """
    Inspects a text file, assuming it contains URLs, one URL per line.

    attributes: file path to open, test_regime to run
    """
    f = open(file, 'r')

    urlsInTextfile = []
    iteration_counter = 1
    keep_on = True
    time_to_sleep_in_seconds = 90  # TODO: figure out why the Mobile Friendly API cannot handle several checks in a row - is there a rate limit?

    output_file = ""
    i = 1

    while keep_on:
        url = f.readline().replace('\n', '')
        mess_to_console = '{0}. {1}'.format(iteration_counter, url)

        if len(url) < 7:  # stop when the line is shorter than seven characters (too short to be a URL)
            keep_on = False
        elif not url.endswith('.pdf'):
            # depending on which test regime is chosen
            if test_regime == 'httpStatusCodeCheck':
                status_code = test.httpStatusCodeCheck(url, False)
                print('{0} has a status code: {1}'.format(
                    mess_to_console, status_code).replace('\n', ''))
                output_file += '{0}, {1}\n'.format(url.replace('\n', ''),
                                                   status_code)
            elif test_regime == 'sitemapCheck':
                """
                Check the status code of domain.tld/sitemap.xml, assuming URL to only be the domain, not an URI
                """
                if url[-1:] is '/':
                    url = url[:-1]

                url = '{0}/{1}'.format(url, 'sitemap.xml')
                status_code = test.httpStatusCodeCheck(url, False)
                print('{0} has a status code: {1}'.format(
                    mess_to_console, status_code).replace('\n', ''))
                is_sitemap = "undefined"
                if str(status_code)[:1] is "2" or str(
                        status_code
                )[:1] is "3":  # checking if status code is either 200 series or 300
                    is_sitemap = helper.is_sitemap(
                        helper.httpRequestGetContent(url))
                    print('Is sitemap: {0}'.format(is_sitemap))
                output_file += '{0}, {1}, {2}\n'.format(
                    url.replace('\n', ''), status_code, is_sitemap)
            elif test_regime == 'urlHarvest':
                """
                Fetches URLs from a page's content
                """
                i = 0
                print('Harvesting URLs from {0}'.format(url))
                try:
                    for found_url in helper.fetchUrlsFromPage(url, 50):
                        output_file += '{0}\n'.format(found_url)
                        i += 1
                except Exception:
                    print('Error! The URL {0} failed.'.format(url))
                    pass
                #print('Found {0} URLs from {1}'.format(i,url))
            elif test_regime == 'googlePageSpeed':
                check_page = check_lighthouse(url)
                if bool(check_page):
                    print('{0} has been checked against Google Pagespeed API'.
                          format(mess_to_console))
                    for key in check_page:
                        output_file = output_file + '{0},{1},{2}\n'.format(
                            url, key, check_page[key])
            elif test_regime == 'mobileFriendlyCheck':
                print(url)
                status_message = test.mobileFriendlyCheck(
                    url, privatekeys.googleMobileFriendlyApiKey)
                print(
                    "Mobile-friendliness of URL '{0}' was evaluated as: {1}".
                    format(url, status_message))
                output_file += '{0}, {1}\n'.format(url.replace('\n', ''),
                                                   status_message)
                sleep(time_to_sleep_in_seconds)  # sleeping for n seconds
            elif test_regime == 'contentCheck':
                print("{0}. Checking content of URL '{1}'.".format(i, url))
                for key, value in content_check(url).items():
                    output_file = output_file + '{0},{1},{2}\n'.format(
                        url, key, value)
                i = i + 1
            elif test_regime == 'findString':
                searching = find_string('piwik', url)
                print("{0}. Checking for string in URL '{1}' - {2}".format(
                    i, url, searching))
                output_file = output_file + '{0},{1}\n'.format(url, searching)
                i = i + 1

            # sleep(time_to_sleep_in_seconds)  # sleeping for n seconds

            urlsInTextfile.append(url)
            iteration_counter += 1

    f.close()

    ### Writing the report
    file_name = 'rapporter/{0}_{1}_{2}.csv'.format(
        str(datetime.today())[:10], test_regime, helper.getUniqueId())
    helper.writeFile(file_name, output_file)

    print('The report has now been written to a file named: {0}'.format(
        file_name))
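

# Usage sketch, not part of the original module: 'urls.txt' is a placeholder
# for a text file containing one URL per line; the resulting CSV report is
# written to the rapporter/ directory as shown above.
if __name__ == '__main__':
    oneOffProcess('urls.txt', test_regime='httpStatusCodeCheck')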