def find_string(to_find, check_url):
    check_url = check_url.strip()
    try:
        get_content = helper.httpRequestGetContent(check_url)
        # soup = BeautifulSoup(get_content, "html.parser")  # use if not checking the HTML code itself
        return to_find in get_content
    except Exception:
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'
            .format(check_url, sys.exc_info()[0]))
        return False


# LIX readability thresholds for Swedish:
# Below 25    Children's books.
# 25 to 30    Easy texts.
# 30 to 40    Normal text / fiction.
# 40 to 50    Factual information, for example Wikipedia.
# 50 to 60    Specialist texts.
# Above 60    Difficult specialist texts / research / dissertations.
# For English:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
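

# A minimal sketch of how the LIX score referenced above could be computed
# from plain text. The function name lix_score and the regex-based word and
# sentence counting are choices made for this sketch, not an existing helper
# in this module:
import re


def lix_score(text):
    """LIX = words / sentences + 100 * (words longer than six characters) / words"""
    words = re.findall(r'\w+', text)
    sentences = re.findall(r'[.!?]+', text)
    if not words or not sentences:
        return 0.0
    long_words = sum(1 for word in words if len(word) > 6)
    return len(words) / len(sentences) + 100 * long_words / len(words)

# Example: lix_score('Det här är en enkel mening. Den är kort.') gives roughly
# 4.5, well inside the children's books band.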


def check_lighthouse(url, strategy='mobile', category='performance'):
    """
    perf = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=performance&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    a11y = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=accessibility&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    best practices = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=best-practices&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    pwa = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=pwa&strategy=mobile&url=YOUR-SITE&key=YOUR-KEY
    seo = https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category=seo&url=YOUR-SITE&key=YOUR-KEY
    """
    check_url = url.strip()
    pagespeed_api_request = 'https://www.googleapis.com/pagespeedonline/v5/runPagespeed?category={0}&strategy={1}&url={2}&key={3}'.format(
        category, strategy, check_url, privatekeys.googlePageSpeedApiKey)

    get_content = ''
    try:
        get_content = helper.httpRequestGetContent(pagespeed_api_request)
    except Exception:
        # giving up on this URL and hoping for more luck with the next one
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'.format(
                check_url, sys.exc_info()[0]))
    # print('Checked \'{0}\' successfully against Google\'s API!'.format(pagespeed_api_request))

    json_content = ''
    try:
        json_content = json.loads(get_content)
    except Exception:
        # might fail if the checked resource is not a webpage
        print('Error! JSON failed parsing for the URL "{0}"\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))
    # print(json_content)

    # start from the overall metrics, then add every audit that carries a numeric value
    return_dict = json_content['lighthouseResult']['audits']['metrics']['details']['items'][0]
    for item in json_content['lighthouseResult']['audits'].keys():
        try:
            return_dict[item] = json_content['lighthouseResult']['audits'][item]['numericValue']
        except KeyError:
            # this audit has no 'numericValue'
            pass
    return return_dict
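

# Example usage of check_lighthouse, kept commented out like the other debug
# aids in this module. The metric names below are ones Lighthouse v5 commonly
# reports, but treat them as assumptions and inspect the returned dictionary
# for your own pages:
# result = check_lighthouse('https://example.com/', category='performance')
# for metric in ('firstContentfulPaint', 'speedIndex', 'interactive'):
#     if metric in result:
#         print('{0}: {1} ms'.format(metric, result[metric]))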


def thirdPartiesCheck(url):
    """Checks what third parties are used on the URL.

    Attributes: string url
    """
    get_content = helper.httpRequestGetContent(url)
    soup = BeautifulSoup(get_content, "html.parser")

    for findings in soup.select('img[src*="//"]'):
        print(findings)
    for findings in soup.select('iframe[src*="//"]'):
        print(findings)
    for findings in soup.select('link[href*="//"]'):
        print(findings)
    for findings in soup.select('script[src*="//"]'):
        print(findings)
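

# The selectors above match any src/href containing "//", which also catches
# absolute URLs pointing back to the page's own host. Below is a sketch of a
# stricter variant that compares hostnames instead; thirdPartyHosts is a name
# introduced here, not an existing helper in this module:
from urllib.parse import urljoin, urlparse


def thirdPartyHosts(url):
    """Returns the set of hostnames, other than the page's own, that the page loads resources from."""
    own_host = urlparse(url).hostname
    soup = BeautifulSoup(helper.httpRequestGetContent(url), "html.parser")
    hosts = set()
    for tag, attribute in (('img', 'src'), ('iframe', 'src'),
                           ('link', 'href'), ('script', 'src')):
        for element in soup.find_all(tag):
            resource = element.get(attribute)
            if resource is None:
                continue
            # urljoin resolves relative and protocol-relative ("//cdn...") URLs
            host = urlparse(urljoin(url, resource)).hostname
            if host is not None and host != own_host:
                hosts.add(host)
    return hosts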


def google_pagespeed_check(check_url, strategy='mobile'):
    """Checks the Pagespeed Insights with Google.

    In addition to the 'mobile' strategy there is also 'desktop',
    aimed at the desktop user's preferences.
    Returns a dictionary of the results.

    Attributes: check_url, strategy
    """
    check_url = check_url.strip()
    # urlEncodedURL = parse.quote_plus(check_url)  # making sure no spaces or other weird characters mess up the request, causing for instance HTTP 400

    pagespeed_api_request = 'https://www.googleapis.com/pagespeedonline/v4/runPagespeed?url={0}&strategy={1}&key={2}'.format(
        check_url, strategy, privatekeys.googlePageSpeedApiKey)
    # print('HTTP request towards GPS API: {0}'.format(pagespeed_api_request))

    get_content = ''
    try:
        get_content = helper.httpRequestGetContent(pagespeed_api_request)
    except Exception:
        # giving up on this URL and hoping for more luck with the next one
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'.format(
                check_url, sys.exc_info()[0]))

    json_content = ''
    try:
        # the API responds with JSON, so it can be parsed as-is
        json_content = json.loads(get_content)
    except Exception:
        # might fail if the checked resource is not a webpage
        print('Error! JSON failed parsing for the URL "{0}"\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))

    return_dict = {}
    try:
        # overall score
        for key in json_content['ruleGroups'].keys():
            return_dict[key] = json_content['ruleGroups'][key]['score']

        # page statistics
        for key in json_content['pageStats'].keys():
            return_dict[key] = json_content['pageStats'][key]

        # page potential
        for key in json_content['formattedResults']['ruleResults'].keys():
            return_dict[key] = json_content['formattedResults']['ruleResults'][key]['ruleImpact']

        return return_dict
    except Exception:
        print('Error! Request for URL "{0}" failed.\nMessage:\n{1}'.format(
            check_url, sys.exc_info()[0]))
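

# Example usage, commented out like the other debug aids in this module. Note
# that this function targets the older v4 endpoint, while check_lighthouse
# above uses v5. The URL below is only a placeholder:
# result = google_pagespeed_check('https://example.com/', 'desktop')
# if result:
#     for key, value in result.items():
#         print('{0}: {1}'.format(key, value))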


def content_check(check_url, strategy='mobile'):
    """Checks the content of a page: title, links, images, meta description and readability.

    Returns a dictionary of the results.

    Attributes: check_url, strategy
    """
    check_url = check_url.strip()
    return_dict = {}
    try:
        get_content = helper.httpRequestGetContent(check_url)
        soup = BeautifulSoup(get_content, "html.parser")
        # soup = soup.encode("ascii")

        pagetitle = soup.title.string
        return_dict['pagetitle'] = '"{0}"'.format(pagetitle)
        return_dict['pagetitle_length'] = len(pagetitle)

        return_dict['num_links'] = len(soup.find_all('a'))

        # checking images
        images = soup.find_all('img')
        num_images = len(images)
        return_dict['num_images'] = num_images

        num_images_with_alt = 0
        for image in images:
            if image.get('alt') is not None:
                num_images_with_alt += 1
                # print(image.get('alt'))  # for debugging
        return_dict['num_images_without_alt'] = num_images - num_images_with_alt

        try:
            meta_desc = soup.find_all(attrs={"name": "description"})[0]['content']
            return_dict['meta_desc'] = '"{0}"'.format(meta_desc)
            return_dict['meta_desc_length'] = len(meta_desc)
        except IndexError:
            # the page has no meta description
            return_dict['meta_desc'] = ''
            return_dict['meta_desc_length'] = 0
        except Exception:
            print('Meta desc check for URL \'{0}\' failed, reason: {1}'.format(
                check_url, sys.exc_info()[0]))

        # checking readability: remove everything that is not visible text
        for s in soup(['style', 'script', '[document]', 'head', 'title']):
            s.extract()

        check_whole_page = True  # set to False to check only the main content element selected below
        if check_whole_page:
            visible_text = soup.getText()
        else:
            # the tag "main" and the class "main-wrapper" are examples -
            # CHANGE THEM to whatever your page is using
            visible_text = soup.find("main", class_="main-wrapper").getText()

        # put every sentence on a line of its own
        visible_text = "?\n".join(visible_text.split("?"))
        visible_text = "!\n".join(visible_text.split("!"))
        visible_text = ".\n".join(visible_text.split("."))

        file_name = 'tmp/{0}_{1}_{2}.txt'.format(
            str(datetime.today())[:10], 'contentCheck', helper.getUniqueId())
        helper.writeFile(file_name, visible_text)

        # readability = os.system('readability {0}'.format(file_name))
        readability = subprocess.check_output(['readability', file_name])
        readability = readability.decode("utf-8")
        helper.delete_file(file_name)  # comment out if you'd like to inspect the text files that are used
        # helper.writeFile('tmp/readability-output.txt', readability)  # uncomment if you'd like to see the readability output

        for line in readability.split('\n'):
            try:
                key, value = line.split(':', 1)
                return_dict[key.strip()] = value.strip()
            except ValueError:
                # the line does not contain a 'key: value' pair
                pass
    except Exception:
        # giving up on this URL and hoping for more luck with the next one
        print(
            'Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'
            .format(check_url, sys.exc_info()[0]))

    return return_dict
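

# content_check shells out to a "readability" command line tool. A small
# pre-flight check like this one (a sketch, not part of the original module)
# avoids a confusing crash when the tool is missing from PATH:
import shutil


def readability_tool_available():
    """Returns True if the external 'readability' CLI can be found on PATH."""
    return shutil.which('readability') is not None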


def oneOffProcess(file, test_regime='httpStatusCodeCheck'):
    """
    Inspects a text file, assuming it contains URLs, one URL per line.

    Attributes: file path to open
    """
    f = open(file, 'r')
    urlsInTextfile = []
    iteration_counter = 1
    keep_on = True
    time_to_sleep_in_seconds = 90  # TODO: figure out why Mobile Friendly cannot manage several tests in a row - a rate limit?
    output_file = ""
    i = 1
    while keep_on:
        url = f.readline().replace('\n', '')
        mess_to_console = '{0}. {1}'.format(iteration_counter, url)
        if len(url) < 7:
            # stop if the line is shorter than seven characters
            keep_on = False
        elif not url.endswith('.pdf'):
            # depending on which test regime is chosen
            if test_regime == 'httpStatusCodeCheck':
                status_code = test.httpStatusCodeCheck(url, False)
                print('{0} has a status code: {1}'.format(
                    mess_to_console, status_code).replace('\n', ''))
                output_file += '{0}, {1}\n'.format(url.replace('\n', ''),
                                                   status_code)
            elif test_regime == 'sitemapCheck':
                # checks the status code of domain.tld/sitemap.xml, assuming
                # the URL is only the domain, not a URI
                if url[-1:] == '/':
                    url = url[:-1]
                url = '{0}/{1}'.format(url, 'sitemap.xml')
                status_code = test.httpStatusCodeCheck(url, False)
                print('{0} has a status code: {1}'.format(
                    mess_to_console, status_code).replace('\n', ''))

                is_sitemap = "undefined"
                if str(status_code)[:1] == "2" or str(status_code)[:1] == "3":
                    # the status code is in the 200 or 300 series
                    is_sitemap = helper.is_sitemap(
                        helper.httpRequestGetContent(url))
                    print('Is sitemap: {0}'.format(is_sitemap))
                output_file += '{0}, {1}, {2}\n'.format(
                    url.replace('\n', ''), status_code, is_sitemap)
            elif test_regime == 'urlHarvest':
                # fetches URLs from a page's content
                i = 0
                print('Harvesting URLs from {0}'.format(url))
                try:
                    for found_url in helper.fetchUrlsFromPage(url, 50):
                        output_file += '{0}\n'.format(found_url)
                        i += 1
                except Exception:
                    print('Error! The URL {0} failed.'.format(url))
                # print('Found {0} URLs from {1}'.format(i, url))
            elif test_regime == 'googlePageSpeed':
                check_page = check_lighthouse(url)
                if bool(check_page):
                    print('{0} has been checked against Google Pagespeed API'.
                          format(mess_to_console))
                    for key in check_page:
                        output_file += '{0},{1},{2}\n'.format(
                            url, key, check_page[key])
            elif test_regime == 'mobileFriendlyCheck':
                print(url)
                status_message = test.mobileFriendlyCheck(
                    url, privatekeys.googleMobileFriendlyApiKey)
                print("Mobile-friendliness of URL '{0}' was evaluated as: {1}".
                      format(url, status_message))
                output_file += '{0}, {1}\n'.format(url.replace('\n', ''),
                                                   status_message)
                sleep(time_to_sleep_in_seconds)  # sleeping for n seconds
            elif test_regime == 'contentCheck':
                print("{0}. Checking content of URL '{1}'.".format(i, url))
                for key, value in content_check(url).items():
                    output_file += '{0},{1},{2}\n'.format(url, key, value)
                i += 1
            elif test_regime == 'findString':
                searching = find_string('piwik', url)
                print("{0}. Checking for string in URL '{1}' - {2}".format(
                    i, url, searching))
                output_file += '{0},{1}\n'.format(url, searching)
                i += 1
                # sleep(time_to_sleep_in_seconds)  # sleeping for n seconds

        urlsInTextfile.append(url)
        iteration_counter += 1
    f.close()

    ### Writing the report
    file_name = 'rapporter/{0}_{1}_{2}.csv'.format(
        str(datetime.today())[:10], test_regime, helper.getUniqueId())
    helper.writeFile(file_name, output_file)
    print('The report has now been written to a file named: {0}'.format(
        file_name))
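

# A minimal entry point sketch for running this module directly. The file
# name 'urls.txt' is only a placeholder for a text file with one URL per line:
if __name__ == '__main__':
    oneOffProcess('urls.txt', test_regime='httpStatusCodeCheck')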