def xhtml_scanner(website):
    """Scan website for using XHTML.

    The raw HTML is fed to the XML parser; if it parses, the document is
    well-formed XHTML.

    :param website: website to scan
    :return ScannerAttribute: value/bin 1 when the markup parses as XML,
        otherwise 0
    """
    try:
        etree.XML(website.html)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; only parser failures should mean "not XHTML".
        return ScannerAttribute('xhtml', 0, 0, [0, 1])
    return ScannerAttribute('xhtml', 1, 1, [0, 1])
def title_scanner(website):
    """Scan website for a usable <title> tag.

    A title counts only when the tag exists and its string content is
    non-empty after stripping whitespace.

    :param website: website to scan
    :return ScannerAttribute:
    """
    key = 'title_tag'
    tag = website.soup.title
    has_title = (tag is not None
                 and tag.string is not None
                 and tag.string.strip() != '')
    if has_title:
        return ScannerAttribute(key, 1, 1, [0, 1])
    return ScannerAttribute(key, 0, 0, [0, 1])
def twitter_share_scanner(website):
    """Scan website for displaying a Twitter share.

    Looks for <a> elements carrying any of the standard Twitter widget
    CSS classes.

    :param website: website to scan
    :return ScannerAttribute:
    """
    key = 'twitter_share'
    widget_classes = (
        'twitter-share-button',
        'twitter-follow-button',
        'twitter-hashtag-button',
        'twitter-mention-button',
    )
    has_widget = any(
        website.soup.find_all('a', {'class': css_class})
        for css_class in widget_classes
    )
    flag = 1 if has_widget else 0
    return ScannerAttribute(key, flag, flag, [0, 1])
def facebook_share_scanner(website):
    """Scan website for displaying a Facebook share.

    Looks for <div> elements carrying any of the standard Facebook social
    plugin CSS classes.

    :param website: website to scan
    :return ScannerAttribute:
    """
    key = 'facebook_share'
    plugin_classes = (
        'fb-like',
        'fb-send',
        'fb-follow',
        'fb-comments',
        'fb-activity',
        'fb-recommendations',
        'fb-recommendations-bar',
        'fb-facepile',
        'fb-like-box',
        'fb-login-button',
    )
    has_plugin = any(
        website.soup.find_all('div', {'class': css_class})
        for css_class in plugin_classes
    )
    flag = 1 if has_plugin else 0
    return ScannerAttribute(key, flag, flag, [0, 1])
def alexa_rank_scanner(website):
    """Return the Alexa global rank, bucketed into coarse rank bands.

    :param website: website to scan
    :return ScannerAttribute:
    """
    # Bucket boundaries for the rank; the final sys.maxsize bucket is
    # effectively open-ended. (Exact inclusive/exclusive edge behavior is
    # defined by bin_numeric.)
    bins = [1000, 5000, 10000, 25000, 50000, 100000, 250000, sys.maxsize]
    rank = website.alexa_rank
    return ScannerAttribute('alexa_rank', rank, bin_numeric(bins, rank), bins)
def alexa_links_ins_scanner(website):
    """Return the Alexa estimate of incoming links, bucketed descending.

    :param website: website to scan
    :return ScannerAttribute:
    """
    # Boundaries run high-to-low, so more incoming links means a lower bin
    # index (exact edge behavior is defined by bin_numeric_desc).
    bins = [10000, 7500, 5000, 2500, 1000, 500, 0]
    links_in = website.alexa_links_in
    return ScannerAttribute('alexa_links_in', links_in,
                            bin_numeric_desc(bins, links_in), bins)
def url_scanner(website):
    """Scan website for its url.

    Trivial, but the url serves as the website's id, so it is surfaced
    through the same scanner mechanism as every other attribute.

    :param website: website to scan
    :return ScannerAttribute:
    """
    return ScannerAttribute('url', website.url)
def image_count_scanner(website):
    """Scan website for its number of images, bucketed in steps of five.

    :param website: website to scan
    :return ScannerAttribute:
    """
    image_total = len(website.soup.find_all('img'))
    # Interval buckets of width 5; the last one is open-ended.
    bins = [(0, 5), (5, 10), (10, 15), (15, 20), (20, sys.maxsize)]
    return ScannerAttribute('img_count', image_total,
                            index_of_interval_bin(bins, image_total), bins)
def analytics_scanner(website):
    """Check whether or not the website uses Google Analytics.

    A reference to the classic ga.js file inside any <script> element is
    taken as evidence of Google Analytics.

    :param website: website to scan
    :return ScannerAttribute:
    """
    uses_ga = any(
        'google-analytics.com/ga.js' in script.get_text()
        for script in website.soup.find_all('script')
    )
    flag = int(uses_ga)
    return ScannerAttribute('has_analytics', flag, flag, [0, 1])
def page_rank_scanner(website):
    """Scan website for its PageRank.

    The value is pre-set on the website object; this scanner merely routes
    it into preprocessing like any other attribute.

    :param website: website to scan
    :return ScannerAttribute:
    """
    rank = website.google_page_rank
    # One bucket per PageRank step, with an open-ended final bucket.
    bins = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, sys.maxsize]
    return ScannerAttribute('page_rank', rank, bin_numeric(bins, rank), bins)
def html5_tag_scanner(website):
    """Scan website for the number of HTML 5 tags.

    :param website: website to scan
    :return ScannerAttribute:
    """
    total = sum(len(website.soup.find_all(tag)) for tag in html5_tags)
    bins = [1, 10, 50, sys.maxsize]
    return ScannerAttribute('html5_tags', total, bin_numeric(bins, total),
                            bins)
def html5_scanner(website):
    """Scan website for the using HTML 5.

    Considered HTML 5 when the raw markup contains the HTML 5 doctype or
    the document uses at least one HTML 5 element.

    :param website: website to scan
    :return ScannerAttribute:
    """
    uses_html5 = '<!DOCTYPE html>' in website.html or any(
        website.soup.find_all(tag) for tag in html5_tags
    )
    flag = int(uses_html5)
    return ScannerAttribute('html5', flag, flag, [0, 1])
def server_scanner(website):
    """Scan website for its server type.

    Reads the HTTP ``Server`` header (either casing), normalizes it to an
    upper-cased string, and maps it onto one of the known server buckets.

    :param website: website to scan
    :return ScannerAttribute:
    """
    server = ''
    # dict.has_key() was removed in Python 3 — use the `in` operator.
    if 'server' in website.headers:
        server = website.headers['server'].strip().upper()
    elif 'Server' in website.headers:
        server = website.headers['Server'].strip().upper()
    server_bins = bins()  # hoisted: was evaluated three separate times
    index = index_of_discrete_bin(server_bins, server)
    if index == -1:
        # Unrecognized server strings fall into the catch-all bucket.
        index = server_bins.index('UNKNOWN')
    return ScannerAttribute('server', server, index, server_bins)
def keyword_scanner(website):
    """Scan website for using the keywords HTML meta tag.

    Reports 1 only when a ``<meta name="...keywords...">`` tag with a
    non-empty ``content`` attribute exists.

    :param website: website to scan
    :return ScannerAttribute:
    """
    has_keywords = False
    for meta_tag in website.soup.find_all('meta'):
        # Tag.has_key() was removed in BeautifulSoup 4 — use has_attr().
        if meta_tag.has_attr('name') and meta_tag.has_attr('content'):
            if 'keywords' in meta_tag['name'].lower():
                has_keywords = len(meta_tag['content']) > 0
                if has_keywords:
                    break
    return ScannerAttribute('has_keywords', int(has_keywords),
                            int(has_keywords), [0, 1])
def internal_links_scanner(website):
    """Scan website for its number of internal links.

    :param website: website to scan
    :return ScannerAttribute:
    """
    # links_count also reports external links, which this scanner ignores.
    internal, _external = links_count(website)
    return ScannerAttribute('internal_links_count', internal,
                            index_of_interval_bin(bins(), internal), bins())
def alexa_rank_dk_scanner(website):
    """Return the Alexa Danish rank, bucketed into rank bands.

    :param website: website to scan
    :return ScannerAttribute:
    """
    dk_rank = website.alexa_rank_dk
    bins = [10, 50, 250, 1000, 5000, sys.maxsize]
    return ScannerAttribute('alexa_rank_dk', dk_rank,
                            bin_numeric(bins, dk_rank), bins)
def alexa_load_time_scanner(website):
    """Return the Alexa average load time, bucketed in 500 ms steps.

    :param website: website to scan
    :return ScannerAttribute:
    """
    load_time = website.alexa_load_time
    bins = [500, 1000, 1500, 2000, 2500, sys.maxsize]
    return ScannerAttribute('alexa_load_time', load_time,
                            bin_numeric(bins, load_time), bins)
def alexa_lang_scanner(website):
    """Return the language according to Alexa.

    :param website: website to scan
    :return ScannerAttribute:
    """
    lang = website.alexa_lang
    # Known language codes; unmatched values are resolved by bin_fuzzy_text.
    bins = ['dk', 'en', 'sv', 'no', 'de', 'fr']
    return ScannerAttribute('alexa_lang', lang, bin_fuzzy_text(bins, lang),
                            bins)
def alexa_has_adult_content(website):
    """Return the Alexa estimate of whether or not a page contains adult
    content, as a binary attribute.

    :param website: website to scan
    :return ScannerAttribute:
    """
    flag = int(website.alexa_adult_content)
    return ScannerAttribute('alexa_has_adult_content', flag, flag, [0, 1])
def found(cms):
    """Utility method which wraps the detected CMS in a ScannerAttribute.

    :param cms: name of the CMS that was detected
    :return ScannerAttribute:
    """
    position = index_of_discrete_bin(bins(), cms)
    return ScannerAttribute('cms', cms, position, bins())