Exemplo n.º 1
0
def xhtml_scanner(website):
    """Scan website for using XHTML

    The page is reported as XHTML when ``etree.XML`` can parse its markup
    without raising.

    :param website: website to scan (its ``html`` attribute is parsed)
    :return ScannerAttribute: 'xhtml' attribute, 1 when parsing succeeded,
        0 otherwise
    """
    try:
        etree.XML(website.html)
    except Exception:
        # Any parsing failure means "not XHTML". Catching Exception instead
        # of using a bare except lets KeyboardInterrupt and SystemExit
        # propagate to the caller.
        return ScannerAttribute('xhtml', 0, 0, [0, 1])
    return ScannerAttribute('xhtml', 1, 1, [0, 1])
Exemplo n.º 2
0
def title_scanner(website):
    """Check whether the website has a non-blank <title> tag

    :param website: website to scan
    :return ScannerAttribute: 'title_tag' attribute, 1 when the title holds
        text other than whitespace, 0 otherwise
    """
    title_tag = website.soup.title
    # A missing tag and a tag without a single string both count as "no title".
    text = None if title_tag is None else title_tag.string
    has_title = text is not None and text.strip() != ''
    score = 1 if has_title else 0
    return ScannerAttribute('title_tag', score, score, [0, 1])
Exemplo n.º 3
0
def twitter_share_scanner(website):
    """Check whether the website shows a Twitter button

    :param website: website to scan
    :return ScannerAttribute: 'twitter_share' attribute, 1 when an ``<a>``
        tag carries one of the Twitter button classes, 0 otherwise
    """
    button_classes = (
        'twitter-share-button',
        'twitter-follow-button',
        'twitter-hashtag-button',
        'twitter-mention-button',
    )
    # any() stops at the first class that yields a match.
    has_button = any(
        len(website.soup.find_all('a', {'class': css_class})) > 0
        for css_class in button_classes
    )
    score = 1 if has_button else 0
    return ScannerAttribute('twitter_share', score, score, [0, 1])
Exemplo n.º 4
0
def facebook_share_scanner(website):
    """Check whether the website shows a Facebook widget

    :param website: website to scan
    :return ScannerAttribute: 'facebook_share' attribute, 1 when a ``<div>``
        carries one of the Facebook widget classes, 0 otherwise
    """
    widget_classes = (
        'fb-like',
        'fb-send',
        'fb-follow',
        'fb-comments',
        'fb-activity',
        'fb-recommendations',
        'fb-recommendations-bar',
        'fb-facepile',
        'fb-like-box',
        'fb-login-button',
    )
    # any() stops at the first class that yields a match.
    has_widget = any(
        len(website.soup.find_all('div', {'class': css_class})) > 0
        for css_class in widget_classes
    )
    score = 1 if has_widget else 0
    return ScannerAttribute('facebook_share', score, score, [0, 1])
Exemplo n.º 5
0
def alexa_rank_scanner(website):
    """Return the Alexa global rank.

    :param website: website to scan (reads ``website.alexa_rank``)
    :return ScannerAttribute: 'alexa_rank' attribute holding the raw rank,
        the bin computed by ``bin_numeric`` and the bin boundaries
    """
    # Boundaries handed to bin_numeric; presumably ascending upper bounds,
    # with sys.maxsize as the catch-all -- confirm against bin_numeric.
    rank_bins = [1000, 5000, 10000, 25000, 50000, 100000, 250000, sys.maxsize]
    global_rank = website.alexa_rank
    rank_bin = bin_numeric(rank_bins, global_rank)
    return ScannerAttribute('alexa_rank', global_rank, rank_bin, rank_bins)
Exemplo n.º 6
0
def alexa_links_ins_scanner(website):
    """Return the Alexa estimate of incoming links.

    :param website: website to scan (reads ``website.alexa_links_in``)
    :return ScannerAttribute: 'alexa_links_in' attribute holding the raw
        count, the bin computed by ``bin_numeric_desc`` and the boundaries
    """
    # Descending boundaries; presumably counts above 10000 land in bin 0,
    # 7501 - 10000 in bin 1, and so on -- confirm against bin_numeric_desc.
    link_bins = [10000, 7500, 5000, 2500, 1000, 500, 0]
    incoming = website.alexa_links_in
    link_bin = bin_numeric_desc(link_bins, incoming)
    return ScannerAttribute('alexa_links_in', incoming, link_bin, link_bins)
Exemplo n.º 7
0
def url_scanner(website):
    """Expose the website url as a scanned attribute

    The url serves as the website id, so it is produced by a scanner
    like every other attribute even though no real scanning happens.

    :param website: website to scan
    :return ScannerAttribute: 'url' attribute holding ``website.url``
    """
    address = website.url
    return ScannerAttribute('url', address)
Exemplo n.º 8
0
def image_count_scanner(website):
    """Count the <img> tags of a website

    :param website: website to scan
    :return ScannerAttribute: 'img_count' attribute holding the raw count,
        the interval index from ``index_of_interval_bin`` and the intervals
    """
    images = website.soup.find_all('img')
    total = len(images)
    intervals = [(0, 5), (5, 10), (10, 15), (15, 20), (20, sys.maxsize)]
    interval_index = index_of_interval_bin(intervals, total)
    return ScannerAttribute('img_count', total, interval_index, intervals)
Exemplo n.º 9
0
def analytics_scanner(website):
    """Check whether or not the website uses Google Analytics

    :param website: website to scan
    :return ScannerAttribute: 'has_analytics' attribute, 1 when the text of
        some ``<script>`` tag mentions the ga.js file, 0 otherwise
    """
    # Detection is simply a substring search for the ga.js location in the
    # text of each script tag; any() stops at the first hit.
    uses_ga = any(
        'google-analytics.com/ga.js' in script_tag.get_text()
        for script_tag in website.soup.find_all('script')
    )
    flag = int(uses_ga)
    return ScannerAttribute('has_analytics', flag, flag, [0, 1])
Exemplo n.º 10
0
def page_rank_scanner(website):
    """Expose the website's PageRank as a scanned attribute.

    The value is already stored on the website object; wrapping it in a
    scanner makes it part of the preprocessing like any other attribute.

    :param website: website to scan (reads ``website.google_page_rank``)
    :return ScannerAttribute: 'page_rank' attribute holding the raw rank,
        the bin computed by ``bin_numeric`` and the bin boundaries
    """
    rank_bins = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, sys.maxsize]
    page_rank = website.google_page_rank
    rank_bin = bin_numeric(rank_bins, page_rank)
    return ScannerAttribute('page_rank', page_rank, rank_bin, rank_bins)
Exemplo n.º 11
0
def html5_tag_scanner(website):
    """Count the HTML 5 tags used by a website

    :param website: website to scan
    :return ScannerAttribute: 'html5_tags' attribute holding the total
        number of occurrences of the tags listed in ``html5_tags``, the bin
        computed by ``bin_numeric`` and the bin boundaries
    """
    total = sum(len(website.soup.find_all(tag_name)) for tag_name in html5_tags)
    tag_bins = [1, 10, 50, sys.maxsize]
    tag_bin = bin_numeric(tag_bins, total)
    return ScannerAttribute('html5_tags', total, tag_bin, tag_bins)
Exemplo n.º 12
0
def html5_scanner(website):
    """Scan website for using HTML 5.

    A website counts as HTML 5 when it contains at least one of the tags
    listed in ``html5_tags`` or declares the HTML 5 doctype.

    :param website: website to scan
    :return ScannerAttribute: 'html5' attribute, 1 when HTML 5 usage was
        detected, 0 otherwise
    """
    uses_html5_tag = any(
        len(website.soup.find_all(tag)) > 0 for tag in html5_tags
    )
    # The doctype is case-insensitive in HTML, so "<!doctype html>" must be
    # recognised as well as "<!DOCTYPE html>".
    has_html5_doctype = '<!doctype html>' in website.html.lower()

    is_found = uses_html5_tag or has_html5_doctype
    return ScannerAttribute('html5', int(is_found), int(is_found), [0, 1])
Exemplo n.º 13
0
def server_scanner(website):
    """Scan website for its server type

    The server name is read from the 'server' (or 'Server') response
    header, stripped and upper-cased. A name for which
    ``index_of_discrete_bin`` reports -1 is placed in the 'UNKNOWN' bin.

    :param website: website to scan (reads ``website.headers``)
    :return ScannerAttribute: 'server' attribute holding the normalised
        server name ('' when no header is present), its bin index and the
        bins
    """
    headers = website.headers
    server = ''
    # Use the ``in`` operator rather than has_key(): has_key() no longer
    # exists on mappings in Python 3, while ``in`` works everywhere.
    for header_name in ('server', 'Server'):
        if header_name in headers:
            server = headers[header_name].strip().upper()
            break

    server_bins = bins()
    index = index_of_discrete_bin(server_bins, server)
    if index == -1:
        index = server_bins.index('UNKNOWN')
    return ScannerAttribute('server', server, index, server_bins)
Exemplo n.º 14
0
def keyword_scanner(website):
    """Scan website for using the keywords HTML meta tag

    :param website: website to scan
    :return ScannerAttribute: 'has_keywords' attribute, 1 when a ``<meta>``
        tag whose name contains 'keywords' has non-empty content,
        0 otherwise
    """
    has_keywords = False
    for meta_tag in website.soup.find_all('meta'):
        # Tag.get() returns None for a missing attribute; it replaces the
        # deprecated Tag.has_key() lookups.
        name = meta_tag.get('name')
        content = meta_tag.get('content')
        if name is None or content is None:
            continue
        if 'keywords' in name.lower() and len(content) > 0:
            has_keywords = True
            break
    flag = int(has_keywords)
    return ScannerAttribute('has_keywords', flag, flag, [0, 1])
Exemplo n.º 15
0
def internal_links_scanner(website):
    """Scan website for its number of internal links

    :param website: website to scan
    :return ScannerAttribute: 'internal_links_count' attribute holding the
        first value returned by ``links_count``, its interval index and the
        bins
    """
    # links_count yields a pair; only the first value is used here.
    internal_total, _ = links_count(website)
    interval_index = index_of_interval_bin(bins(), internal_total)
    intervals = bins()
    return ScannerAttribute('internal_links_count', internal_total,
                            interval_index, intervals)
Exemplo n.º 16
0
def alexa_rank_dk_scanner(website):
    """Return the Alexa Danish rank.

    :param website: website to scan (reads ``website.alexa_rank_dk``)
    :return ScannerAttribute: 'alexa_rank_dk' attribute holding the raw
        rank, the bin computed by ``bin_numeric`` and the bin boundaries
    """
    rank_bins = [10, 50, 250, 1000, 5000, sys.maxsize]
    danish_rank = website.alexa_rank_dk
    rank_bin = bin_numeric(rank_bins, danish_rank)
    return ScannerAttribute('alexa_rank_dk', danish_rank, rank_bin, rank_bins)
Exemplo n.º 17
0
def alexa_load_time_scanner(website):
    """Return the Alexa average load time.

    :param website: website to scan (reads ``website.alexa_load_time``)
    :return ScannerAttribute: 'alexa_load_time' attribute holding the raw
        load time, the bin computed by ``bin_numeric`` and the boundaries
    """
    time_bins = [500, 1000, 1500, 2000, 2500, sys.maxsize]
    elapsed = website.alexa_load_time
    time_bin = bin_numeric(time_bins, elapsed)
    return ScannerAttribute('alexa_load_time', elapsed, time_bin, time_bins)
Exemplo n.º 18
0
def alexa_lang_scanner(website):
    """Return the language according to Alexa.

    :param website: website to scan (reads ``website.alexa_lang``)
    :return ScannerAttribute: 'alexa_lang' attribute holding the raw
        language, the bin computed by ``bin_fuzzy_text`` and the known
        language codes
    """
    language_codes = ['dk', 'en', 'sv', 'no', 'de', 'fr']
    language = website.alexa_lang
    language_bin = bin_fuzzy_text(language_codes, language)
    return ScannerAttribute('alexa_lang', language, language_bin,
                            language_codes)
Exemplo n.º 19
0
def alexa_has_adult_content(website):
    """Return the Alexa estimate of whether or not a page contains adult content.

    :param website: website to scan (reads ``website.alexa_adult_content``)
    :return ScannerAttribute: 'alexa_has_adult_content' attribute holding
        the estimate converted with ``int`` as both value and bin
    """
    flag = int(website.alexa_adult_content)
    return ScannerAttribute('alexa_has_adult_content', flag, flag, [0, 1])
Exemplo n.º 20
0
def found(cms):
    """Utility method which wraps the detected CMS in a ScannerAttribute.

    :param cms: the CMS that was found
    :return ScannerAttribute: 'cms' attribute holding the CMS, its index
        from ``index_of_discrete_bin`` and the bins
    """
    cms_index = index_of_discrete_bin(bins(), cms)
    cms_bins = bins()
    return ScannerAttribute('cms', cms, cms_index, cms_bins)