Wappalyzer.py (forked from chorsley/python-Wappalyzer)
import json
import urllib2
import warnings

import requests

try:
    import re2 as re
except ImportError:
    import re

try:
    from lxml.html import fromstring
except ImportError:
    from BeautifulSoup import BeautifulSoup
    use_lxml = False
else:
    use_lxml = True

APPS_JSON_URL = 'https://raw.github.com/ElbertF/Wappalyzer/master/share/apps.json'

class WappalyzerError(Exception):
    """
    Raised for fatal Wappalyzer errors.
    """
    pass

class WebPage(object):
    """
    Simple representation of a web page, decoupled
    from any particular HTTP library's API.
    """
    def __init__(self, url, html, headers, parsed_html=None):
        """
        Initialize a new WebPage object.

        Parameters
        ----------

        url : str
            The web page URL.
        html : str
            The web page content (HTML).
        headers : dict
            The HTTP response headers.
        parsed_html : optional
            A pre-parsed lxml document to reuse instead of parsing
            `html` again (only honoured when lxml is available).
        """
        self.url = url
        self.html = html
        self.headers = headers
        self.parsed_html = parsed_html

        self._parse_html()

    def _parse_html(self):
        """
        Parse the HTML with lxml (or BeautifulSoup, as a fallback)
        to find <script> and <meta> tags.
        """
        if use_lxml:
            self.parsed_html = self.parsed_html or fromstring(self.html)
            self.scripts = self.parsed_html.xpath('//script/@src')
            self.meta = {
                meta.attrib['name'].lower(): meta.attrib['content']
                for meta in self.parsed_html.xpath('//meta[@name and @content]')
            }
        else:
            self.parsed_html = soup = BeautifulSoup(self.html)
            self.scripts = [script['src'] for script in
                            soup.findAll('script', src=True)]
            self.meta = {
                meta['name'].lower(): meta['content']
                for meta in soup.findAll('meta', attrs=dict(name=True, content=True))
            }

    @classmethod
    def new_from_url(cls, url):
        """
        Constructs a new WebPage object for the URL,
        using the `requests` module to fetch the HTML.

        Parameters
        ----------

        url : str
        """
        response = requests.get(url)

        return cls.new_from_response(response)

    @classmethod
    def new_from_response(cls, response):
        """
        Constructs a new WebPage object from a `requests.Response`,
        reusing its URL, body and headers.

        Parameters
        ----------

        response : requests.Response object
        """
        return cls(response.url, html=response.text, headers=response.headers)

class Wappalyzer(object):
    """
    Python Wappalyzer driver.
    """
    def __init__(self, categories, apps):
        """
        Initialize a new Wappalyzer instance.

        Parameters
        ----------

        categories : dict
            Map of category ids to names, as in apps.json.
        apps : dict
            Map of app names to app dicts, as in apps.json.
        """
        self.categories = categories
        self.apps = apps

        for name, app in self.apps.items():
            self._prepare_app(app)

    @classmethod
    def latest(cls):
        """
        Construct a Wappalyzer instance using the latest
        version of apps.json, as fetched from GitHub.
        """
        fd = urllib2.urlopen(APPS_JSON_URL)
        obj = json.load(fd)

        return cls(categories=obj['categories'], apps=obj['apps'])

    def _prepare_app(self, app):
        """
        Normalize app data, preparing it for the detection phase.
        """
        # Ensure these keys' values are lists
        for key in ['url', 'html', 'script', 'implies']:
            try:
                value = app[key]
            except KeyError:
                app[key] = []
            else:
                if not isinstance(value, list):
                    app[key] = [value]

        # Ensure these keys exist
        for key in ['headers', 'meta']:
            try:
                value = app[key]
            except KeyError:
                app[key] = {}

        # Ensure the 'meta' key is a dict
        obj = app['meta']
        if not isinstance(obj, dict):
            app['meta'] = {'generator': obj}

        # Ensure keys are lowercase
        for key in ['headers', 'meta']:
            obj = app[key]
            app[key] = {k.lower(): v for k, v in obj.items()}

        # Prepare regular expression patterns
        for key in ['url', 'html', 'script']:
            app[key] = [self._prepare_pattern(pattern) for pattern in app[key]]

        for key in ['headers', 'meta']:
            obj = app[key]
            for name, pattern in obj.items():
                obj[name] = self._prepare_pattern(pattern)

    def _prepare_pattern(self, pattern):
        """
        Strip out key:value pairs from the pattern and compile the
        regular expression.
        """
        regex, _, rest = pattern.partition('\\;')
        try:
            return re.compile(regex, re.I)
        except re.error as e:
            warnings.warn("Caught '{error}' compiling regex: {regex}".format(
                error=e, regex=regex))
            # A regex that never matches:
            # http://stackoverflow.com/a/1845097/413622
            return re.compile(r'(?!x)x')

    def _has_app(self, app, webpage):
        """
        Determine whether the web page matches the app signature.
        """
        # Search the easiest things first and save the full-text search
        # of the HTML for last.

        for regex in app['url']:
            if regex.search(webpage.url):
                return True

        for name, regex in app['headers'].items():
            if name in webpage.headers:
                content = webpage.headers[name]
                if regex.search(content):
                    return True

        for regex in app['script']:
            for script in webpage.scripts:
                if regex.search(script):
                    return True

        for name, regex in app['meta'].items():
            if name in webpage.meta:
                content = webpage.meta[name]
                if regex.search(content):
                    return True

        for regex in app['html']:
            if regex.search(webpage.html):
                return True

        return False

    def _get_implied_apps(self, detected_apps):
        """
        Get the set of apps implied by `detected_apps`.
        """
        def __get_implied_apps(apps):
            _implied_apps = set()
            for app in apps:
                try:
                    _implied_apps.update(self.apps[app]['implies'])
                except KeyError:
                    continue
            return _implied_apps

        implied_apps = __get_implied_apps(detected_apps)
        all_implied_apps = set()

        # Follow implications until a fixed point is reached, so that
        # apps implied by implied apps are also included.
        while not all_implied_apps.issuperset(implied_apps):
            all_implied_apps.update(implied_apps)
            implied_apps = __get_implied_apps(all_implied_apps)

        return all_implied_apps

    def analyze(self, webpage, cats=None, apps=None, blacklist=None, include=None):
        """
        Return the set of applications that can be detected on the web page.

        :param webpage: :class:`WebPage` to analyze
        :param cats: List of category names to include (default all)
        :param blacklist: List of blacklisted app names (default none)
        :param include: List of app names to consider even if blacklisted
            (default none)
        """
        blacklist = blacklist or []
        detected_apps = set()

        for app_name, app in self.apps.items():
            # Skip blacklisted apps unless they are explicitly included.
            if app_name in blacklist and not (include and app_name in include):
                continue

            # Filter specific categories
            if cats and not next(
                (cat for cat in app['cats'] if self.categories[cat] in cats),
                None
            ):
                continue

            if self._has_app(app, webpage):
                detected_apps.add(app_name)

        detected_apps |= self._get_implied_apps(detected_apps)

        return detected_apps
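

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module); the URL is
    # only an example.
    wappalyzer = Wappalyzer.latest()
    webpage = WebPage.new_from_url('http://example.com')
    print wappalyzer.analyze(webpage)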