def should_whitelist(self, url, top_url):
        """Check if `url` is whitelisted on `top_url` due to the entitylist

        Parameters
        ----------
        url : string
            The URL or hostname to classify.
        top_url : string
            The URL or hostname of the top-level page on which `url` was loaded.

        Returns
        -------
        bool
            True if `url` would have been whitelisted by the entitylist.
        """
        if not url.startswith('http'):
            url = 'http://' + url
        if not top_url.startswith('http'):
            top_url = 'http://' + top_url
        top_host = urlparse(top_url).hostname
        top_ps1 = du.get_ps_plus_1(top_url)
        url_host = urlparse(url).hostname
        url_ps1 = du.get_ps_plus_1(url)
        if top_host in self._entitylist:
            resources = self._entitylist[top_host]
        elif top_ps1 in self._entitylist:
            resources = self._entitylist[top_ps1]
        else:
            return False
        return url_host in resources or url_ps1 in resources
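
A minimal call sketch, assuming the method above lives on a wrapper object (hypothetical name `EntityListPolicy`) whose `_entitylist` maps an owning domain to the set of affiliated resource domains:

policy = EntityListPolicy({
    "example.com": {"examplecdn.net", "static.example.org"},
})
# Same entity: PS+1 of the top URL ("example.com") is in the entity map and
# the resource host is one of its affiliated domains.
policy.should_whitelist("https://examplecdn.net/app.js", "https://www.example.com/")  # True
# Unrelated third party: not listed under the example.com entity.
policy.should_whitelist("https://tracker.io/p.gif", "https://www.example.com/")  # False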
Example #2
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = filter(
        lambda x: (x.get_attribute("href") and du.get_ps_plus_1(
            urljoin(url, x.get_attribute("href"))) == ps1),
        webdriver.find_elements_by_tag_name("a"))
    return links
Example #3
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute("href")
        except StaleElementReferenceException:
            continue
        if href is None:
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith("http"):
            continue
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
Example #4
def get_intra_links(webdriver: WebDriver, url: str) -> List[WebElement]:
    """
    Get all links that lead to a subdomain.
    Ignores StaleElement Exceptions.
    """
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute('href')
        except StaleElementReferenceException:
            continue
        if not isinstance(href, str):
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith('http'):
            continue
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
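
A short usage sketch, assuming a Selenium 3-style driver (the snippets above rely on `find_elements_by_tag_name`, which Selenium 4 removed) and a hypothetical target URL:

from selenium import webdriver

driver = webdriver.Firefox()
try:
    driver.get("https://www.example.com/")
    # Collect only anchors whose resolved href stays on the same PS+1
    for elem in get_intra_links(driver, "https://www.example.com/"):
        print(elem.get_attribute("href"))
finally:
    driver.quit()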
Example #5
def get_option_dict(url, top_level_url, resource_type=None):
    """Build an options dict for BlockListParser.
    
    These options are checked here:
    * https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L104-L117
    * https://github.com/englehardt/abp-blocklist-parser/blob/40f6bb5b91ea403b7b9852a16d6c57d5ec26cf7f/abp_blocklist_parser/RegexParser.py#L240-L248

    Parameters
    ----------
    url : string
        The URL of the requested resource.
    top_level_url : string
        The URL of the top-level frame of the requested resource
    resource_type : string
        All possible values are here https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/API/webRequest/ResourceType

    Returns
    -------
    dict
        An "options" dictionary for use with BlockListParser
    """
    options = {}
    # Add type option. Value doesn't matter.
    if resource_type:
        try:
            options[type_to_option[resource_type]] = True
        except KeyError:
            raise ValueError(
                "Argument %s given for `resource_type` not found in map." %
                resource_type)
    options["domain"] = urlparse(top_level_url).hostname
    if options["domain"] == None:
        # If somehow the top_level_url should be unparseable
        return None

    # Add third-party option if third party. Value doesn't matter.
    if du.get_ps_plus_1(url) != du.get_ps_plus_1(top_level_url):
        options["third-party"] = True
    return options
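
An illustrative call with hypothetical URLs, assuming the module-level `type_to_option` map translates the WebExtensions resource type "script" to the ABP option name "script":

opts = get_option_dict(
    "https://cdn.tracker.net/lib.js",
    "https://news.example.com/article.html",
    resource_type="script",
)
# Expected shape: {"script": True, "domain": "news.example.com", "third-party": True}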
Example #6
def test_get_ps_plus_1_on_vanilla_public_suffix():
    assert get_ps_plus_1('http://www.google.com') == 'google.com'
Example #7
def test_when_anchor():
    assert get_ps_plus_1('http://www.google.com#anchor') == 'google.com'
Example #8
def test_get_ps_plus_one_on_relative_url():
    assert get_ps_plus_1('/my/path/is.html') == ''
Example #9
def test_get_ps_plus_one_on_about_blank():
    result = get_ps_plus_1('about:blank')
    assert result == ''
Example #10
def test_get_ps_plus_one_no_https():
    result = get_ps_plus_1('my.domain.cloudfront.net')
    assert result == 'domain.cloudfront.net'
Example #11
jsondata = []
for f in files:
    with open(f) as jsonfile:
        jsondata.append(json.load(jsonfile))

finaltld = {}


def parse(string):
    return string.split("/")[2]
    # For a different interpretation of TLD+1, use this return statement instead:
    # return "".join([string.split("/")[2], "/", string.split("/")[3]])


results = [set() for i in range(4)]
frequency_dict = {}
for k in range(len(jsondata)):
    for i in range(len(jsondata[k])):
        for j in jsondata[k][i][1]:
            if j.startswith("http"):
                tld = du.get_ps_plus_1(j)
                results[k].add(tld)

intersect = (results[0].intersection(results[1])).intersection(
    results[2]).intersection(results[3])
print(len(intersect))
with open('intersect.json', 'w') as f:
    json.dump(list(intersect), f)
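
The pairwise chain above assumes exactly four result sets; a sketch of an equivalent, arity-independent form (assuming every entry of `results` is a set):

common_tlds = set.intersection(*results)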
Example #12
def get_option_dict(request):
    """Build an options dict for BlockListParser.

    Parameters
    ----------
    request : sqlite3.Row
        A single HTTP request record pulled from OpenWPM's http_requests table.

    Returns
    -------
    dict
        An "options" dictionary for use with BlockListParser.

    Notes
    -----
    BlockListParser's BINARY_OPTIONS are:
        "script", "image", "stylesheet", "object", "xmlhttprequest",
        "object-subrequest", "subdocument", "document", "elemhide", "other",
        "background", "xbl", "ping", "dtd", "media", "third-party",
        "match-case", "collapse", "donottrack"

    References
    ----------
    [1] https://github.com/MoonchildProductions/UXP/blob/master/dom/base/nsIContentPolicyBase.idl
    [2] https://adblockplus.org/en/filters#options
    [3] Englehardt, S., & Narayanan, A. (2016, October). Online tracking: A
        1-million-site measurement and analysis. In Proceedings of the 2016 ACM
        SIGSAC Conference on Computer and Communications Security
        (pp. 1388-1401). ACM.
    """

    OPTIONS = {
        "other":             {'enabled': False, 'content_policy_type': 1},
        "script":            {'enabled': True,  'content_policy_type': 2},
        "image":             {'enabled': True,  'content_policy_type': 3},
        "stylesheet":        {'enabled': True,  'content_policy_type': 4},
        "object":            {'enabled': True,  'content_policy_type': 5},
        "document":          {'enabled': False, 'content_policy_type': 6},
        "subdocument":       {'enabled': True,  'content_policy_type': 7},
        "xbl":               {'enabled': False, 'content_policy_type': 9},
        "ping":              {'enabled': False, 'content_policy_type': 10},
        "xmlhttprequest":    {'enabled': True,  'content_policy_type': 11},
        "object-subrequest": {'enabled': True,  'content_policy_type': 12},
        "dtd":               {'enabled': False, 'content_policy_type': 13},
        "media":             {'enabled': False, 'content_policy_type': 15},
        "elemhide":          {'enabled': False},
        "background":        {'enabled': False},
        "third-party":       {'enabled': False},
        "match-case":        {'enabled': False},
        "collapse":          {'enabled': False},
        "donottrack":        {'enabled': False},
        "domain":            {'enabled': False},
    }
    options = {}

    try:
        for name in OPTIONS:
            if not OPTIONS[name]['enabled']:
                continue
            if name == 'third-party':
                options["third-party"] = (du.get_ps_plus_1(request['url']) !=
                                          du.get_ps_plus_1(request['top_level_url']))
            elif name == 'domain':
                options["domain"] = urlparse(request['top_level_url']).hostname
            elif 'content_policy_type' in OPTIONS[name]:
                options[name] = (request['content_policy_type'] ==
                                 OPTIONS[name]['content_policy_type'])
    except Exception:
        # Missing columns or malformed URLs: return whatever options were built
        pass
    return options
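
An illustrative call with a plain dict standing in for the `sqlite3.Row` (hypothetical values; `content_policy_type` 2 corresponds to the "script" entry above):

fake_request = {
    "url": "https://cdn.tracker.net/lib.js",
    "top_level_url": "https://news.example.com/article.html",
    "content_policy_type": 2,
}
# With the enabled flags above, only the content-policy-backed options are
# emitted, e.g. options["script"] is True and options["image"] is False.
print(get_option_dict(fake_request))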
Example #13
def test_get_ps_plus_1_on_fbsbx_example():
    # apps.fbsbx.com is on the public suffix list (Apr 2, 2020)
    assert get_ps_plus_1(
        'http://foo.blah.apps.fbsbx.com') == 'blah.apps.fbsbx.com'
    assert get_ps_plus_1('http://foo.blah.www.fbsbx.com') == 'fbsbx.com'
Example #14
def fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, furl):
    """Finds a newsletter form on the page. If not found, visits <num_links>
    internal links and scans those pages for a form. Submits the form if found.
    """
    # skipping: load the site
    # skipping: connecting to logger

    # try to find a newsletter form on the landing page
    if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                           browser_params, manager_params):
        return

    # otherwise, scan more pages
    print("couldn't find form, going to click around")
    main_handle = webdriver.current_window_handle
    visited_links = set()
    for i in range(num_links):
        # get all links on the page
        links = webdriver.find_elements_by_tag_name('a')
        random.shuffle(links)

        current_url = webdriver.current_url
        current_ps1 = domain_utils.get_ps_plus_1(current_url)

        # find links to click
        match_links = []
        start_time = timeit.default_timer()
        for link in links:
            try:
                if not link.is_displayed():
                    continue

                # check if link is valid and not already visited
                href = link.get_attribute('href')
                if href is None or href in visited_links:
                    continue

                # check if this is an internal link
                if not _is_internal_link(href, current_url, current_ps1):
                    continue

                link_text = link.text.lower()

                # skip links with blacklisted text
                blacklisted = False
                for bl_text in _LINK_TEXT_BLACKLIST:
                    if bl_text in link_text:
                        blacklisted = True
                        break
                if blacklisted:
                    continue

                # should we click this link?
                link_rank = 0
                for match_type, s, rank, flags in _LINK_TEXT_RANK:
                    if ((match_type == _TYPE_TEXT and s in link_text)
                            or (match_type == _TYPE_HREF and s in href)):
                        if flags & _FLAG_IN_NEW_URL_ONLY:
                            # don't use this link if the current page URL already matches too
                            if match_type == _TYPE_HREF and s in current_url:
                                continue

                        # link matches!
                        link_rank = rank
                        match_links.append(
                            (link, rank, link_text, href, flags))
                        break
                if link_rank >= _LINK_RANK_SKIP:  # good enough, stop looking
                    break
            except Exception:
                print("ERROR while looping through links...")
                sys.exit(1)

            # quit if too much time passed (for some reason, this is really slow...)
            if match_links and timeit.default_timer(
            ) - start_time > _LINK_MATCH_TIMEOUT:
                break

        # find the best link to click
        if not match_links:
            break  # no more links to click
        match_links.sort(key=lambda l: l[1])
        next_link = match_links[-1]
        visited_links.add(next_link[3])

        # click the link
        try:
            # load the page
            print("clicking on link '%s' - %s" % (next_link[2], next_link[3]))
            next_link[0].click()
            time.sleep(_PAGE_LOAD_TIME)
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)
            # if browser_params['bot_mitigation']:
            #     bot_mitigation(webdriver)

            # find newsletter form
            if _find_and_fill_form(webdriver, email_producer, visit_id, debug,
                                   browser_params, manager_params):
                return

            # should we stay on this page?
            if next_link[4] & _FLAG_STAY_ON_PAGE:
                continue

            # go back
            webdriver.back()
            wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

            # check other windows (ex. pop-ups)
            windows = webdriver.window_handles
            if len(windows) > 1:
                form_found_in_popup = False
                for window in windows:
                    if window != main_handle:
                        webdriver.switch_to_window(window)
                        wait_until_loaded(webdriver, _PAGE_LOAD_TIME)

                        # find newsletter form
                        if _find_and_fill_form(webdriver, email_producer,
                                               visit_id, debug, browser_params,
                                               manager_params):
                            form_found_in_popup = True

                        webdriver.close()
                webdriver.switch_to_window(main_handle)
                time.sleep(1)

                if form_found_in_popup:
                    return
        except Exception:
            # ignore click/navigation errors and move on to the next link
            pass

    # if you reach here, signup wasn't successful -- save the information
    with open(failfile, 'a') as wh:
        wh.write(furl + '\n')
Example #15
def test_get_ps_plus_1_on_exotic_public_suffix():
    assert get_ps_plus_1(
        'http://foo.bar.website.apartments') == 'website.apartments'
Example #16
def _is_internal_link(href, url, ps1=None):
    """Returns whether the given link is an internal link."""
    if ps1 is None:
        ps1 = domain_utils.get_ps_plus_1(url)
    return domain_utils.get_ps_plus_1(urljoin(url, href)) == ps1
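
Two quick sanity checks with hypothetical URLs; relative hrefs are resolved against the page URL before the PS+1 comparison:

assert _is_internal_link("/about", "https://shop.example.com/home")
assert not _is_internal_link("https://cdn.other.net/a.js",
                             "https://shop.example.com/home")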
Example #17
if __name__ == '__main__':
    # to switch tabs: driver.switch_to_window(driver.window_handles[0])
    port = sys.argv[1]
    chrome_options = Options()
    chrome_options.debugger_address = "127.0.0.1:" + port
    exe_path = "./chromedriver"
    driver = webdriver.Chrome(executable_path=exe_path, options=chrome_options)

    # config for fill_forms
    # TODO: give large num_links and longer wait time to js in final crawl
    num_links = 3
    page_timeout = 8  # actually not used anywhere
    debug = True
    webdriver = driver
    # email_producer is a function, modify according to use-case
    proxy_queue = None
    browser_params = None
    manager_params = None
    # the website URL is passed in from the JS screenshot harness
    full_url = sys.argv[2]
    # avoid slashes or other unsafe characters when naming files
    visit_id = domain_utils.get_ps_plus_1(full_url)
    extension_socket = chrome_options.debugger_address  # not actually used here
    # the file to save the names of all unsuccessful signups
    failfile = 'signup_fails.txt'

    # Here we go...
    fill_forms(email_producer, num_links, page_timeout, debug, visit_id,
               webdriver, proxy_queue, browser_params, manager_params,
               extension_socket, failfile, full_url)
Example #18
def test_browser_profile_coverage(default_params, task_manager_creator):
    """Test the coverage of the browser's profile.

    This verifies that Firefox's places.sqlite database contains all
    visited sites. If it does not, it is likely the profile is lost at
    some point during the crawl.
    """
    # Run the test crawl
    manager_params, browser_params = default_params
    manager_params.num_browsers = 1
    manager_params.testing = False
    browser_params[0].profile_archive_dir = (manager_params.data_directory /
                                             "browser_profile")
    browser_params[0].http_instrument = True
    manager, crawl_db = task_manager_creator(
        (manager_params, browser_params[:1]))
    for site in TEST_SITES:
        manager.get(site)
    manager.close()

    # Extract crawl profile
    ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz"
    with tarfile.open(ff_db_tar) as tar:
        tar.extractall(browser_params[0].profile_archive_dir)

    # Output databases
    ff_db = browser_params[0].profile_archive_dir / "places.sqlite"

    # Grab urls from crawl database
    rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests")
    req_ps = set()  # visited domains from http_requests table
    for (url, ) in rows:
        req_ps.add(du.get_ps_plus_1(url))

    hist_ps = set()  # visited domains from crawl_history Table
    rows = db_utils.query_db(
        crawl_db,
        "SELECT arguments FROM crawl_history WHERE command='GetCommand'",
    )
    for (arguments, ) in rows:
        url = json.loads(arguments)["url"]
        ps = du.get_ps_plus_1(url)
        hist_ps.add(ps)

    # Grab urls from Firefox database
    profile_ps = set()  # visited domains from firefox profile
    rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places")
    for (host, ) in rows:
        try:
            profile_ps.add(du.get_ps_plus_1(host))
        except AttributeError:
            pass

    # We expect a url to be in the Firefox profile if:
    # 1. We've made requests to it
    # 2. The url is a top_url we entered into the address bar
    #
    # Previously, we expected some missing urls if the following
    # conditions were not met, but this is no longer the case:
    # 3. The url successfully loaded (see: Issue #40)
    # 4. The site does not respond to the initial request with a 204
    #    (won't show in FF DB)
    # See PR #893 to restore this behavior in case this test fails.
    assert req_ps.intersection(hist_ps).difference(profile_ps) == set()
Example #19
    counter += 1
    rows = cur.fetchmany(CHUNKSIZE)
    if len(rows) == 0:
        break
    tp_rows = list()
    if ONLY_SEARCH_IN_THIRD_PARTY_HTTP:
        # This is a major change: we start to look for leaks in the first
        # party requests/responses
        # This was the case for the form filling crawl, we now extend it to
        # other crawls.
        # TODO: Filter 3rd p requests and responses at the analysis stage
        # Having first party leaks is necessary when detecting the origin of
        #  the leaks or "first leaks"
        for row in rows:
            if (row['top_level_url'] is not None and row['top_level_url'] != ''
                    and du.get_ps_plus_1(row['site_url']) != du.get_ps_plus_1(
                        row['url'])  # noqa
                    and du.get_ps_plus_1(row['site_url']) == du.get_ps_plus_1(
                        row['top_level_url'])):  # noqa
                tp_rows.append(row)
        rows = tp_rows

    if where_to_search == "requests":
        results = pool.map(check_row_for_leaks,
                           [(x['url'], x['headers'], x['post_body'])
                            for x in rows])
    else:
        results = pool.map(check_resp_row_for_leaks,
                           [(x['url'], x['headers'], x['location'])
                            for x in rows])
Example #20
def contains_ps1(self, hostname):
    """Return True if the Disconnect blocklist contains the PS+1 of `hostname`."""
    if not hostname.startswith('http'):
        hostname = 'http://' + hostname
    return du.get_ps_plus_1(hostname) in self._blocklist
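
A usage sketch, assuming a wrapper object (hypothetical name `DisconnectChecker`) whose `_blocklist` is a set of PS+1 domains parsed from the Disconnect list:

checker = DisconnectChecker({"doubleclick.net", "scorecardresearch.com"})
checker.contains_ps1("stats.g.doubleclick.net")  # True: PS+1 is doubleclick.net
checker.contains_ps1("www.example.com")          # False: example.com is not listed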
Example #21
def test_get_ps_plus_1_on_data_url():
    assert get_ps_plus_1(
        "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAA") == ''
Example #22
def get_internal_links_depth(site, depth):
    """Request and parse internal links from `site`"""
    headers = requests.utils.default_headers()
    headers.update({'User-Agent': USER_AGENT})
    if depth == 0:
        return None, [site]
    try:
        try:
            if depth == DEPTH:
                resp = requests.get('http://' + site,
                                    headers=headers,
                                    timeout=60)
            else:
                resp = requests.get(site, headers=headers, timeout=60)
        except Exception as e:
            if depth == DEPTH:
                resp = requests.get('http://www.' + site,
                                    headers=headers,
                                    timeout=60)
            else:
                resp = requests.get(site, headers=headers, timeout=60)
        if resp.status_code != 200:
            print("Non-200 response code %i for site %s" %
                  (resp.status_code, site))
            return (site, list())
        if resp.content is None:
            print("No content returned for site %s" % site)
            return (site, list())

        # Current URL after HTTP Redirects
        current_url = resp.url
        top_ps1 = du.get_ps_plus_1(current_url)

        # Find all internal a tags
        soup = BeautifulSoup(resp.content, 'lxml')
        links = set()
        for tag in soup.find_all('a'):
            href = tag.get('href')
            if href is None:
                continue
            href = urlparse.urljoin(current_url, href)

            if (not href.startswith('http')
                    or du.get_ps_plus_1(href) != top_ps1):
                continue
            links.add(urlparse.urldefrag(href)[0])

        # Crawl the next level
        links_next_layer = set()
        for link in links:
            links_next_layer |= set(
                get_internal_links_depth(link, depth - 1)[1])
        links |= links_next_layer
        return site, list(links)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        print("Exception while requesting %s\n%s" % (site, str(e)))
        return (site, list())
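
An example invocation, assuming `DEPTH` is the module-level constant the function compares against (so the top-level call gets the `http://` scheme prepended):

DEPTH = 1
seed, internal_links = get_internal_links_depth("example.com", DEPTH)
print(seed, len(internal_links))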
Example #23
def test_get_ps_plus_1_on_ip_addresses():
    assert get_ps_plus_1('http://192.168.1.1') == '192.168.1.1'
    assert get_ps_plus_1('http://127.0.0.1/foo.html') == '127.0.0.1'
Example #24
def get_set_of_script_ps1s_from_call_stack(script_urls):
    if len(script_urls):
        return ", ".join(
            set((du.get_ps_plus_1(x) or "") for x in script_urls.split(", ")))
    else:
        return ""