def extract_information(self):
    """Classify requests and cookies as first- or third-party.

    Annotates every entry of ``self.page.request_log`` and every cookie in
    ``self.result['cookies']`` with an ``is_thirdparty`` flag, and stores an
    aggregate summary (sorted third-party FQDNs plus http/https request
    counts) in ``self.result['third_parties']``.
    """
    # Both the originally requested URL and the final URL (after
    # redirects) count as first-party.
    first_party_domains = {
        parse_domain(url).registered_domain
        for url in (self.result['site_url'], self.result['final_url'])
    }

    fqdns = set()
    counters = {'num_http_requests': 0, 'num_https_requests': 0}
    for request in self.page.request_log:
        extracted = parse_domain(request['url'])
        parsed_url = request['parsed_url']
        # data: URLs carry inline content and are never third-party.
        is_thirdparty = (
            extracted.registered_domain not in first_party_domains
            and not request['url'].startswith('data:'))
        request['is_thirdparty'] = is_thirdparty
        if not is_thirdparty:
            continue
        fqdns.add(extracted.fqdn)
        # Only count plain web schemes; anything else (ws:, blob:, ...)
        # is recorded in the fqdn set but not in the request counters.
        if parsed_url.scheme in ('http', 'https'):
            counters['num_{}_requests'.format(parsed_url.scheme)] += 1

    third_parties = {'fqdns': sorted(fqdns)}
    third_parties.update(counters)
    self.result['third_parties'] = third_parties

    # Cookie domains may carry a leading dot (host-wide cookies);
    # strip it before extracting the registered domain.
    for cookie in self.result['cookies']:
        domain = cookie['domain']
        domain = domain[1:] if domain.startswith('.') else domain
        registered = parse_domain(domain).registered_domain
        cookie['is_thirdparty'] = registered not in first_party_domains
def extract_information(self):
    """Flag tracker requests and cookies using the loaded filter rules.

    Marks each third-party request with ``is_tracker`` (by matching the
    filter rules against the URL), each cookie with ``is_tracker`` (by
    comparing its domain against the tracker domains found in requests),
    and stores a summary in ``self.result['tracking']``.

    Fixes over the previous revision: removed the dead ``num_evaluations``
    counter (incremented but never read) and the redundant
    ``list(sorted(...))`` wrapper (``sorted`` already returns a list).
    """
    self._load_rules()
    trackers_fqdn = set()
    trackers_domain = set()
    num_tracker_requests = 0
    # Hosts already identified as trackers; lets us skip the
    # comparatively expensive rule matching for repeat offenders.
    blacklist = set()
    for request in self.page.request_log:
        request['is_tracker'] = False
        # Only third-party, non-inline requests can be trackers.
        if not request['is_thirdparty'] or request['url'].startswith(
                'data:'):
            continue
        is_tracker = request['parsed_url'].netloc in blacklist
        if not is_tracker:
            # Giving only the first 150 characters of an URL is
            # sufficient to get good matches, so this will speed
            # up checking quite a bit!
            match_result = self.rules.match(request['url'][:150],
                                            request['document_url'])
            is_tracker = match_result.is_match
        if is_tracker:
            request['is_tracker'] = True
            extracted = parse_domain(request['url'])
            # Some URLs (e.g. IP-address hosts) yield no fqdn.
            if extracted.fqdn:
                trackers_fqdn.add(extracted.fqdn)
                trackers_domain.add(extracted.registered_domain)
            num_tracker_requests += 1
            blacklist.add(request['parsed_url'].netloc)

    num_tracker_cookies = 0
    for cookie in self.result['cookies']:
        is_tracker = False
        domain = cookie['domain']
        if domain in trackers_fqdn or domain in trackers_domain:
            is_tracker = True
        elif domain.startswith('.'):
            # Host-wide cookie: compare its registered domain against
            # the tracker domains seen in the request log.
            reg_domain = parse_domain(domain[1:]).registered_domain
            if reg_domain in trackers_domain:
                is_tracker = True
        if is_tracker:
            num_tracker_cookies += 1
        cookie['is_tracker'] = is_tracker

    self.result['tracking'] = {
        'trackers': sorted(trackers_fqdn),
        'num_tracker_requests': num_tracker_requests,
        'num_tracker_cookies': num_tracker_cookies
    }
def extract_information(self):
    """Collect failed requests and classify why they failed.

    Stores a list of ``{'url', 'error_type', ...}`` dicts in
    ``self.result['failed_requests']``. DNS failures additionally get a
    ``domain_registered`` flag (True/False/None) determined by probing
    for a SOA record; unknown errors keep the raw ``error_text``.
    """
    # Index the successful-request log by requestId so failed entries
    # can be joined back to their original request (and its URL).
    requests_lookup = {request['requestId']: request
                       for request in self.page.request_log}
    failed_requests = []
    for failed_request in self.page.failed_request_log:
        error_text = failed_request['errorText']
        valid_errors = ('net::ERR_CACHE_MISS', 'net::ERR_ABORTED')
        if any(error in error_text for error in valid_errors):
            # Requests that were aborted by the site (e.g. a XHR
            # request that was canceled) and cache misses are
            # not considered failed.
            continue
        extra = None
        try:
            request = requests_lookup[failed_request['requestId']]
        except KeyError:
            # Failed entry without a matching request record; log and
            # skip rather than crash the whole extraction.
            self.logger.error(
                'Could not find request: {}'.format(failed_request))
            continue
        if 'net::ERR_NAME_NOT_RESOLVED' in error_text:
            error_type = 'dns-not-resolved'
            # We could not resolve the IP address of this host. One
            # reason might be, that the domain is not registered.
            # To check whether this is the case, we check for the
            # absence of a SOA record for the domain itself, i.e.,
            # not the netloc of the URL. Unregistered domains
            # should have no SOA entry, while registered should.
            domain = parse_domain(request['url']).registered_domain
            # NOTE(review): dns.resolver.query is the legacy dnspython
            # API (superseded by resolve()); confirm the pinned
            # dnspython version before modernizing.
            try:
                dns.resolver.query(domain, 'SOA')
                domain_registered = True
            # If we have a timeout, we better don't say anything about
            # this domain rather than giving a wrong impression whether
            # the domain is registered or not.
            except dns.resolver.Timeout:
                domain_registered = None
            # Nameservers behave weird, if the domain is not registered.
            # Some send NXDOMAIN as expected, others prefer to give an
            # answer but do not include a SOA entry in the response.
            # Sometimes all nameservers do not like to answer if the
            # domain is not registered. It is a real mess.
            except (dns.resolver.NXDOMAIN, dns.resolver.NoNameservers,
                    dns.resolver.NoAnswer):
                domain_registered = False
            extra = {'domain_registered': domain_registered}
        elif 'net::ERR_UNKNOWN_URL_SCHEME' in error_text:
            error_type = 'unknown-url-scheme'
        else:
            error_type = 'unknown'
        error = {
            'url': request['url'],
            'error_type': error_type,
        }
        if extra is not None:
            error.update(extra)
        if error_type == 'unknown':
            # Keep the raw message so unclassified errors can be
            # inspected (and new classifications added) later.
            error['error_text'] = error_text
        failed_requests.append(error)
    self.result['failed_requests'] = failed_requests
def extract_information(self):
    """Aggregate per-category cookie counts.

    Buckets every cookie by party (first/third, from ``is_thirdparty``)
    and lifetime (short/long, relative to ``self.long_cookie_time``),
    collects the registered domains of tracker cookies, and stores the
    result in ``self.result['cookiestats']``.
    """
    stats = {
        'first_party_short': 0,
        'first_party_long': 0,
        'third_party_short': 0,
        'third_party_long': 0,
    }
    tracker_domains = set()
    for cookie in self.result['cookies']:
        party = 'third' if cookie['is_thirdparty'] else 'first'
        duration = ('long' if cookie['lifetime'] > self.long_cookie_time
                    else 'short')
        stats['{}_party_{}'.format(party, duration)] += 1
        if cookie['is_tracker']:
            tracker_domains.add(
                parse_domain(cookie['domain']).registered_domain)
    stats['trackers'] = sorted(tracker_domains)
    self.result['cookiestats'] = stats
def extract_information(self):
    """Determine HSTS preload status and preload-readiness.

    Writes ``{'is_preloaded', 'is_ready'[, 'fail_reasons']}`` into
    ``self.result['https']['hsts_preload']``, using a lazily loaded
    module-level copy of the preload list (``hsts.json``).
    """
    global _hsts_lookup
    hsts_preload = {'is_ready': False, 'is_preloaded': False}
    self.result['https']['hsts_preload'] = hsts_preload
    self.result.mark_dirty('https')
    # Load the preload lookup table once per process and cache it in
    # the module-level global.
    if _hsts_lookup is None:
        lookup_file = self.options['storage_path'] / 'hsts.json'
        with lookup_file.open() as f:
            _hsts_lookup = json.load(f)
    domain = parse_domain(self.result['final_url']).registered_domain
    is_preloaded = domain in _hsts_lookup
    # Iterate over all subdomains and check if any of it is preloaded.
    # We have to handle three cases:
    # 1) all subdomains are not in the preload list. Reject.
    # 2) a subdomain is in the preload list, and include_subdomains is set.
    #    Accept in this case.
    # 3) a subdomain is in the preload list, but include_subdomains
    #    is not set. Then we have to do two things. Firstly, continue
    #    searching: maybe another subdomain of the current subdomain
    #    is in the list and has include_subdomains. See case 2. Secondly,
    #    the full domain might be in the lookup. This has already been
    #    checked beforehand, so nothing to do.
    # NOTE(review): the probe keys are built in reversed label order
    # with a trailing dot (for 'example.com' this probes 'example.'
    # then 'com.example.'), while the direct membership test above uses
    # the plain 'example.com' form. Confirm this matches the actual key
    # format of hsts.json — it looks inconsistent from here.
    current_domain = ''
    for part in domain.split('.'):
        current_domain = part + '.' + current_domain
        if current_domain in _hsts_lookup:
            # Lookup value is the include_subdomains flag for the entry.
            include_subdomains = _hsts_lookup[current_domain]
            if include_subdomains:
                is_preloaded = True
                break
            else:
                break
    hsts_header = self.result['security_headers'][
        'Strict-Transport-Security']
    if hsts_header is None:
        return
    # There are some big players who got exceptions from the standard
    # requirements to be HSTS ready, therefore we treat them as HSTS ready
    # if they are already in the preload list. However, we require the HSTS
    # header to be set (see return statement above).
    hsts_preload['is_preloaded'] = is_preloaded
    if is_preloaded:
        hsts_preload['is_ready'] = True
        return
    # According to hstspreload.org, these are the criteria for being ready
    # to be included in the HSTS preload list:
    #
    # 1. Serve a valid certificate.
    # 2. Redirect from HTTP to HTTPS on the same host, if you are listening
    #    on port 80.
    # 3. Serve all subdomains over HTTPS.
    #    In particular, you must support HTTPS for the www subdomain if a
    #    DNS record for that subdomain exists.
    # 4. Serve an HSTS header on the base domain for HTTPS requests:
    #    4.1 The max-age must be at least 31536000 seconds (1 year).
    #    4.2 The includeSubDomains directive must be specified.
    #    4.3 The preload directive must be specified.
    #    4.4 If you are serving an additional redirect from your HTTPS site,
    #        that redirect must still have the HSTS header (rather than the
    #        page it redirects to).
    fail_reasons = []
    if not self.result['final_url'].startswith('https://'):
        fail_reasons.append('no-https-redirect')
    if not hsts_header['includeSubDomains']:
        fail_reasons.append('no-include-subdomains')
    if hsts_header['max-age'] is None:
        fail_reasons.append('no-max-age')
    elif hsts_header['max-age'] < 31536000:
        fail_reasons.append('max-age-too-short')
    if not hsts_header['preload']:
        fail_reasons.append('missing-preload')
    # Sorted so the reason list is stable across runs.
    fail_reasons.sort()
    hsts_preload['is_ready'] = len(fail_reasons) == 0
    if fail_reasons:
        hsts_preload['fail_reasons'] = fail_reasons