def test_rules_supported_options():
    rules = AdblockRules(["adv", "@@advice.$~script"])
    assert not rules.should_block("http://example.com/advice.html", {'script': False})

    # exception rule should be discarded if "script" option is not supported
    rules2 = AdblockRules(["adv", "@@advice.$~script"], supported_options=[])
    assert rules2.should_block("http://example.com/advice.html", {'script': False})
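# A minimal sketch (not part of the test above) of why the exception rule is
# discarded: each parsed rule exposes its options, and AdblockRules keeps only
# rules whose options are all listed in supported_options. Assumes the public
# AdblockRule attributes is_exception and options.
from adblockparser import AdblockRule

rule = AdblockRule("@@advice.$~script")
print(rule.is_exception)   # True
print(rule.options)        # {'script': False}
# With supported_options=[] this rule cannot be matched and is dropped,
# so the plain "adv" rule blocks the URL again.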
def test_rule_exceptions(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)

    for url in results["blocks"]:
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rules.should_block(url)
def test_documented_examples(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url in results["blocks"]:
        assert rule.match_url(url)
        assert rules.should_block(url)

    for url in results["doesn't block"]:
        assert not rule.match_url(url)
        assert not rules.should_block(url)
def main(harfile_path, domain, blockfile):
    with open(harfile_path, "r") as harfile:
        harfile_json = json.loads(harfile.read())

    with open(blockfile, "r") as bfile:
        block_list = bfile.readlines()

    rules = AdblockRules(block_list)

    adblock_db = {
        "url_data": {},
        "stats": {"domain": domain, "req": 0, "succ": 0, "block": 0}
    }
    options = ('image', 'xmlhttprequest', 'document', 'font',
               'script', 'stylesheet', 'other')

    for entry in harfile_json['log']['entries']:
        url = entry['request']['url']
        urlparts = urlparse(url)
        print("Processing {} ...".format(url))
        try:
            fld = get_fld(url, fail_silently=True)
            adblock_db["stats"]["req"] += 1

            # HAR files report XHRs as "xhr"; adblockparser expects "xmlhttprequest"
            if entry["_resourceType"] == "xhr":
                entry["_resourceType"] = "xmlhttprequest"

            third_party = fld != domain
            if entry["_resourceType"] not in options:
                d = {"third-party": third_party, "domain": urlparts.hostname}
            else:
                d = {entry["_resourceType"]: True,
                     "third-party": third_party,
                     "domain": urlparts.hostname}

            if rules.should_block(url, d):
                adblock_db["stats"]["block"] += 1
            else:
                adblock_db["stats"]["succ"] += 1
        except Exception:
            # skip malformed entries (e.g. missing _resourceType)
            continue

    return adblock_db
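# Hypothetical invocation of main() above; the file names are placeholders,
# not files shipped with this code.
if __name__ == "__main__":
    db = main("example.com.har", "example.com", "easylist.txt")
    print(db["stats"])   # e.g. {'domain': 'example.com', 'req': 120, 'succ': 98, 'block': 22}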
def test_rule_with_options(rule_text, results, use_re2):
    rule = AdblockRule(rule_text)
    rules = AdblockRules([rule_text], use_re2=use_re2)

    for url, params, match in results:
        assert rule.match_url(url, params) == match
        assert rules.should_block(url, params) == match
class Filter(object):
    def __init__(self, filename):
        self.rules = []
        with open(filename, "r") as blacklist:
            for line in blacklist:          # xreadlines() is Python-2-only; iterate the file directly
                if line.startswith('!'):    # comment line
                    continue
                if '##' in line:            # HTML (element-hiding) rule, not URL-based
                    continue
                self.rules.append(line)

        self.adblock = AdblockRules(self.rules,
                                    supported_options=['script', 'domain'])

    def match(self, url, options=None):
        # pass the caller's options through instead of silently ignoring them
        return self.adblock.should_block(url, options)
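# A minimal usage sketch for the Filter class; "easylist.txt" is an assumed
# local copy of EasyList.
flt = Filter("easylist.txt")
print(flt.match("http://ads.example.com/banner.js", {'script': True}))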
def main(harfile_path):
    """Reads a har file from the filesystem, converts to CSV, then dumps to
    stdout.
    """
    txt_file = 'easylist.txt'
    raw_rules = readfile(txt_file)

    harfile = open(harfile_path, encoding='UTF-8')
    harfile_json = json.loads(harfile.read())
    i = 0

    # e.g. "www.example.com.har" -> "example.com"
    first_party = harfile_path.split('.')[1] + '.' + harfile_path.split('.')[2]

    rules = AdblockRules(raw_rules)
    blocked = 0
    blocked_domains = set()
    opt = {'script': True, 'image': True, 'stylesheet': True, 'object': True,
           'subdocument': True, 'xmlhttprequest': True, 'websocket': True,
           'webrtc': True, 'popup': True, 'generichide': True,
           'genericblock': True}

    for entry in harfile_json['log']['entries']:
        i = i + 1
        url = entry['request']['url']
        urlparts = urlparse(entry['request']['url'])
        size_bytes = entry['response']['bodySize']
        size_kilobytes = float(entry['response']['bodySize']) / 1024
        mimetype = 'unknown'
        if 'mimeType' in entry['response']['content']:
            mimetype = entry['response']['content']['mimeType']

        option = ''
        res = get_tld(url, as_object=True)
        # map the major MIME type (e.g. "image/png" -> "image") to an adblock option
        mime_opt = mimetype.split('/')[0]
        if mime_opt in opt:
            option = mime_opt

        if res.fld != first_party and option in opt and \
                rules.should_block(url, {option: opt[option]}):
            blocked += 1
            blocked_domains.add(res.fld)

    blocked_domains = list(blocked_domains) if blocked_domains else 'No domains blocked'
    print(f'\nSite: {first_party}\n'
          f'# of total HTTP requests: {i}\n'
          f'# of HTTP requests blocked: {blocked}\n'
          f'Blocked domains: {blocked_domains}\n')
def test_regex_rules():
    # Rules wrapped in slashes are treated as regular expressions.
    # EasyList itself contains no such rules.
    rules = AdblockRules([r"/banner\d+/"])
    assert rules.should_block("banner123")
    assert not rules.should_block("banners")
"""Check each path against filter for maliciousness, ads, tracking, ads and tracking, bitcoin, or pornography. (number of n at the end file determines the version for the file. The more the number of n, the newer the version of the file) """ for filt in filterlist[1:]: print('Start checking against: ' + filt[0] + ': ' + filt[1]) if 'host' not in filt[1]: try: req = Request(filt[1], headers={'User-Agent': 'Mozilla/5.0'}) raw_rules = urlopen(req).readlines() raw_rules2 = [x.decode('utf8') for x in raw_rules if x.decode('utf8') != '\r\n'] raw_rules3 = [] for raw in raw_rules2: raw_rules3.append(raw.replace('\n', '').replace('\r', '')) rules = AdblockRules(raw_rules3) except KeyboardInterrupt: raise except: print('====cannot read filter====') raw_rules3 = '' if raw_rules3 != '': #print(raw_rules3) for path in array: if rules.should_block(path) is True: print(path + ' : Yes') dictionary[path][filt[2]] = True else: print(path + ' : No') print('---------------------------------') jsonify.export_json('filterChecked_nn.json', dictionary)
import sys

from adblockparser import AdblockRules

with open('easylist.txt', 'r') as f:
    rule_lines = f.read().splitlines()   # "all" shadowed the built-in, so renamed

rules = AdblockRules(rule_lines)

while True:
    s = sys.stdin.readline()
    if not s:   # EOF
        break
    # strip the trailing newline, otherwise it becomes part of the URL
    print(rules.should_block(s.strip()))
    # the original closed and reopened /dev/stdout after every line;
    # a flush achieves the same effect
    sys.stdout.flush()
def test_rules_instantiation():
    rule = AdblockRule("adv")
    rules = AdblockRules([rule])
    assert rule.match_url("http://example.com/adv")
    assert rules.should_block("http://example.com/adv")
df = pd.read_csv(csvfile)
domains = df[df.columns[1]]
mime = df[df.columns[5]]

for domain, m in zip(domains, mime):
    options = {}
    total += 1
    # str.find() returns -1 (which is truthy!) when the substring is absent;
    # use "in" for membership tests
    options['image'] = 'image' in m
    options['script'] = 'javascript' in m

    if rules.should_block(domain, options):
        res = get_tld(domain, as_object=True)
        if site not in res.fld:   # third-party domain
            if res.fld not in l:
                # keep str (not bytes) so '\n'.join(l) works on Python 3
                l.append(res.fld)
            blocked += 1
    else:
        unblocked += 1

strdom = '\n'.join(l)
x.add_row([site, total, blocked, strdom])
print(x)
def detect_trackers(third_parties):
    """
    Detect 3rd party trackers and return a list of them.

    :param third_parties: List of third-party requests (not: hosts) to analyze
    :return: a list of unique hosts in the form domain.tld
    """
    if len(third_parties) == 0:
        return []

    blacklist = [
        re.compile(r'^[\|]*http[s]*[:/]*$'),  # match http[s]:// in all variations
        re.compile(r'^[\|]*ws[:/]*$'),        # match ws:// in all variations
        re.compile(r'^\.'),                   # match rules like .com
        re.compile(r'^\/'),                   # match rules like /stuff
        re.compile(r'^\#'),                   # match rules beginning with #
        re.compile(r'^\:'),                   # match rules beginning with :
        re.compile(r'^\?'),                   # match rules beginning with ?
    ]

    def is_acceptable_rule(rule):
        if '@' in rule:
            return False
        for exp in blacklist:
            if exp.match(rule) is not None:
                return False
        return True

    lines = []
    rules = []
    result = []

    start_time = timeit.default_timer()

    # Generate paths to files
    easylist_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'easylist.txt')
    easyprivacy_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'easyprivacy.txt')
    fanboy_path = os.path.join(
        settings.SCAN_TEST_BASEPATH, 'vendor/EasyList', 'fanboy-annoyance.txt')

    # Read in files:
    for line in open(easylist_path, 'r', encoding="utf-8"):
        lines.append(line)
    for line in open(easyprivacy_path, 'r', encoding="utf-8"):
        lines.append(line)
    for line in open(fanboy_path, 'r', encoding="utf-8"):
        lines.append(line)

    # Clean up lines:
    for line in lines:
        try:
            rule = line.split('$')[0]
            if is_acceptable_rule(rule):
                rules.append(rule)
        except Exception:
            print("Unexpected error:", sys.exc_info()[0])

    abr = AdblockRules(rules)

    elapsed = timeit.default_timer() - start_time
    print("Elapsed: %i secs" % elapsed)

    i = 0
    for url in third_parties:
        if abr.should_block(url):
            ext = tldextract.extract(url)
            result.append("{}.{}".format(ext.domain, ext.suffix))
        i = i + 1
        if i % 20 == 0:
            elapsed = timeit.default_timer() - start_time
            print("Checked %i domains, %i secs elapsed..." % (i, elapsed))

    return list(set(result))
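# A hedged usage sketch with made-up request URLs; it assumes
# settings.SCAN_TEST_BASEPATH points at a checkout containing the three
# vendored filter lists read above.
trackers = detect_trackers([
    "https://cdn.tracker-example.com/pixel.gif",
    "https://static.example.org/app.js",
])
print(trackers)   # e.g. ['tracker-example.com']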
class AdBlockUnit(browser_unit.BrowserUnit):

    EASYLIST = 'easylist.txt'
    EASYLIST_URL = "https://easylist-downloads.adblockplus.org/easylist.txt"

    def _easylist_version(self, path=EASYLIST):
        '''
        Reads the version from the current easylist, or a file that is passed in
        '''
        if os.path.isfile(path):
            with open(path) as f:
                lines = f.read().splitlines()
                return lines[2].split(':')[1].strip()
        else:
            return -1

    def _fetch_easylist(self):
        '''
        Downloads the latest version of easylist, and if newer replaces any
        existing one.
        '''
        tmp_easylist = "tmp_" + self.EASYLIST
        cur_version = self._easylist_version()

        # download latest easylist from the Internet
        urllib.request.urlretrieve(self.EASYLIST_URL, tmp_easylist)
        tmp_version = self._easylist_version(path=tmp_easylist)

        # if necessary update
        if tmp_version > cur_version and cur_version != -1:
            os.remove(self.EASYLIST)
            shutil.move(tmp_easylist, self.EASYLIST)
            print(("Updated easylist from {} to {}".format(
                cur_version, tmp_version)))
        elif cur_version == -1:
            shutil.move(tmp_easylist, self.EASYLIST)
            print(("New easylist {}".format(tmp_version)))
        else:
            os.remove(tmp_easylist)
            print(("Easylist already up to date at: {}".format(tmp_version)))

    def _load_easylist(self):
        '''
        Reads in easylist from a file and parses it into lines to be passed to
        adblockparser.
        '''
        with open(self.EASYLIST) as f:
            lines = f.read().splitlines()
        print(("Loaded easylist version: {} with : {} items".format(
            self._easylist_version(), len(lines))))
        return lines

    def __init__(self, browser="firefox", log_file="log.txt", unit_id=0,
                 treatment_id=0, headless=False, proxy=None, rules=None):
        # if easylist is not passed in, then consider this a bare unit that
        # should only be used to fetch easylist and then parse it into
        # adblockplus rules for use with adblockparser.
        if rules is None:
            self._fetch_easylist()
            self.filterlist = self._load_easylist()
            self.rules = AdblockRules(self.filterlist)
        else:
            logging.basicConfig(filename="adb_" + log_file, level=logging.INFO)
            self.logger = logging.getLogger(__name__)

            # call parent constructor
            browser_unit.BrowserUnit.__init__(self, browser, log_file, unit_id,
                                              treatment_id, headless,
                                              proxy=proxy)

            self.session = self.driver.session_id
            print(("Running adblock unit session: {}".format(self.session)))

            # set rules to those that were passed in
            self.rules = rules
            self.all_options = {opt: True for opt in AdblockRule.BINARY_OPTIONS}

            # internal ad data structure
            self.data = []
            self.Ad = namedtuple('Ad', ['url', 'outerhtml', 'tag', 'link_text',
                                        'link_location', 'on_site', 'reloads'])

            # dictionary to memoize url checks
            self.memo = {}

            # store current context where we are collecting ads
            self.site = ""
            self.reloads = 0

    def save_data(self):
        json_file = os.path.splitext(self.log_file)[0] + "." + \
            self.session + ".json"
        with open(json_file, 'w') as outfile:
            json.dump(self.data, outfile)
        # This is the log line adblock_analysis will parse to identify data files
        self.logger.info("save_data:{}:{}:{}".format(
            self.unit_id, self.treatment_id, self.session))

    def log_element(self, element, source):
        '''
        Input: An element that has been identified as an ad and how it was
               identified
        Result: Inserts appropriate information into the log
        '''
        url = element.get_attribute(source)
        html = element.get_attribute('outerHTML').encode('utf-8')
        tag = element.tag_name
        link_text = element.text
        link_location = element.location

        # update internal datastore
        ad_data = self.Ad(url=url, outerhtml=html, tag=tag,
                          link_text=link_text, link_location=link_location,
                          on_site=self.site, reloads=self.reloads)

        # store to internal data structure
        self.data.append(ad_data)

        # log to plaintext log
        self.logger.debug("Ad:Data:{}".format(ad_data))

    def check_elements(self, elements, source, options=None):
        '''
        Input: Elements in the currently active page and an attribute to
               query on
        Result: Queries the given attribute (source) and checks the url
                against the filterlist. Logs any identified elements and
                returns the count.
        '''
        count = 0
        for e in elements:
            try:
                url = e.get_attribute(source)
                if url is not None:
                    self.logger.debug("Checking:{}:{}".format(source, url))
                    # check if we have evaluated this ad before
                    if url not in self.memo:
                        # actually check the url against the filter list
                        self.memo[url] = self.rules.should_block(url, options)
                    if self.memo[url]:
                        self.log_element(e, source)
                        count += 1
            # occurs with stale elements that no longer exist in the DOM
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)
        return count

    def check_href(self):
        '''
        Identifies and captures ads based on HTML hyperlink tags. These are
        considered "text" ads.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <a>, <link>
        elements = driver.find_elements_by_xpath("//*[@href]")
        count = self.check_elements(elements, "href", self.all_options)
        self.logger.debug("href search found: {}".format(count))

    def check_src(self):
        '''
        Identifies and captures ads based on tags with a 'src' attribute.
        These are considered "media" ads and are often img, iframe, script tags.
        '''
        driver = self.driver
        ### xpath could be less performant than other find_* methods
        # common tags: <img>, <iframe>, <frame>, <embed>, <script>
        elements = driver.find_elements_by_xpath("//*[@src]")
        count = self.check_elements(elements, "src", self.all_options)
        self.logger.debug("src search found: {}".format(count))

    def check_iframe(self, parents=()):
        '''
        Functionality to check within nested iframes for ad related resources.

        Invariants: expects webdriver to enter at the level defined by parents
                    resets webdriver to top level contents prior to leaving
        Input: a tuple describing the iframe name attribute of parent levels
        '''
        driver = self.driver
        children = driver.find_elements_by_tag_name('iframe')
        for child in children:
            try:
                driver.switch_to.frame(child)
                # check in the iframe for ads
                self.check_href()
                self.check_src()
                # set parent for children we check
                nesting = parents + (child, )
                self.check_iframe(parents=nesting)
            except selenium.common.exceptions.StaleElementReferenceException as e:
                self.logger.error(e)

            # return to correct level of nesting
            driver.switch_to_default_content()
            for p in parents:
                try:
                    driver.switch_to.frame(p)
                except selenium.common.exceptions.NoSuchElementException:
                    # this should not occur but just in case, preserve invariant
                    # of function leaving at top level
                    self.logger.error("resetting level in iframe recursion")
                    driver.switch_to_default_content()

        # always reset to top level content prior to exiting
        driver.switch_to_default_content()

    def find_ads(self):
        '''
        Primary convenience function to use all ad identification mechanisms
        '''
        self.check_href()
        self.check_src()
        self.check_iframe()

    def visit_url(self, url):
        driver = self.driver
        try:
            driver.get(url)
            self.logger.debug("Visited: {}".format(url))
            self.site = url
            return True
        except selenium.common.exceptions.TimeoutException as e:
            print(("Timeout Visiting: {} : {}".format(url, self.session)))
            print(e)
            return False

    def collect_ads(self, url, reloads=1, delay=0, file_name=None):
        '''
        Visits a specified url and runs ad collection functions
        '''
        print(("collecting ads on: {}".format(url)))
        if file_name is None:
            file_name = self.log_file

        # number of reloads on site to capture all ads
        for r in range(reloads):
            time.sleep(delay)
            # if a successful visit
            if self.visit_url(url):
                # collect ads
                self.reloads = r
                self.find_ads()
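# A minimal usage sketch under the semantics described in __init__ above:
# a bare unit (rules=None) downloads and parses EasyList once, and the
# resulting AdblockRules object is shared with browsing units so each browser
# does not re-parse the filter list. URLs and arguments are illustrative.
bare = AdBlockUnit(rules=None)                    # fetches easylist, builds AdblockRules
unit = AdBlockUnit(browser="firefox", rules=bare.rules)
unit.collect_ads("http://example.com", reloads=2, delay=1)
unit.save_data()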
def process(self, instance, parameters=None, commit=True, **kwargs):
    """ See source code. """

    instance_name = instance._meta.verbose_name
    instance_id = instance.id

    urls = URL_MATCH_REGEX.findall(instance.content)

    if not urls:
        LOGGER.info(u'url-crawler: nothing to crawl in %s %s.',
                    instance_name, instance_id)
        return

    # Start with EasyList
    adblock_rules_list = requests_get(
        # WARNING: do not .split() with no parameters, else
        # adblock will block everything due to empty rules.
        'https://easylist-downloads.adblockplus.org/easylist.txt').split('\n')

    # Append our eventual specific exclusions
    adblock_rules_list.extend(
        parameters.get('integration', {}).get(
            'fetch_content_urls', {}).get('adblock_rules', []))

    if re2 is None:
        # Things will be dogly slow…
        adblock_rules = AdblockRules(
            adblock_rules_list,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)
    else:
        # Things will go faster
        adblock_rules = AdblockRules(
            adblock_rules_list,
            use_re2=True,
            max_mem=config.PROCESSING_ADBLOCK_MAXIMUM_MEMORY)

    if isinstance(instance, models.Email):
        origin = models.ORIGINS.EMAIL

        # NOTE: there will be at least one here, else
        # accepts() would have rejected the email.
        feeds = instance.feeds.exclude(
            MailFeed___match_action=MAIL_MATCH_ACTIONS.STORE)
    else:
        origin = models.ORIGINS.CRAWLING
        feeds = instance.feeds.all()

    dupes = 0
    blocked = 0

    # LOGGER.debug('URLS: %s %s', len(urls), urls)

    for url in urls:
        if url.startswith('('):
            url = url[1:]

        if url.endswith(')'):
            # Skip Markdown's enclosing parenthesis
            # that we explicitly matched manually.
            url = url[:-1]

        # In case we've got garbage at the end of the RE.
        splitted = url.split(')')

        if len(splitted) == 1:
            pass
        # "elif" here, otherwise the clean single-part case above
        # would fall through to the error branch below.
        elif len(splitted) == 2 and len(splitted[1]) < 4:
            # Highly probable that we got some garbage at the end.
            url = splitted[0]
        else:
            LOGGER.error(
                u'url-crawler: probable nasty unhandled '
                u'URL “%s” too-greedily matched by RE.', url)

        if adblock_rules.should_block(url):
            LOGGER.info(u'url-crawler: URL %s skipped, in adblocked rules.',
                        url)
            blocked += 1
            continue

        LOGGER.info('url-crawler: importing from %s.', url)

        try:
            item, created = create_item_from_url(
                url=clean_url(url),
                feeds=feeds,
                origin=origin,
            )
        except Exception:
            LOGGER.exception(u'Could not create item from URL “%s”', url)
        else:
            if created:
                LOGGER.info(u'url-crawler: successfully imported %s from '
                            u'%s %s.', item, instance_name, instance_id)
            else:
                dupes += 1
                LOGGER.warning(u'url-crawler: %s already in database.', item)

            # link newly created item to the item it was found into.
            item.sources.add(instance)

    LOGGER.info(u'url-crawler: crawled %s items (%s new) from %s %s.',
                len(urls) - blocked, len(urls) - blocked - dupes,
                instance_name, instance_id)
def check_if_ad(url):
    with open('easylist.txt', 'r') as file:
        raw_rules = file.readlines()
    rules = AdblockRules(raw_rules)
    return rules.should_block(url)
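# Parsing EasyList is by far the most expensive step here, so rebuilding
# AdblockRules on every call is slow. A sketch of a cached variant (the helper
# names are ours, not from the original code):
from functools import lru_cache

@lru_cache(maxsize=1)
def _easylist_rules():
    with open('easylist.txt', 'r') as f:
        return AdblockRules(f.readlines())

def check_if_ad_cached(url):
    return _easylist_rules().should_block(url)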
def test_rules_with_options(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)
    for url, params, should_block in results:
        assert rules.should_block(url, params) == should_block
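# The test functions above receive rules/results/use_re2 from parametrization
# in the original suite. A hedged sketch of how such data could be wired up
# with pytest (the rule and expectations below are illustrative, not taken
# from the suite):
import pytest

@pytest.mark.parametrize("use_re2", [False])
@pytest.mark.parametrize("rules, results", [
    (["||ads.example.com^$third-party"],
     [("http://ads.example.com/banner.gif", {'third-party': True}, True),
      ("http://ads.example.com/banner.gif", {'third-party': False}, False)]),
])
def test_rules_with_options_example(rules, results, use_re2):
    rules = AdblockRules(rules, use_re2=use_re2)
    for url, params, should_block in results:
        assert rules.should_block(url, params) == should_block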
        mimetype = entry['response']['content']
        if mimetype.get('mimeType') is not None:
            mimt[url] = mimetype.get('mimeType')


if __name__ == '__main__':
    argparser = argparse.ArgumentParser(
        prog='parsehar',
        description='Parse .har files into comma separated values (csv).')
    argparser.add_argument('harfile', type=str, nargs=2,
                           help='path to harfile to be processed.')
    args = argparser.parse_args()
    current_site = args.harfile[1]
    main(args.harfile[0])
    read_files()

    rules = AdblockRules(
        raw_rules,
        supported_options=['third-party', 'script', 'image', 'stylesheet',
                           'domain', 'object', 'subdocument', 'xmlhttprequest',
                           'websocket', 'webrtc', 'popup', 'generichide',
                           'genericblock'],
        skip_unsupported_rules=False)
    print(rules)

    no_sites_blocked = 0
    for dm in domains:
        options = write_options()
        if rules.should_block(dm, options):
            no_sites_blocked += 1
            get_blocked_sites_domain(dm)

    print(tabulate([[current_site, total_no_sites, no_sites_blocked,
                     blocked_sites]],
                   headers=['Site', '# of total HTTP requests',
                            '# of HTTP requests blocked',
                            'Third-party domains (not URL) blocked']))
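# write_options() is not shown in this excerpt. A plausible minimal sketch is
# a dict enabling every boolean option the rules object was built with
# ('domain' is omitted because it expects the requesting page's hostname as a
# string, not True/False):
def write_options():
    return {opt: True for opt in
            ['third-party', 'script', 'image', 'stylesheet', 'object',
             'subdocument', 'xmlhttprequest', 'websocket', 'webrtc',
             'popup', 'generichide', 'genericblock']}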
    raw_rules.append(line)

with open("/user/ifouad/home/PycharmProjects/OpenWpm/addblock/easyprivacy", 'r') as f:
    for line in f:
        raw_rules_easyp.append(line)

rules = AdblockRules(raw_rules)
rules_priv = AdblockRules(raw_rules_easyp)

db2 = sys.argv[1]
conn = sqlite3.connect(db2)
curr = conn.cursor()

# cur2 is assumed to be a cursor on the output database, created outside
# this excerpt.
for site_id, link_id, response_id, url in curr.execute(
        'select site_id, link_id, response_id, url from http_responses '
        'where link_id = 0 order by site_id ASC').fetchall():
    print(site_id, link_id, url)
    if rules.should_block(url):
        cur2.execute(
            "insert into blocked (site_id, link_id, resp_id, url, list) "
            "values (?,?,?,?,?)",
            (site_id, link_id, response_id, url, "easylist"))
    elif rules_priv.should_block(url):
        cur2.execute(
            "insert into blocked (site_id, link_id, resp_id, url, list) "
            "values (?,?,?,?,?)",
            (site_id, link_id, response_id, url, "easyprivacy"))
    else:
        cur2.execute(
            "insert into blocked (site_id, link_id, resp_id, url) "
            "values (?,?,?,?)",
            (site_id, link_id, response_id, url))

# print(rules.should_block("http://search.ch/htmlbanner.html"))
# print(rules.should_block("g.doubleclick.net"))
# print(rules.should_block("http://ads.example.com/notbanner", {'script': False}))
# Load url dataset
print("Loading URLs...")
urls = []
with open(URL_PATH, 'r') as infile:
    for line in infile:
        url, _ = line.strip().split("\t")
        url = url[1:-2]   # trim leading/trailing characters left by the dataset format
        urls.append(url)
print(len(urls), "URLs loaded!")

# Make a mapping from urls to whether they should be blocked
print("Parsing URLs...")
block_map = Counter()
for url in urls:
    block_map[url] = rules.should_block(url)
print("Finished!")

# Save mapping to pickle
with open(URL_MAP_PKL_PATH, 'wb') as output:   # Overwrites any existing file.
    pickle.dump(block_map, output, pickle.HIGHEST_PROTOCOL)
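# Reading the mapping back later, using the same assumed path constant:
import pickle

with open(URL_MAP_PKL_PATH, 'rb') as infile:
    block_map = pickle.load(infile)
print(sum(block_map.values()), "of", len(block_map), "URLs would be blocked")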
if "http" in url: third_party_tld = getDomain(url) if visited_tld != third_party_tld: # check if third-party temp_orga = get_organisation(blocklist_json, third_party_tld) if temp_orga is not None: if not temp_orga in all_third_party_orgas: all_third_party_orgas.append(temp_orga) if not temp_orga in third_party_sites: third_party_sites.append(temp_orga) else: if not third_party_tld in all_third_party_orgas: all_third_party_orgas.append(third_party_tld) if not third_party_tld in third_party_sites: third_party_sites.append(third_party_tld) if rules.should_block(url) is True: # check if tracking site if temp_orga is not None: if not temp_orga in all_ad_and_tracking_orgas: all_ad_and_tracking_orgas.append(temp_orga) if not temp_orga in tracking_sites: tracking_sites.append(temp_orga) else: if not third_party_tld in all_ad_and_tracking_orgas: all_ad_and_tracking_orgas.append(third_party_tld) if not third_party_tld in tracking_sites: tracking_sites.append(third_party_tld) else: raise ValueError('http is not in url!', url) resObject["third_party_sites"] = third_party_sites
    # (the matching try: opens earlier in the script, above this excerpt)
    soup = BeautifulSoup(body, features='html.parser')
    scripts = soup.find_all('script')
    srcs = [link['src'] for link in scripts if 'src' in link.attrs]

    # Test with known dodgy URL
    # srcs.append('//pushlat.com/ntfc.php?p=1273711139')

    # Set up caching
    sess = CacheControl(requests.Session(), FileCache(args.cachedir))
    response = sess.get(args.blacklist)

    rules = AdblockRules(response.text.splitlines(),
                         supported_options=['third-party'],
                         skip_unsupported_rules=False)
    options = {'third-party': True}

    for src in srcs:
        if rules.should_block(src, options):
            crit_msg.append(args.url +
                            " contains dodgy 'script src' parameter: " + src)
        else:
            scanned_srcs.append(src)

    ok_msg.append("None of the " + str(len(scanned_srcs)) +
                  " found 'script src' URLs on " + args.url +
                  " are listed in " + args.blacklist)
    if args.verbose:
        ok_msg.append("\n".join(scanned_srcs))

except Exception as e:
    nagios_exit("UNKNOWN: Unknown error: {0}.".format(e), 3)

# Exit with accumulated message(s)