def main():
    """Benchmark adblockparser rule matching against a filter-list file.

    Usage: script <filter_list_file>. Loads the list, times 1000 calls to
    should_block() on a fixed URL, prints total and per-call seconds.
    """

    def set_stdio_encoding(enc=NATIVE):
        # Wrap any std stream that reports no encoding with a codec writer.
        import codecs
        for stream_name in ("stdin", "stdout", "stderr"):
            stream = getattr(sys, stream_name)
            if not stream.encoding:
                setattr(sys, stream_name, codecs.getwriter(enc)(stream))

    set_stdio_encoding()

    log.basicConfig(format="%(levelname)s>> %(message)s", level=log.INFO)

    with io.open(sys.argv[1], encoding="UTF-8") as fd:
        adfilter = adblockparser.AdblockRules(
            fd,
            supported_options=["third-party"],
            skip_unsupported_rules=False)

    import time
    iterations = 1000
    started = time.time()
    for _ in range(iterations):
        adfilter.should_block("http://www.events.kaloooga.com/stom",
                              {"third-party": False})
    elapsed = time.time() - started
    print(elapsed, elapsed / iterations)

    ret = adfilter.should_block("http://www.events.kaloooga.com/stom",
                                {"third-party": False})
def test_easylist_filter():
    """Every sample URL must get the blocked/allowed verdict it expects."""
    expectations = _create_sample_urls()
    rules = adblockparser.AdblockRules(_create_sample_easylist())
    for url, should_be_blocked in expectations:
        # e.g. "http://ads.example.com"
        verdict = rules.should_block(url)
        assert verdict == should_be_blocked
def setup(self) -> None:
    """Child function."""
    asyncio.run(self._setup_downloads())

    if self.tag_list:
        self._extract_date_from_list()
        self._prepare_tag_list()

    wants_adblock = self.extraction_method == ExtractionMethod.USE_ADBLOCK_PARSER
    if wants_adblock:
        self.match_rules = adblockparser.AdblockRules(
            self.tag_list,
            skip_unsupported_rules=False,
            use_re2=False,
        )
def handleEvent(self, event):
    """Check a received URL event against the AdBlock rules and emit a
    URL_ADBLOCKED_* event when the URL matches."""
    event_type = event.eventType
    source_module = event.module
    url = event.data

    self.sf.debug(f"Received event, {event_type}, from {source_module}")

    if self.errorState:
        return None

    # Lazily fetch and parse the blocklist on the first event handled.
    if self.rules is None:
        raw = self.sf.fetchUrl(self.opts['blocklist'], timeout=30)
        if raw['content'] is None:
            self.errorState = True
            self.sf.error(
                "Unable to download AdBlockPlus list: " + self.opts['blocklist'],
                False)
        else:
            rule_lines = raw['content'].split('\n')
            self.sf.debug("RULE LINES: " + str(len(rule_lines)))
            try:
                self.rules = adblockparser.AdblockRules(rule_lines)
            except BaseException as e:
                self.errorState = True
                self.sf.error(
                    "Parsing error handling AdBlock list: " + str(e), False)

    pagetype = "_EXTERNAL" if "_EXTERNAL" in event_type else "_INTERNAL"

    if url in self.results:
        self.sf.debug(
            "Already checked this page for AdBlock matching, skipping.")
        return None
    self.results.append(url)

    try:
        if self.rules and self.rules.should_block(url):
            evt = SpiderFootEvent(
                "URL_ADBLOCKED" + pagetype, url, self.__name__, event)
            self.notifyListeners(evt)
    except BaseException as e:
        self.sf.error("Parsing error handling AdBlock list: " + str(e), False)
        self.errorState = True

    return None
def setBlocklistRules(self, blocklist):
    """Parse AdBlock Plus blocklist and set blocklist rules

    Args:
        blocklist (str): plaintext AdBlock Plus blocklist
    """
    # Nothing to parse for an empty/None blocklist.
    if not blocklist:
        return

    rule_lines = blocklist.split('\n')
    self.debug(f"Retrieved {len(rule_lines)} AdBlock blocklist rules")

    try:
        self.rules = adblockparser.AdblockRules(rule_lines)
    except adblockparser.AdblockParsingError as e:
        self.errorState = True
        self.error(f"Parsing error handling AdBlock list: {e}")
def _load(self, path):
    """Load every *.txt AdBlock filter file found in *path* into self.filters."""
    try:
        import adblockparser
    except ImportError:
        log.msg('WARNING: https://github.com/scrapinghub/adblockparser '
                'library is not available, filters are not loaded.')
        return

    for entry in os.listdir(path):
        if not entry.endswith('.txt'):
            continue
        full_path = os.path.join(path, entry)
        if not os.path.isfile(full_path):
            continue
        filter_name = entry[:-len('.txt')]

        if self.verbosity >= 1:
            log.msg("Loading filter %s" % filter_name)

        with open(full_path, 'rb') as fp:
            rule_lines = [raw.decode('utf8').strip() for raw in fp]

        rules = adblockparser.AdblockRules(
            rule_lines,
            supported_options=self.supported_options,
            skip_unsupported_rules=False,
            max_mem=512 * 1024 * 1024,  # this doesn't actually use 512M
        )

        rule_count = len(rules.rules)
        if self.verbosity >= 2:
            log.msg("%d rule(s) loaded for filter %s" % (rule_count, filter_name))

        # Without re2, large rule sets fall back to slow pure-Python matching.
        if not rules.uses_re2 and rule_count > self.RE2_WARN_THRESHOLD:
            log.msg('WARNING: a filter %s with %d rules loaded, but '
                    'pyre2 library is not installed. Matching may become '
                    'slow; installing https://github.com/axiak/pyre2 is '
                    'highly recommended.' % (filter_name, rule_count))

        self.filters[filter_name] = rules
def __init__(self, rules, no_whitelist):
    """Initializes an instance of _RulesMatcher.

    Args:
        rules: ([str]) list of rules.
        no_whitelist: (bool) Whether the whitelisting rules should be ignored.
    """
    self._rules = self._FilterRules(rules, no_whitelist)

    # No usable rules after filtering -> matching is disabled.
    if not self._rules:
        self._matcher = None
        return

    try:
        import adblockparser
        self._matcher = adblockparser.AdblockRules(self._rules)
    except ImportError:
        logging.critical(
            'Likely you need to install adblockparser. Try:\n'
            ' pip install --user adblockparser\n'
            'For 10-100x better performance, also try:\n'
            " pip install --user 're2 >= 0.2.21'")
        raise
def find_third_party_using_given_csv(csv_file, script_dir):
    """Return the set of domains from *csv_file* that EasyList flags as ads.

    Args:
        csv_file: path to a CSV file; the domain column name is taken from
            the module-level ``options[3]``.
        script_dir: directory containing
            ``dst_characterize/easylist_adblock.txt``.

    Returns:
        set of domain strings whose ``http://`` or ``https://`` URL form is
        blocked by the EasyList rules.
    """
    with open(script_dir + '/dst_characterize/easylist_adblock.txt', 'r') as f:
        ad_rules = adblockparser.AdblockRules(f.readlines())
        # (redundant explicit close() removed; the with-block closes the file)

    ad_list = set()
    # A set gives O(1) membership checks; the original list made each row's
    # "seen before?" test O(n), turning the whole scan accidentally O(n^2).
    visited_domains = set()

    with open(csv_file, mode="r") as handle:
        for row in csv.DictReader(handle):
            current_domain: str = row[options[3]]
            if current_domain in visited_domains:
                # Don't run domains through the ad rules more than once.
                continue
            visited_domains.add(current_domain)
            # A domain counts as an ad host if either scheme form matches.
            for scheme in ("http://", "https://"):
                if ad_rules.should_block(scheme + current_domain + "/"):
                    ad_list.add(current_domain)
                    break

    return ad_list
Data.timeout = int(config["timeout"]) Data.rules_file = config["rules_file"] Data.url_file = config["url_file"] Data.portRequest = int(config["portRequest"]) Data.portMessage = int(config["portMessage"]) Data.ip = config["ip"] print("Reading AdblockRule Textfile") f = open(Data.rules_file, "r") raw = f.read().split("\n") raw_rules = [] for r in raw: raw_rules.append(r) f.close() print("Generating AdblockRules") Data.rules = adblockparser.AdblockRules(raw_rules) print("Generating Database") analyze.createDatabase() print("Reading Urllist Textfile") f = open(Data.url_file, "r") li = f.read() f.close() li = li.split("\n") #counter=int(random.random()*100000) counter = 0 for l in li: if counter > 0: