def __init__(self, categories=None, parser_ids=None):
    PatternMatching.__init__(self)

    # Load parser list
    tags = []
    if categories:
        tags += [("category", cat) for cat in categories]
    if parser_ids:
        tags += [("id", parser_id) for parser_id in parser_ids]
    if tags:
        tags += [None]
    parser_list = QueryParser(tags)

    # Create string patterns
    for parser in parser_list:
        for (magic, offset) in parser.getParserTags().get("magic", ()):
            self.addString(magic, (offset, parser))

    # Create regex patterns
    for parser in parser_list:
        for (regex, offset) in parser.getParserTags().get("magic_regex", ()):
            self.addRegex(regex, (offset, parser))

    self.commit()
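# The loops above read each parser's tags via getParserTags(). A hedged
# illustration of the expected shape (offsets are in bits; the sample
# values are illustrative, not copied from a real parser):
#
#   PARSER_TAGS = {
#       "magic": (("BM", 0),),               # (magic string, bit offset)
#       "magic_regex": (("GIF8[79]a", 0),),  # (regex source, bit offset)
#   }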
def search(self, data):
    for start, stop, item in PatternMatching.search(self, data):
        # item.user is (bit offset of the magic, parser): convert the
        # match position from bytes to bits, then shift back by the
        # magic's expected offset to get the candidate parser start.
        yield (item.user[1], start * 8 - item.user[0])
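# Hedged usage sketch, not part of the class above. Assumptions: the class
# is named SearchIndex, "archive" is a valid parser category, and parsers
# are classes exposing __name__. search() yields offsets in bits; a
# negative offset means the magic matched closer to the start of the
# buffer than its expected position allows.
def demo_search(path):
    index = SearchIndex(categories=["archive"])
    with open(path, "rb") as stream:
        data = stream.read()
    for parser, bit_offset in index.search(data):
        if bit_offset >= 0 and bit_offset % 8 == 0:
            print("%s: candidate at byte %u" % (
                parser.__name__, bit_offset // 8))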
def __add_items_2_sqlite(self, fd, category, is_black, itype):
    if itype == "domain":
        # Normalize each domain: strip CR/LF, spaces and quotes, then
        # reverse its labels ("www.example.com" -> "com.example.www") so
        # that sorting groups hosts sharing a parent domain together.
        domains = []
        for line in fd.readlines():
            dg_domain = line.replace("\r", "").replace("\n", "").replace(" ", "").replace('"', "").replace("'", "")
            labels = dg_domain.split(".")
            labels.reverse()
            domains.append(".".join(labels))
        domains.sort()

        # Accumulate domains into one PatternMatching object and flush it
        # to sqlite once its compiled regex nears the size limit. Since
        # rebuilding str(p.regex) is expensive, the first 1500 items of a
        # batch are never checked; after that the size is probed every 500
        # items (step == False) and, once the regex passes 20 KB, every
        # 100 items (step == True) until it passes ~24-25 KB.
        p = PatternMatching()
        i = 0
        step = False
        total = len(domains)
        current = 0
        for domain in domains:
            string = try_to_str(domain)
            if string is None:
                continue
            p.addString(string)
            i += 1
            current += 1
            if i < 1500:
                continue
            if not step and i % 500 == 0:
                if len(str(p.regex)) > 20000:
                    if len(str(p.regex)) > 24000:
                        self.__insert_domain_into_sqlite(category, str(p.regex), is_black, current, total)
                        p = PatternMatching()
                        step = False
                        i = 0
                        continue
                    step = True
                continue
            elif step and i % 100 == 0:
                if len(str(p.regex)) > 25000:
                    self.__insert_domain_into_sqlite(category, str(p.regex), is_black, current, total)
                    p = PatternMatching()
                    step = False
                    i = 0
        # Flush whatever is left in the last batch.
        if len(str(p.regex)) > 0:
            self.__insert_domain_into_sqlite(category, str(p.regex), is_black, total, total)
    else:
        # URL lists: keep the cleaned URLs, and for blacklists also record
        # each URL's host part (labels reversed, as above) so the host can
        # later be flagged under the "may_url_blocked" category.
        domain_set = set()
        urls = []
        for line in fd.readlines():
            dg_url = line.replace("\r", "").replace("\n", "").replace(" ", "").replace('"', "").replace("'", "")
            urls.append(dg_url)
            if is_black:
                labels = dg_url.split("/")[0].split(".")
                labels.reverse()
                domain_set.add(".".join(labels))
        urls.sort()

        p = PatternMatching()
        i = 0
        current = 0
        if is_black:
            total = len(urls) + len(domain_set)
        else:
            total = len(urls)

        # URLs use a simpler schedule: probe the regex size every 100
        # items and flush once it passes 25 KB.
        for url in urls:
            string = try_to_str(url)
            if string is None:
                continue
            p.addString(string)
            i += 1
            current += 1
            if i % 100 == 0:
                if len(str(p.regex)) > 25000:
                    self.__insert_url_into_sqlite(category, str(p.regex), is_black, current, total)
                    p = PatternMatching()
                    i = 0
        if len(str(p.regex)) > 0:
            self.__insert_url_into_sqlite(category, str(p.regex), is_black, len(urls), total)

        if is_black:
            # Insert the hosts extracted from blacklisted URLs under the
            # special "may_url_blocked" category, with the same two-phase
            # size probing as the domain branch; `current` keeps counting
            # from the URL pass because `total` covers both passes.
            domains = list(domain_set)
            domains.sort()
            p = PatternMatching()
            i = 0
            step = False
            for domain in domains:
                string = try_to_str(domain)
                if string is None:
                    continue
                p.addString(string)
                i += 1
                current += 1
                if i < 1500:
                    continue
                if not step and i % 500 == 0:
                    if len(str(p.regex)) > 20000:
                        if len(str(p.regex)) > 24000:
                            self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black, current, total)
                            p = PatternMatching()
                            step = False
                            i = 0
                            continue
                        step = True
                    continue
                elif step and i % 100 == 0:
                    if len(str(p.regex)) > 25000:
                        self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black, current, total)
                        p = PatternMatching()
                        step = False
                        i = 0
            if len(str(p.regex)) > 0:
                self.__insert_domain_into_sqlite("may_url_blocked", str(p.regex), is_black, total, total)
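# Hedged sketch of the reversed-label trick used twice above: storing
# "com.example.www" instead of "www.example.com" makes a plain sort group
# every host under the same parent domain together, so PatternMatching can
# fold them into compact shared-prefix regex branches. reverse_domain() is
# a hypothetical helper, not a method of the class above.
def reverse_domain(domain):
    labels = domain.strip().split(".")
    labels.reverse()
    return ".".join(labels)

# Example: sorted reversed names cluster by registrable domain.
# >>> sorted(map(reverse_domain, ["mail.example.com", "example.com", "example.org"]))
# ['com.example', 'com.example.mail', 'org.example']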