def compile_hs(self, hs_db_file=None):
    """Build (or load from disk) the Hyperscan database for ``self.regexes``.

    If *hs_db_file* names an existing file, the serialized database is
    loaded from it and compilation is skipped. Otherwise the patterns are
    compiled in memory and, when *hs_db_file* is given, serialized back to
    that path. Compilation wall-clock time is recorded in
    ``self.stats.hs_compilation_time``.
    """
    if hs_db_file and os.path.isfile(hs_db_file):
        log.info(f'Loading Hyperscan DB from disk: {hs_db_file}')
        with open(hs_db_file, 'rb') as f:
            self.hs_db = hyperscan.loads(bytearray(f.read()))
        return

    self.hs_db = hyperscan.Database()
    # Pull the pattern text and hyperscan flags out of each Regex tuple
    # by field position.
    pattern_idx = self.Regex._fields.index('pattern')
    flags_idx = self.Regex._fields.index('hs_flags')
    expressions = [rx[pattern_idx].encode('utf-8') for rx in self.regexes]
    flags = [rx[flags_idx] for rx in self.regexes]
    ids = list(range(len(self.regexes)))

    start = time.time()
    self.hs_db.compile(
        expressions=expressions,
        ids=ids,
        elements=len(self.regexes),
        flags=flags,
    )
    self.stats.hs_compilation_time = time.time() - start

    if hs_db_file:
        log.info(f'Saving Hyperscan DB to disk: {hs_db_file}')
        with open(hs_db_file, 'wb') as f:
            f.write(hyperscan.dumps(self.hs_db))
def database_stream():
    """Compile the module-level *patterns* into a streaming-mode database.

    HS_MODE_SOM_HORIZON_LARGE is enabled so start-of-match offsets can be
    tracked across the stream.
    """
    mode = hyperscan.HS_MODE_STREAM | hyperscan.HS_MODE_SOM_HORIZON_LARGE
    expressions, ids, flags = zip(*patterns)
    database = hyperscan.Database(mode=mode)
    database.compile(
        expressions=expressions,
        ids=ids,
        elements=len(patterns),
        flags=flags,
    )
    return database
def get_starting_db_exprs() -> tuple[hyperscan.Database | None, set[str]]:
    """Restore the Hyperscan database and its expression set from disk.

    Preference order:
      1. Serialized DB + expressions file both exist: load both
         (best-effort; any failure falls through to recompilation).
      2. Only the expressions file exists: recompile the DB from it.
      3. Nothing usable on disk: return ``(None, set())``.

    Returns:
        ``(db, expressions)`` — *db* is None when nothing could be loaded
        or compiled.
    """

    def _read_expressions() -> set[str]:
        # Strip FIRST, then filter: a blank line is "\n", which is truthy,
        # so the old `if e` pre-strip check let empty-string expressions
        # into the set.
        with EXPRESSIONS_PATH.open(mode="r") as fp:
            return {stripped for e in fp if (stripped := e.strip())}

    if SERIALIZED_PATH.exists() and EXPRESSIONS_PATH.exists():
        with contextlib.suppress(Exception):
            with SERIALIZED_PATH.open(mode="rb") as fp_r:
                db = hyperscan.loadb(fp_r.read())
            expressions = _read_expressions()
            return db, expressions
    if EXPRESSIONS_PATH.exists():
        expressions = _read_expressions()
        if expressions:
            try:
                db = hyperscan.Database()
                db.compile(expressions=tuple(expr.encode() for expr in expressions))
            except Exception as exc:
                log.exception("Error loading in expressions from file", exc_info=exc)
            else:
                return db, expressions
        else:
            return None, expressions
    # Compile failed (expressions discarded) or no files at all.
    return None, set()
def benchmark_hyperscan(LINE):
    """Compile KEYS into a hyperscan DB, scan LINE once, then benchmark the scan."""
    import hyperscan

    db = hyperscan.Database()
    db.compile(
        expressions=[key.encode("utf-8") for key in KEYS],
        ids=list(range(len(KEYS))),
        elements=len(KEYS),
        flags=0,
    )

    matched_ids = []

    def on_match(pattern_id: int, from_: int, to: int, flags: int,
                 context: Optional[Any] = None) -> Optional[bool]:
        matched_ids.append(pattern_id)

    LINE = LINE.encode("utf-8")
    db.scan(LINE, match_event_handler=on_match)
    print(matched_ids)
    # TODO the Python hyperscan API I found initially is extremely inefficient,
    # doing a Python callback on every match. So not doing any match handling
    # here, just to get a sense of raw performance.
    benchmark("db.scan(LINE)", locals())
def database_block():
    """Compile the module-level *patterns* into a block-mode (default) database."""
    expressions, ids, flags = zip(*patterns)
    database = hyperscan.Database()
    database.compile(
        expressions=expressions,
        ids=ids,
        elements=len(patterns),
        flags=flags,
    )
    return database
def compile_db_in_memory(expressions: List[bytes], ids: List[int], flags: List[int]) -> hs.Database:
    """Compile a Hyperscan database in memory.

    Args:
        expressions: Pattern byte strings to compile.
        ids: One numeric id per expression, reported on match.
        flags: Per-expression HS_FLAG_* bitmasks.

    Returns:
        The compiled database (mode taken from HYPERSCAN_DB_MODE).

    Raises:
        ValueError: If *expressions* and *ids* differ in length.
    """
    # Validate explicitly instead of with `assert`, which is silently
    # stripped when Python runs with -O.
    if len(expressions) != len(ids):
        raise ValueError("There must be an id for every expression.")
    db = hs.Database(mode=HYPERSCAN_DB_MODE)
    db.compile(expressions=expressions, ids=ids, flags=flags)
    return db
def compile_database(pattern_set):
    """Compile *pattern_set* — an iterable of (expression, id, flags)
    triples — into a Hyperscan database and print its info string.
    """
    db = hyperscan.Database()
    ### Compile patterns
    # Bug fix: the body previously referenced an undefined global
    # `patterns` instead of the `pattern_set` parameter, which raises
    # NameError unless a global of that name happens to exist.
    expressions, ids, flags = zip(*pattern_set)
    db.compile(expressions=expressions, ids=ids, elements=len(pattern_set), flags=flags)
    print(db.info().decode())
def test_literal_expressions(mocker):
    """Patterns with an embedded NUL compile in literal mode and match."""
    expressions, ids, _ = zip(*patterns)
    # Append a NUL byte to every pattern; each scan below uses the
    # pattern minus that NUL as its input data.
    expressions = [e + b'\0' for e in expressions]

    db = hyperscan.Database()
    db.compile(expressions=expressions, ids=ids, literal=True)

    callback = mocker.Mock(return_value=None)
    expected = []
    for pattern_id, padded in zip(ids, expressions):
        data = padded[:-1]
        db.scan(data, match_event_handler=callback, context=data)
        expected.append(mocker.call(pattern_id, 0, len(data), 0, data))
    assert callback.mock_calls == expected
def build_database(expr_path, mode=hyperscan.HS_MODE_STREAM):
    """Read an expression file and compile it into a Hyperscan database.

    Each line of *expr_path* is parsed by ``process_expression()`` into an
    (id, expression, flags) triple.

    Returns:
        ``(expression_count, database)``
    """
    ids, expressions, flags = [], [], []
    with io.open(expr_path, 'r') as f:
        for line in f:
            expr_id, expression, expr_flags = process_expression(line)
            ids.append(expr_id)
            expressions.append(expression)
            flags.append(expr_flags)

    database = hyperscan.Database(mode=mode)
    database.compile(expressions=expressions, ids=ids, flags=flags)
    return len(expressions), database
def update_db_from_expressions(db: hyperscan.Database | None, expressions: set[str]) -> hyperscan.Database | None:
    """Recompile *db* from *expressions* and persist state to disk.

    An empty expression set drops the database (the serialized copy is not
    rewritten in that case); the expressions file is always rewritten.
    """
    log.info("Updating expressions to %s", expressions)
    if not expressions:
        db = None
    else:
        db = db or hyperscan.Database()
        db.compile(expressions=tuple(expr.encode() for expr in expressions))
        atomic_save(SERIALIZED_PATH, hyperscan.dumpb(db))
    atomic_save(EXPRESSIONS_PATH, "\n".join(expressions).encode())
    return db
def compile_test():
    """Compile a small fixed pattern set and print the database info string."""
    # (expression, id, flags) triples.
    demo_patterns = (
        (br'fo+', 0, 0),
        (br'^foobar$', 1, hyperscan.HS_FLAG_CASELESS),
        (br'BAR', 2,
         hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SOM_LEFTMOST),
    )
    expressions, ids, flags = zip(*demo_patterns)
    database = hyperscan.Database()
    database.compile(
        expressions=expressions,
        ids=ids,
        elements=len(demo_patterns),
        flags=flags,
    )
    print(database.info().decode())
def stream(self, rules):
    """Compile *rules* into the hyperscan database stored on self._stream.

    NOTE(review): despite the method/attribute name, the database is
    compiled with HS_MODE_BLOCK, not HS_MODE_STREAM — confirm intent.
    """
    self._stream = hyperscan.Database(mode=hyperscan.HS_MODE_BLOCK)
    common_flags = (hyperscan.HS_FLAG_CASELESS
                    | hyperscan.HS_FLAG_UTF8
                    | hyperscan.HS_FLAG_UCP)
    patterns = []
    for rule_entry in rules:
        # Relies on dict insertion order of each rule mapping:
        # (rule_id, rule, _, _) — presumably stable; verify upstream.
        rule_id, rule, _, _ = rule_entry.values()
        patterns.append((rule.encode('utf-8'), rule_id, common_flags))
    expressions, ids, flags = zip(*patterns)
    self._stream.compile(expressions=expressions,
                         ids=ids,
                         elements=len(patterns),
                         flags=flags)
def hyperscan_match(regexes, text):
    """Run regexes on text using hyperscan, for debugging."""
    # import here so the dependency is optional
    import hyperscan  # pylint: disable=import-outside-toplevel

    encoded = [regex.encode("utf8") for regex in regexes]
    # HS_FLAG_SOM_LEFTMOST so start offsets are reported for every pattern.
    som_flags = [hyperscan.HS_FLAG_SOM_LEFTMOST] * len(encoded)
    db = hyperscan.Database()
    db.compile(expressions=encoded, flags=som_flags)

    matches = []

    def on_match(index, start, end, flags, context):
        matches.append((index, start, end, flags, context))

    db.scan(text.encode("utf8"), on_match)
    return matches
def hyperscan_db(self):
    """Compile extractors into a hyperscan DB. Use a cache file if we've compiled this set before."""
    # Result is memoized on self._db; everything below runs at most once.
    if not hasattr(self, "_db"):
        # import here so the dependency is optional
        import hyperscan  # pylint: disable=import-outside-toplevel

        hyperscan_db = None
        cache = None
        # Map Python re flags to their hyperscan equivalents (only
        # re.I is translated here).
        flag_conversion = {re.I: hyperscan.HS_FLAG_CASELESS}

        def convert_flags(re_flags):
            # OR together the hyperscan flags for each re flag present.
            hyperscan_flags = 0
            for re_flag, hyperscan_flag in flag_conversion.items():
                if re_flags & re_flag:
                    hyperscan_flags |= hyperscan_flag
            return hyperscan_flags

        def convert_regex(regex):
            # Rewrite a Python regex into a hyperscan-compatible byte pattern.
            # hyperscan doesn't understand repetition flags like {,3},
            # so replace with {0,3}:
            regex = re.sub(r"\{,(\d+)\}", r"{0,\1}", regex)
            # Characters like "§" convert to more than one byte in utf8,
            # so "§?" won't work as expected. Convert "§?" to "(?:§)?":
            long_chars = [c for c in regex if len(c.encode("utf8")) > 1]
            if long_chars:
                regex = re.sub(
                    rf'([{"".join(set(long_chars))}])\?', r"(?:\1)?", regex
                )
            # encode as bytes:
            return regex.encode("utf8")

        expressions = [convert_regex(e.regex) for e in self.extractors]
        # HS_FLAG_SOM_LEFTMOST so hyperscan includes the start offset
        flags = [
            convert_flags(e.flags) | hyperscan.HS_FLAG_SOM_LEFTMOST
            for e in self.extractors
        ]
        if self.cache_dir is not None:
            # Attempt to use cache.
            # Cache key is a hash of all regexes and flags, so we
            # automatically recompile if anything changes.
            fingerprint = hashlib.md5(
                str(expressions).encode("utf8") + str(flags).encode("utf8")
            ).hexdigest()
            cache_dir = Path(self.cache_dir)
            cache_dir.mkdir(exist_ok=True)
            cache = cache_dir / fingerprint
            if cache.exists():
                # Deserialize the previously compiled database.
                hyperscan_db = hyperscan.loadb(cache.read_bytes())
        if not hyperscan_db:
            # No cache, so compile database.
            hyperscan_db = hyperscan.Database()
            hyperscan_db.compile(expressions=expressions, flags=flags)
            if cache:
                # Persist the fresh build for next time.
                cache.write_bytes(hyperscan.dumpb(hyperscan_db))
        self._db = hyperscan_db
    return self._db
import hyperscan

# Demo: compile three (expression, id, flags) triples into a block-mode
# (default) database and print its info string.
patterns = (
    # expression, id, flags
    (br'fo+', 0, 0),
    (br'^foobar$', 1, hyperscan.HS_FLAG_CASELESS),
    (br'BAR', 2, hyperscan.HS_FLAG_CASELESS | hyperscan.HS_FLAG_SOM_LEFTMOST),
)
expressions = tuple(p[0] for p in patterns)
ids = tuple(p[1] for p in patterns)
flags = tuple(p[2] for p in patterns)

db = hyperscan.Database()
db.compile(
    expressions=expressions,
    ids=ids,
    elements=len(patterns),
    flags=flags,
)
print(db.info().decode())
# Version: 5.1.1 Features: AVX2 Mode: BLOCK