def _build(self):
    """
    Compile the configured regular expressions and build the acora index.

    Every entry of ``self._regexes_or_assoc`` is either a regex string or a
    tuple whose first element is the regex string; the remaining tuple
    elements are stored in ``self._translator`` keyed by the encoded regex.

    Side effects on the instance:
        * ``self._re_cache`` maps each encoded regex to its compiled form
        * ``self._translator`` maps each tuple regex to ``item[1:]``
        * ``self._regexes_with_no_keywords`` collects the regexes for which
          no keyword longer than ``self._hint_len`` was found
        * ``self._keyword_to_re`` maps each lower-cased keyword to the list
          of regexes it was extracted from

    :return: Whatever ``AcoraBuilder.build()`` returns for the keywords
             added during this call.
    :raises ValueError: If a tuple entry repeats a regex already present in
                        ``self._translator``, or if an entry is neither a
                        tuple nor a string.
    """
    acora_builder = AcoraBuilder()

    for entry in self._regexes_or_assoc:
        #
        # Step 1: normalize the entry into an encoded pattern, compile it
        # and store the compiled object in the re_cache.
        #
        has_assoc = isinstance(entry, tuple)

        if has_assoc:
            pattern = entry[0].encode(DEFAULT_ENCODING)
        elif isinstance(entry, basestring):
            pattern = entry.encode(DEFAULT_ENCODING)
        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        self._re_cache[pattern] = re.compile(pattern,
                                             self._re_compile_flags)

        if has_assoc:
            # Only tuple entries carry associated data, so only they are
            # checked for duplicates
            if pattern in self._translator:
                raise ValueError('Duplicated regex "%s"' % pattern)

            self._translator[pattern] = entry[1:]

        #
        # Step 2: extract the string literals from the regular expression
        # and, when one is longer than hint_len, add it to the acora index.
        #
        candidates = esmre.shortlist(esmre.hints(pattern))

        # The first shortlist entry is the one used as keyword (the original
        # code treats it as the longest candidate -- TODO confirm against
        # esmre.shortlist). Regexes without a long enough keyword are kept
        # apart in _regexes_with_no_keywords.
        if not candidates or len(candidates[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(pattern)
            continue

        # Index the lower-cased keyword and remember which regular
        # expressions it was extracted from
        keyword = candidates[0].lower()
        acora_builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(pattern)

    return acora_builder.build()
def _build(self):
    """
    Compile the configured regular expressions and build the acora index.

    Every entry of ``self._regexes_or_assoc`` is either a regex string or a
    tuple whose first element is the regex string; the remaining tuple
    elements are stored in ``self._translator`` keyed by the encoded regex.

    Side effects on the instance:
        * ``self._re_cache`` maps each encoded regex to its compiled form
        * ``self._translator`` maps each tuple regex to ``item[1:]``
        * ``self._regexes_with_no_keywords`` collects the regexes for which
          no keyword longer than ``self._hint_len`` was found
        * ``self._keyword_to_re`` maps each lower-cased keyword to the list
          of regexes it was extracted from

    :return: Whatever ``AcoraBuilder.build()`` returns for the keywords
             added during this call.
    :raises ValueError: If a tuple entry repeats a regex already present in
                        ``self._translator``, or if an entry is neither a
                        tuple nor a string.
    """
    acora_builder = AcoraBuilder()

    for entry in self._regexes_or_assoc:
        #
        # Step 1: normalize the entry into an encoded pattern, compile it
        # and store the compiled object in the re_cache.
        #
        has_assoc = isinstance(entry, tuple)

        if has_assoc:
            pattern = entry[0].encode(DEFAULT_ENCODING)
        elif isinstance(entry, basestring):
            pattern = entry.encode(DEFAULT_ENCODING)
        else:
            raise ValueError('Can NOT build MultiRE with provided values.')

        self._re_cache[pattern] = re.compile(pattern,
                                             self._re_compile_flags)

        if has_assoc:
            # Only tuple entries carry associated data, so only they are
            # checked for duplicates
            if pattern in self._translator:
                raise ValueError('Duplicated regex "%s"' % pattern)

            self._translator[pattern] = entry[1:]

        #
        # Step 2: extract the string literals from the regular expression
        # and, when one is longer than hint_len, add it to the acora index.
        #
        candidates = esmre.shortlist(esmre.hints(pattern))

        # The first shortlist entry is the one used as keyword (the original
        # code treats it as the longest candidate -- TODO confirm against
        # esmre.shortlist). Regexes without a long enough keyword are kept
        # apart in _regexes_with_no_keywords.
        if not candidates or len(candidates[0]) <= self._hint_len:
            self._regexes_with_no_keywords.append(pattern)
            continue

        # Index the lower-cased keyword and remember which regular
        # expressions it was extracted from
        keyword = candidates[0].lower()
        acora_builder.add(keyword)
        self._keyword_to_re.setdefault(keyword, []).append(pattern)

    return acora_builder.build()
def enter(self, regex, obj):
    """
    Associate obj with every keyword hint extracted from regex.

    The hints are obtained with ``esmre.hints`` / ``esmre.shortlist`` and
    each one is entered, lower-cased, into ``self.esm``. All hints are
    validated before the first one is entered, so a rejected regex never
    leaves ``self.esm`` partially updated.

    :param regex: Regular expression from which keyword hints are extracted.
    :param obj: Object entered into ``self.esm`` for each keyword.
    :raises TypeError: If ``self.fixed`` is set (enter() called after
                       query()).
    :raises ValueError: If no keyword can be extracted from regex, or if any
                        extracted keyword is not longer than
                        ``self.hint_len``.
    """
    # The with-statement releases the lock on every exit path, including
    # the exceptions raised below
    with self.lock:
        if self.fixed:
            raise TypeError("enter() cannot be called after query()")

        regex_hints = esmre.hints(regex)
        keywords = esmre.shortlist(regex_hints)

        if not keywords:
            raise ValueError('Failed due to performance reasons.'
                             ' Need more hints for RE: %s' % regex)

        # First pass: validate and normalize every hint without touching
        # the index, so that a failure leaves self.esm unmodified
        normalized_hints = []

        for hint in keywords:
            if len(hint) <= self.hint_len:
                raise ValueError('Failed due to performance reasons.'
                                 ' Need longer hints for RE: %s' % regex)

            normalized_hints.append(hint.lower())

        # Second pass: all hints are acceptable, register them
        for keyword in normalized_hints:
            self.esm.enter(keyword, obj)
def checkShortlist(self, expected_shortlist, hints):
    """
    Assert that ``esmre.shortlist(hints)`` equals expected_shortlist.

    :param expected_shortlist: The value the shortlist is expected to equal.
    :param hints: The hints passed to ``esmre.shortlist``.
    """
    actual_shortlist = esmre.shortlist(hints)
    self.assertEqual(expected_shortlist, actual_shortlist)