# -*- coding: utf-8 -*-
# Assumed imports for the snippets below (Python 2 / six). PhraseFilter,
# safe_decode/safe_encode, normalize_string, strip_accents, tokenize,
# token_types, address_phrase_dictionaries and the SUFFIX_KEY / PREFIX_KEY /
# DICTIONARIES_DIR / POSSIBLE_ROMAN_NUMERALS constants come from the
# surrounding project and are not reproduced here.
import os
import six
from collections import defaultdict, OrderedDict
from marisa_trie import BytesTrie


class OrdinalSuffixTrie(PhraseFilter):
    def __init__(self, ordinal_rules):
        # Keys are stored reversed so that a token's *suffix* can be found
        # with an ordinary prefix search over the trie.
        self.trie = BytesTrie([(safe_decode(k)[::-1], safe_decode('|').join(v).encode('utf-8'))
                               for k, v in six.iteritems(ordinal_rules)])
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        # Longest-match scan: grow the prefix until no key starts with it,
        # then back off one character.
        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return self.trie.get(s[:i]), i
        else:
            return None, 0

    def search_suffix(self, token):
        # Decode first, then reverse, so multi-byte characters survive the
        # reversal intact.
        suffix_search, suffix_len = self.search_substring(safe_decode(token)[::-1])
        if suffix_search:
            return suffix_search[0].split('|')
        else:
            return None
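# A minimal usage sketch for OrdinalSuffixTrie, assuming ordinal_rules maps
# an ordinal suffix to the list of values stored for it. The keys and values
# here are hypothetical, not taken from the project's data.
def _ordinal_trie_example():
    trie = OrdinalSuffixTrie({u'st': [u'ordinal'], u'nd': [u'ordinal']})
    print trie.search_suffix(u'1st')    # ['ordinal']
    print trie.search_suffix(u'42nd')   # ['ordinal']
    print trie.search_suffix(u'42')     # None (no rule suffix matches)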
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(base_dir):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(base_dir, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    # Each line is canonical|alias|alias|...
                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        # Suffix phrases are stored reversed (behind a
                        # sentinel) so both affix directions share one trie.
                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical

        # Flatten to (key, 'lang|dictionary|is_canonical|canonical') pairs.
        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return self.trie.get(s[:i]), i
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = PHRASE
            yield t, c, len(t), map(safe_decode, data)
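# For reference, each value stored in the trie above packs four fields as
# lang|dictionary|is_canonical|canonical. A small sketch of unpacking one;
# the concrete value here is hypothetical:
def _decode_trie_value_example():
    value = b'en|street_types|1|street'
    lang, dictionary, is_canonical, canonical = value.split(b'|')
    print lang, dictionary, bool(int(is_canonical)), canonical
    # en street_types True street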
class DictionaryPhraseFilter(PhraseFilter):
    # NOTE: these class attributes are shadowed by the identity
    # serialize/deserialize methods defined below.
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)
        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return self.trie.get(s[:i]), i
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len, map(safe_decode, suffix_search))
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len, map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c)
                       for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        # Only normalize to a set when languages were given; set(None)
        # would raise a TypeError.
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))
                        if ((bool(int(is_canonical)) or not canonical_only) and
                                (languages is None or lang in languages or lang == 'all')):
                            # Join the phrase tokens once, lazily.
                            phrase = phrase if phrase is not None else six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s, canonical_only=canonical_only, languages=languages)
        try:
            next(phrases)
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(self.gen_phrases(s, canonical_only=canonical_only, languages=languages))
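# A hypothetical end-to-end sketch for the class above. The dictionary name
# and the exact phrases yielded depend entirely on which dictionaries are
# loaded, so the outputs shown are illustrative only.
def _gazetteer_example():
    gazetteer = DictionaryPhraseFilter('street_types.txt')

    print gazetteer.extract_phrases(u'123 Main Street', languages=['en'])
    # e.g. set([u'street'])
    print gazetteer.string_contains_phrases(u'123 Main Street', languages=['en'])
    # True if any loaded phrase matches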
def test_bytestrie():
    # Memory-map a prebuilt trie from disk rather than constructing it.
    btrie = BytesTrie()
    btrie.mmap('tests/simple.bytestrie')
    print 'start bytes trie-------'
    print btrie.get('foo')
    print 'end bytes trie-------'
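# The fixture file can be generated once with marisa-trie's save(); a
# minimal sketch, assuming the test only needs a 'foo' key (the stored
# value b'bar' is an assumption):
def make_bytestrie_fixture():
    btrie = BytesTrie([(u'foo', b'bar')])
    btrie.save('tests/simple.bytestrie')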
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(base_dir):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename
                is_street_types_dictionary = 'street_types' in filename
                is_stopword_dictionary = 'stopwords' in filename

                path = os.path.join(base_dir, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = strip_accents(phrases[0])

                    for phrase in phrases:
                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = strip_accents(phrase) == canonical

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
                            kvs[phrase][lang] = (is_canonical, is_stopword_dictionary)

        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))]))
               for k, vals in kvs.iteritems()
               for v, (c, s) in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break

        if i > 0:
            return self.trie.get(s[:i]), i
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY + token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for c, t, data in self.basic_filter(tokens):
            if c is not token_types.PHRASE:
                token = t[1]
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(token[(token_len - suffix_len):].rstrip('.')):
                    yield (token_types.PHRASE, [(c,) + t], suffix_search)
                    continue

                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield (token_types.PHRASE, [(c,) + t], prefix_search)
                    continue

            yield c, t, data
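# The longest-match loop shared by every search_substring above can be
# exercised standalone; a minimal sketch with hypothetical keys and values:
def _longest_prefix_example():
    trie = BytesTrie([(u'main', b'x'), (u'maine', b'y')])
    s = u'maine st'
    for i in xrange(len(s) + 1):
        # Grow the prefix while some key still starts with it...
        if not trie.has_keys_with_prefix(s[:i]):
            i -= 1  # ...then back off one character past the last match.
            break
    print s[:i], trie.get(s[:i])   # maine ['y']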