Example #1
class OrdinalSuffixTrie(PhraseFilter):
    def __init__(self, ordinal_rules):
        self.trie = BytesTrie([(safe_decode(k)[::-1],
                                safe_decode('|').join(v).encode('utf-8'))
                               for k, v in six.iteritems(ordinal_rules)])
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(
            safe_decode(token[::-1]))
        if suffix_search:
            return suffix_search[0].split('|')
        else:
            return None
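
OrdinalSuffixTrie stores each suffix reversed, which turns suffix matching into prefix matching against the trie. A minimal standalone sketch of that trick, using an invented rules dict rather than any real ordinal data:

from marisa_trie import BytesTrie

# Invented ordinal rules, for illustration only.
rules = {u'st': [u'1'], u'nd': [u'2'], u'rd': [u'3'], u'th': [u'4']}
trie = BytesTrie([(k[::-1], u'|'.join(v).encode('utf-8'))
                  for k, v in rules.items()])

token = u'3rd'
reversed_token = token[::-1]  # u'dr3'
# The longest stored suffix is the longest trie key that is a
# prefix of the reversed token.
for i in range(len(reversed_token), 0, -1):
    values = trie.get(reversed_token[:i])
    if values:
        print(values[0].split(b'|'))  # [b'3']
        break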
Example #2
    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, lang,
                                     dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(
                            phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(lang, dictionary_name,
                                     canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)),
                             safe_encode(c)])) for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
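
Each value this configure() stores packs four '|'-joined fields: language, dictionary name, an is_canonical flag, and the canonical phrase. A hedged sketch of unpacking one such value; the raw bytes below are invented, not real dictionary data:

raw = b'en|street_types|1|street'  # invented example value
lang, dictionary, is_canonical, canonical = raw.split(b'|', 3)
print(lang, dictionary, bool(int(is_canonical)), canonical)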
Example #3
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)

        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get(
                        (language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
Example #4
    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename
                is_street_types_dictionary = 'street_types' in filename
                is_stopword_dictionary = 'stopwords' in filename

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue
                    canonical = strip_accents(phrases[0])

                    for phrase in phrases:

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = strip_accents(phrase) == canonical

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
                            kvs[phrase][lang] = (is_canonical,
                                                 is_stopword_dictionary)

        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))]))
               for k, vals in kvs.iteritems()
               for v, (c, s) in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
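
This variant packs each value as "lang|is_canonical|is_stopword". A hedged sketch of reading one back, again with invented bytes:

raw = b'en|1|0'  # invented example value
lang, is_canonical, is_stopword = raw.split(b'|')
print(lang, bool(int(is_canonical)), bool(int(is_stopword)))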
Example #5
import json
import sys
from marisa_trie import BytesTrie

if __name__ == "__main__":
    lang = sys.argv[1]
    print("load mention_stat")
    with open("./mention_stat_{}.json".format(lang)) as f:
        data = json.load(f)

    print("mention_stat to trie")
    trie = BytesTrie([(k, bytes(json.dumps(v), "utf-8"))
                      for k, v in data.items()])

    print("saving...")
    trie.save("mention_stat_{}.marisa".format(lang))

    print("Done!")
Example #6
class DictionaryPhraseFilter(PhraseFilter):
    serialize = safe_encode
    deserialize = safe_decode

    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)

        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get(
                    (language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language,
                                     dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(
                            phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name,
                                     canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)),
                             safe_encode(c)])) for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY +
                                                          token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(
                        token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], token_types.PHRASE, suffix_len,
                           map(safe_decode, suffix_search))
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], token_types.PHRASE, prefix_len,
                           map(safe_decode, prefix_search))
                    continue
            else:
                c = token_types.PHRASE
            yield t, c, len(t), map(safe_decode, data)

    def gen_phrases(self, s, canonical_only=False, languages=None):
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t,
                        c) for t, c in tokens]

        if not languages:
            languages = None
        elif not hasattr(languages, '__iter__'):
            languages = [languages]

        if not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    phrase = None
                    for d in data:
                        lang, dictionary, is_canonical, canonical = d.split(
                            six.b('|'))

                        if (bool(int(is_canonical)) or
                                not canonical_only) and (languages is None
                                                         or lang in languages
                                                         or lang == 'all'):
                            phrase = phrase if phrase is not None else six.u(
                                ' ').join([t_i for t_i, c_i in t])
                            yield phrase

    def string_contains_phrases(self, s, canonical_only=False, languages=None):
        phrases = self.gen_phrases(s,
                                   canonical_only=canonical_only,
                                   languages=languages)
        try:
            phrases.next()
            return True
        except StopIteration:
            return False

    def extract_phrases(self, s, canonical_only=False, languages=None):
        return set(
            self.gen_phrases(s,
                             canonical_only=canonical_only,
                             languages=languages))
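
The search_substring() method above grows a prefix of s until the trie no longer has any key starting with it, then returns the values at the longest match. A standalone sketch of that loop with toy data:

from marisa_trie import BytesTrie

trie = BytesTrie([(u'st', b'saint'), (u'str', b'street')])
s = u'strasse'
i = 0
for i in range(len(s) + 1):
    if not trie.has_keys_with_prefix(s[:i]):
        i -= 1  # back off to the last prefix that still matched
        break
print(s[:i], trie.get(s[:i]))  # str [b'street']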
Example #7
        return None


def mention2encands(mention, wbegin, wend, mention_stat, id2title, llfile):
    try:
        cands = json.loads(mention_stat[mention][0])
        out = []
        total = 0
        for cand, count in cands.items():
            target = cand2en(cand, id2title, llfile)
            if target is not None and "(disambiguation)" not in target:
                out.append((target, count))
                total += count
        for i, (target, count) in enumerate(out):
            out[i] = (target, float(count) / float(total), (wbegin, wend))
        return out
    except KeyError:
        return None


if __name__ == "__main__":
    import sys
    from marisa_trie import BytesTrie
    mention = sys.argv[1]
    id2title = BytesTrie()
    id2title.load("../data/id2title.marisa")
    mention_stat = BytesTrie()
    mention_stat.load("../data/mention_stat_ja.marisa")
    dbpath = "../data/enwiki_page.db"
    # mention2encands() also takes a (wbegin, wend) word span;
    # 0, 0 are placeholders for this standalone lookup.
    print(mention2encands(mention, 0, 0, mention_stat, id2title, dbpath))
Example #8
    @classmethod  # implied by the cls parameter
    def _load_prefix(cls, resource_path: str) -> BytesTrie:
        prefixes = read_word_set(resource_filename(resource_path, 'prefix.txt'))
        prefixes = [e.split() for e in prefixes]
        return BytesTrie([(p[0], p[1].encode('utf-8')) for p in prefixes] +
                         [(p[0] + '-', p[1].encode('utf-8')) for p in prefixes])
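
A hedged usage sketch for a trie built by _load_prefix(); the toy entries below stand in for prefix.txt, whose two-column "surface tag" format is inferred from the split() above:

from marisa_trie import BytesTrie

trie = BytesTrie([(u'anti', b'NEG'), (u'anti-', b'NEG')])  # toy data
word = u'anti-hero'
for key in trie.prefixes(word):  # every stored key that prefixes word
    print(key, trie[key])        # anti [b'NEG'] / anti- [b'NEG']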
Example #9
    def __init__(self, ordinal_rules):
        self.trie = BytesTrie([(safe_decode(k)[::-1],
                                safe_decode('|').join(v).encode('utf-8'))
                               for k, v in six.iteritems(ordinal_rules)])
        self.configured = True
Example #10
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename
                is_street_types_dictionary = 'street_types' in filename
                is_stopword_dictionary = 'stopwords' in filename

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue
                    canonical = strip_accents(phrases[0])

                    for phrase in phrases:

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = strip_accents(phrase) == canonical

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        if is_canonical or is_street_types_dictionary or is_prefix_dictionary or is_suffix_dictionary:
                            kvs[phrase][lang] = (is_canonical,
                                                 is_stopword_dictionary)

        kvs = [(k, '|'.join([v, str(int(c)), str(int(s))]))
               for k, vals in kvs.iteritems()
               for v, (c, s) in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY +
                                                          token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for c, t, data in self.basic_filter(tokens):
            if c is not token_types.PHRASE:
                token = t[1]
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(
                        token[(token_len - suffix_len):].rstrip('.')):
                    yield (token_types.PHRASE, [(c, ) + t], suffix_search)
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield (token_types.PHRASE, [(c, ) + t], prefix_search)
                    continue
            yield c, t, data
Example #11
    def __init__(self, phrases):
        if hasattr(phrases, 'items'):
            phrases = six.iteritems(phrases)
        vals = [(safe_decode(key), self.serialize(val))
                for key, val in phrases]
        self.trie = BytesTrie(vals)
Example #12
def load(datafile="../data/mention_stat.marisa"):
    trie = BytesTrie()
    trie.load(datafile)
    return trie
Example #13
class DictionaryPhraseFilter(PhraseFilter):
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

    def serialize(self, s):
        return s

    def deserialize(self, s):
        return s

    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(DICTIONARIES_DIR):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(DICTIONARIES_DIR, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, lang,
                                     dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(
                            phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(lang, dictionary_name,
                                     canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)),
                             safe_encode(c)])) for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True

    def search_substring(self, s):
        if len(s) == 0:
            return None, 0

        for i in xrange(len(s) + 1):
            if not self.trie.has_keys_with_prefix(s[:i]):
                i -= 1
                break
        if i > 0:
            return (self.trie.get(s[:i]), i)
        else:
            return None, 0

    def search_suffix(self, token):
        suffix_search, suffix_len = self.search_substring(SUFFIX_KEY +
                                                          token[::-1])
        if suffix_len > 0:
            suffix_len -= len(SUFFIX_KEY)
        return suffix_search, suffix_len

    def search_prefix(self, token):
        prefix_search, prefix_len = self.search_substring(PREFIX_KEY + token)
        if prefix_len > 0:
            prefix_len -= len(PREFIX_KEY)
        return prefix_search, prefix_len

    def basic_filter(self, tokens):
        return super(DictionaryPhraseFilter, self).filter(tokens)

    def filter(self, tokens):
        for p, t, data in self.basic_filter(tokens):
            if not p:
                t, c = t
                token = t
                token_len = len(token)

                suffix_search, suffix_len = self.search_suffix(token)
                if suffix_search and self.trie.get(
                        token[(token_len - suffix_len):].rstrip('.')):
                    yield ([(t, c)], PHRASE, suffix_len,
                           map(safe_decode, suffix_search))
                    continue
                prefix_search, prefix_len = self.search_prefix(token)
                if prefix_search and self.trie.get(token[:prefix_len]):
                    yield ([(t, c)], PHRASE, prefix_len,
                           map(safe_decode, prefix_search))
                    continue
            else:
                c = PHRASE
            yield t, c, len(t), map(safe_decode, data)
Example #14
def test_bytestrie():
    btrie = BytesTrie()
    btrie.mmap('tests/simple.bytestrie')
    print 'start bytes trie-------'
    print btrie.get('foo')
    print 'end bytes trie-------'
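
Note that mmap() memory-maps the trie file instead of reading it fully into memory as load() does; lookups behave the same, with the OS paging trie data in on demand.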
Example #15
        for ch in "1" + string.lowercase:  # '1' for #
            url = "http://www.noslang.com/{}/{}".format(resource, ch)
            print("Processing " + url)
            r = requests.get(url)
            if not r.ok:
                print("Skipping {} (status code {})".format(ch, r.status_code),
                      file=sys.stderr)
                continue  # without this, a failed page would still be parsed

            page = html.fromstring(r.text)
            for abbr in page.cssselect("abbr"):
                a = abbr.getprevious()
                definition = abbr.attrib["title"].lower()
                if definition in fucking_shit:
                    definition = fucking_shit[definition]
                else:
                    for stars, replacement in fucking_shit.iteritems():
                        definition = definition.replace(stars, replacement)

                yield a.attrib["name"].decode("utf-8"), definition


if __name__ == "__main__":
    try:
        [path] = sys.argv[1:]
    except ValueError:
        print("Usage: [prog] path/to/trie", file=sys.stderr)
        sys.exit(1)

    abbr = BytesTrie(iter_noslang())
    abbr.save(path)
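
As the call above demonstrates, BytesTrie accepts any iterable of (unicode key, bytes value) pairs, including a generator like iter_noslang(). A minimal self-contained sketch with invented pairs:

from marisa_trie import BytesTrie

def pairs():
    # Invented slang entries, standing in for iter_noslang().
    yield u'afaik', b'as far as i know'
    yield u'brb', b'be right back'

abbr = BytesTrie(pairs())
print(abbr[u'brb'])  # [b'be right back']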