예제 #1
0
def DictIntoDatrie(dictToDo):
    """Build a forward and a reversed frequency trie from a word dictionary.

    Args:
        dictToDo: Mapping of word -> record; each record must provide a
            ``"frequency"`` entry, which becomes the value stored in the tries.

    Returns:
        A ``(directTrie, reverseTrie)`` tuple of ``datrie.BaseTrie`` objects.
        ``directTrie`` is keyed by each word as given, ``reverseTrie`` by the
        same word reversed; both map to the word's frequency.
    """
    import datrie

    # Key alphabet: the hyphen plus the upper- and lower-case Russian letters.
    ALPHABET = u'-АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    directTrie = datrie.BaseTrie(ALPHABET)
    reverseTrie = datrie.BaseTrie(ALPHABET)
    for word in dictToDo:
        frequency = dictToDo[word]["frequency"]
        directTrie[word] = frequency
        reverseTrie[word[::-1]] = frequency
    # The tries are only returned; nothing is written to disk here.
    return directTrie, reverseTrie
예제 #2
0
    def _load(self):
        """Load the KenLM model and derive the vocabulary lookup tables.

        Sets ``self.model``, ``self.id2str``, ``self.unigram_probs``,
        ``self.is_special``, ``self.unigram_probs_wordsonly``,
        ``self.most_common_words_by_idx``, ``self.unfiltered_bigrams``,
        ``self.filtered_bigrams``, ``self.vocab_trie``, ``self.eos_idx`` and
        ``self.eop_idx``. Progress messages are written to stderr.

        Raises:
            AssertionError: If the position of a word in the ARPA vocabulary
                differs from the index the model reports for it.
        """
        print("Loading model", self.name, '...', file=sys.stderr, end='')
        self.model = kenlm.LanguageModel(self.model_file)

        print(" reading raw ARPA data ... ", file=sys.stderr, end='')
        self.id2str, self.unigram_probs, bigrams = get_arpa_data(self.arpa_file)
        vocab = self.id2str

        # A word counts as "special" when its first character is not a
        # lowercase ASCII letter.
        self.is_special = special = np.zeros(len(vocab), dtype=bool)
        for idx, token in enumerate(vocab):
            # The ARPA ordering has to agree with the model's own word ids.
            assert self.model.vocab_index(token) == idx, idx
            if token[0] not in string.ascii_lowercase:
                special[idx] = True

        # Rare words earn a bonus, so special words are scored as if they were
        # extremely common (value 0)...
        self.unigram_probs_wordsonly = words_only = self.unigram_probs.copy()
        words_only[special] = 0
        # ...whereas for picking fallback words they must never be chosen, so
        # they are scored as impossible (-inf) before ranking.
        fallback_scores = self.unigram_probs.copy()
        fallback_scores[special] = -np.inf
        # Indices of the 500 highest-scoring entries (ascending sort, tail).
        self.most_common_words_by_idx = np.argsort(fallback_scores)[-500:]

        print(" Encoding bigrams to indices... ", file=sys.stderr, end='')
        encoded = encode_bigrams(bigrams, self.model)
        self.unfiltered_bigrams, self.filtered_bigrams = encoded

        # Vocabulary trie: the alphabet is every character that occurs in any
        # word, and each word maps to its index.
        alphabet = set(itertools.chain.from_iterable(vocab))
        self.vocab_trie = datrie.BaseTrie(alphabet)
        for idx, token in enumerate(vocab):
            self.vocab_trie[token] = idx

        self.eos_idx = self.model.vocab_index('</S>')
        self.eop_idx = self.model.vocab_index('</s>')
        print("Loaded.", file=sys.stderr)
예제 #3
0
 def __init__(self, lang):
     """Set up empty lookup structures for one supported language.

     Args:
         lang: Language code; only ``"en"`` and ``"he"`` pass the assertion.
     """
     supported = ("en", "he")
     assert lang in supported
     self.lang = lang
     # Maps a token to a list of titles; unseen tokens start with an empty list.
     self.token_to_titles = defaultdict(list)
     self.normalizer = normalizer(lang)
     # Trie over the characters in letter_scope.
     self.token_trie = datrie.BaseTrie(letter_scope)
     self._tfidf_scorer = TfidfScorer()
예제 #4
0
def make_trie(filename):
    """Build a trie whose keys are the words listed in ``filename``.

    The file is read as UTF-8 text, one word per line. Surrounding whitespace
    is stripped from each line and the word is stored with the value 0. The
    trie alphabet is the lowercase ASCII letters plus ``'*'``.

    Args:
        filename: Path of the word-list file.

    Returns:
        The populated ``datrie.BaseTrie``.
    """
    import io

    valid_chars = string.ascii_lowercase + '*'
    trie = datrie.BaseTrie(valid_chars)
    # Decode while reading rather than calling ``.decode`` on every line:
    # on Python 3 text lines are ``str`` and have no ``decode`` method, while
    # ``io.open`` with an encoding yields unicode text on Python 2 and 3 alike.
    with io.open(filename, encoding='utf-8') as f:
        for line in f:
            trie[line.strip()] = 0
    return trie
예제 #5
0
파일: test_trie.py 프로젝트: zwcdp/datrie
def test_save_load_base():
    """Check that a ``BaseTrie`` keeps its values across ``save`` and ``load``."""
    import os

    fd, fname = tempfile.mkstemp()
    # mkstemp hands back an already-open descriptor. The trie is saved and
    # loaded by path, so close the descriptor right away instead of leaking it.
    os.close(fd)
    try:
        trie = datrie.BaseTrie(alphabet=string.printable)
        trie['foobar'] = 1
        trie['foovar'] = 2
        trie['baz'] = 3
        trie['fo'] = 4
        trie.save(fname)

        trie2 = datrie.BaseTrie.load(fname)
        assert trie2['foobar'] == 1
        assert trie2['baz'] == 3
        assert trie2['fo'] == 4
        assert trie2['foovar'] == 2
    finally:
        # Do not leave the temporary file behind, even when an assert fails.
        os.remove(fname)
예제 #6
0
def test_base_trie_data():
    """Check the data reported by a ``BaseIterator`` started from a walked state.

    With keys ``'x'`` -> 1 and ``'xo'`` -> 2, the first entry iterated after
    walking ``'x'`` carries 1, and after additionally walking ``'o'`` it
    carries 2.
    """
    trie = datrie.BaseTrie(string.printable)
    trie['x'] = 1
    trie['xo'] = 2

    def first_data(from_state):
        # Start a fresh iterator at the given state, advance once, and report
        # the data of the entry it lands on.
        iterator = datrie.BaseIterator(from_state)
        iterator.next()
        return iterator.data()

    state = datrie.BaseState(trie)

    state.walk('x')
    assert first_data(state) == 1

    state.walk('o')
    assert first_data(state) == 2
예제 #7
0
    def __enter__(self):
        """Open the database connection and the trie for use in a ``with`` block.

        When ``self.db`` is falsy, the parent directory of ``self.dbPath`` is
        created if needed, a SQLite connection is opened, and ``initDB()`` is
        run if ``isInitialized()`` reports false. The trie is then obtained
        from ``loadTrie()``; if that returns ``None`` a new empty trie covering
        the whole Unicode range is created, and ``createDB()`` and ``save()``
        are called.

        Returns:
            ``self``.
        """
        if not self.db:
            # NOTE(review): the original computed
            # ``needCreate = needCreate and not self.dbPath.exists()`` here with
            # ``needCreate`` still False, so the file-existence check never ran
            # and never had any effect. It was presumably meant to use ``or``;
            # confirm the intent before reintroducing it. Runtime behaviour is
            # left unchanged by removing the dead expression.
            self.dbPath.parent.mkdir(parents=True, exist_ok=True)
            self.db = sqlite3.connect(str(self.dbPath))
            if not self.isInitialized():
                self.initDB()

        self.dt = self.loadTrie()
        # Creation is needed exactly when no trie could be loaded.
        needCreate = self.dt is None
        if needCreate:
            # Alphabet range spans every Unicode code point.
            self.dt = datrie.BaseTrie(ranges=[("\0", '\U0010ffff')])

        self.trieWasModified = False

        if needCreate:
            self.createDB()
            self.save()

        return self
예제 #8
0
파일: test_trie.py 프로젝트: zwcdp/datrie
def test_trie_file_io():
    """Check that ``write``/``read`` work in the middle of an open file.

    A trie is written between two pickled payloads on the same file object,
    then all three are read back in order and compared with the originals.
    """
    import os

    fd, fname = tempfile.mkstemp()
    # mkstemp hands back an already-open descriptor. The file is reopened by
    # name below, so close the descriptor right away instead of leaking it.
    os.close(fd)
    try:
        trie = datrie.BaseTrie(string.printable)
        trie['foobar'] = 1
        trie['foo'] = 2

        extra_data = ['foo', 'bar']

        # Buffering is disabled (third argument 0) as in the original test;
        # presumably so pickle's output and the trie's own writes stay in order.
        with open(fname, "wb", 0) as f:
            pickle.dump(extra_data, f)
            trie.write(f)
            pickle.dump(extra_data, f)

        with open(fname, "rb", 0) as f:
            extra_data2 = pickle.load(f)
            trie2 = datrie.BaseTrie.read(f)
            extra_data3 = pickle.load(f)

        assert extra_data2 == extra_data
        assert extra_data3 == extra_data
        assert trie2['foobar'] == 1
        assert trie2['foo'] == 2
        assert len(trie2) == len(trie)
    finally:
        # Do not leave the temporary file behind, even when an assert fails.
        os.remove(fname)
예제 #9
0
    def __get_trie(self):
        """
        Opens and returns the trie if located on backing storage.
        If the trie does not exist, a new one is created and saved!
        """

        if os.path.exists(self.__vocab_trie_path):
            print "Loading trie..."
            return datrie.BaseTrie.load(self.__vocab_trie_path)
        else:
            print "Trie not found - creating..."
            trie = datrie.BaseTrie(
                string.printable
            )  # Our acceptable characters - everything in string.printable

            for input_line in self.__vocab_handle:
                input_line = input_line.strip().split(',')
                term = unicode(input_line[0])
                frequency = long(input_line[1])

                trie[term] = frequency

            trie.save(self.__vocab_trie_path)
            return trie
예제 #10
0
 def trie_graph(self, lst):
     """Return a trie over uppercase ASCII letters keyed by every item of ``lst``.

     Each key is stored with the value 0.
     """
     graph = datrie.BaseTrie(string.ascii_uppercase)
     for word in lst:
         graph[word] = 0
     return graph
예제 #11
0
]

# Gather every road name: the keys of roadsDic plus all names listed under
# each key.
roads = list(roadsDic)
for k in roadsDic:
    roads.extend(roadsDic[k])

# Subway
adminAreas.extend(roads)
# Streets
adminAreas.extend(streets)
# Housing estates (`estate`) are currently not added to adminAreas.

# Drop duplicate names.
adminAreas = list(set(adminAreas))

# Build the tries. Each alphabet is derived from the characters that actually
# occur in the keys; an alternative considered for tAdmin was the fixed
# ranges '\u4e00'-'\u9fcb' and '\uf900'-'\ufad6'.
adminWords = set(''.join(adminAreas))
tAdmin = datrie.BaseTrie(list(adminWords))
tPrice = datrie.BaseTrie("一二两三四五六七八九十百千1234567890")
tTag = datrie.BaseTrie(set(''.join(tags)))

# Each area name maps to its position in the de-duplicated list.
for i, aA in enumerate(adminAreas):
    tAdmin[aA] = i

# Keys are strings and values are integers. A key whose first character has a
# code point above 60 (so not an ASCII digit) is looked up in priceDic;
# any other key is parsed directly as an integer.
for r in prices:
    tPrice[r] = priceDic[r] if ord(r[0]) > 60 else int(r)

# Tags only need to be present in the trie; the stored value is always 0.
for t in tags:
    tTag[t] = 0

data = {'admin': tAdmin, 'price': tPrice, 'tag': tTag}
예제 #12
0
import time
import timeit
import itertools
import text_example
import memory_profiler
import datrie

if __name__ == "__main__":
    print "RAM at start {:0.1f}MiB".format(memory_profiler.memory_usage()[0])
    # avoid building a temporary list of words in Python, store directly in the
    # Trie
    t0 = time.time()
    chars = set()
    for word in text_example.readers:
        chars.update(word)
    trie = datrie.BaseTrie(chars)

    t1 = time.time()
    print "Created a trie with a dictionary of {} characters in {:0.1f}s".format(
        len(chars), t1 - t0)
    readers = text_example.read_words(text_example.SUMMARISED_FILE)
    for word in readers:
        trie[word] = 0
    t2 = time.time()
    print "RAM after creating trie {:0.1f}MiB, took {:0.1f}s".format(
        memory_profiler.memory_usage()[0], t2 - t1)
    print "The trie contains {} words".format(len(trie))

    assert u'Zwiebel' in trie
    time_cost = sum(
        timeit.repeat(stmt="u'Zwiebel' in trie",
예제 #13
0
def build_trie(dict_file='/usr/share/dict/words'):
    """Create a lowercase-ASCII trie and fill it from a word-list file.

    Args:
        dict_file: Path of the word list; defaults to the system dictionary
            at ``/usr/share/dict/words``.

    Returns:
        The ``datrie.BaseTrie`` after ``filter_dict_into_trie`` has been run
        on the open file and the trie.
    """
    word_trie = datrie.BaseTrie(string.ascii_lowercase)
    with open(dict_file, 'r') as source:
        # Which words are inserted is decided entirely by the helper.
        filter_dict_into_trie(source, word_trie)
    return word_trie
예제 #14
0
 def __init__(self):
     """Create an empty trie over lowercase letters, digits and some punctuation."""
     # Allowed key characters: a-z, 0-9 and "()&-., " (the last one is a space).
     self.trie = datrie.BaseTrie(
         u'abcdefghijklmnopqrstuvwxyz0123456789()&-., ')