Exemplo n.º 1
0
from chartrie import CharTrie, FrozenCharTrie
t = CharTrie()
print len(t)
t['hello'] = 49
print len(t)
t['hello1'] = 50
print len(t)
t['hello2'] = 51
print len(t)
t['hello3'] = 52
print len(t)
t['hello4'] = 53
print len(t)
t['help'] = 54
print len(t)
t['abc'] = 55
print len(t)
t['v'] = 56
print len(t)
t['a'] = 57
print len(t)
t[''] = 58
print len(t)
print 'get:', t['']
print 'get:', t['hel']
t.debug_print()
stream = t.dumps()
f = open('dump.trie', 'wb')
f.write(stream)
f.close()
d = FrozenCharTrie()
Exemplo n.º 2
0
    def convert_and_save(self, data_obj):
        print "Filling lemmas tree..."
        enc = self.encoding
        lemmas = CharTrie()
        paradigm_list = []
        paradigm_dict = {}
        for lemma, paradigms in data_obj.lemmas.iteritems():
            if lemma == "#":
                lemma = ""
            paradigms.sort()
            tupar = tuple(paradigms)
            try:
                lemma_id = paradigm_dict[tupar]
            except KeyError:
                paradigm_list.append(tupar)
                lemma_id = paradigm_dict[tupar] = len(paradigm_list) - 1

            lemmas[lemma.encode(enc)] = lemma_id
        if DEBUG:
            print "Lemma trie size:", len(lemmas)
            print "Paradigm count:", len(paradigm_list)

        print "Filling suffixes tree..."
        from collections import defaultdict

        suffix_dict = defaultdict(list)
        suffix_count = 0
        suffixes = CharTrie()

        initial_forms = {}
        for paradigm_id, rules in data_obj.rules.iteritems():
            initial_forms[paradigm_id] = rules[0][0]
            for rule_suffix, rule_ancode, rule_prefix in rules:
                enc_suffix = rule_suffix.encode(enc)[::-1]
                suffix_id = suffixes[enc_suffix]
                if suffix_id is None:
                    suffix_id = suffixes[enc_suffix] = suffix_count
                    suffix_count += 1
                suffix_dict[suffix_id, paradigm_id].append((rule_ancode, rule_prefix))

        if DEBUG:
            print "Done."
            para_count = sum(map(len, data_obj.lemmas.itervalues()))
            rule_count = sum(map(len, data_obj.rules.itervalues()))
            rule_count2 = sum(map(len, suffix_dict.itervalues()))
            assert rule_count == rule_count2
            print "Total number of paradigms: %s" % para_count
            print "Total number of rules: %s" % rule_count
            print "Total number of rules2: %s" % rule_count2

            print "Suffix trie size:", len(suffixes)
            print "Comb size:", len(suffix_dict)

            if False:  # ultra-hardcore-debug mode
                from pymorphy.console import reprint

                for k, v in data_obj.__dict__.iteritems():
                    if isinstance(v, dict) and len(v) >= 3:
                        print "Length of %s = %s" % (k, len(v)),
                        k2 = v.keys()[0]
                        print "Sample:",
                        reprint((k2, "->", v[k2]))
                    elif isinstance(v, (list, tuple, set)) and len(v) >= 3:
                        print "Length of %s = %s" % (k, len(v)),
                        print "Sample 0:",
                        reprint(list(v)[0])
                    else:
                        print "Value of  %s =" % k,
                        reprint(v)

        self._save_file("lemmas", lemmas.dumps())
        self._save_file("suffixes", suffixes.dumps())

        endings_shelve = self._get_shelf("endings", "c", "unicode")

        for end, value in data_obj.endings.iteritems():
            endings_shelve[end] = value
        endings_shelve.close()

        rules_shelve = self._get_shelf("rules", "c", "int", "pickle")
        for rule in data_obj.rules:
            rules_shelve[rule] = data_obj.rules[rule]
        rules_shelve.close()

        misc_shelve = self._get_shelf("misc", "c", "unicode", "pickle")
        misc_shelve["encoding"] = enc
        misc_shelve["gramtab"] = data_obj.gramtab
        misc_shelve["prefixes"] = list(data_obj.prefixes)
        misc_shelve["possible_rule_prefixes"] = list(data_obj.possible_rule_prefixes)
        misc_shelve["paradigm_list"] = paradigm_list
        misc_shelve["suffix_dict"] = suffix_dict
        misc_shelve["initial_forms"] = initial_forms
        misc_shelve.close()
Exemplo n.º 3
0
from chartrie import CharTrie, FrozenCharTrie
t = CharTrie()
print len(t)
t['hello'] = 49
print len(t)
t['hello1'] = 50
print len(t)
t['hello2'] = 51
print len(t)
t['hello3'] = 52
print len(t)
t['hello4'] = 53
print len(t)
t['help'] = 54
print len(t)
t['abc'] = 55
print len(t)
t['v'] = 56
print len(t)
t['a'] = 57
print len(t)
t[''] = 58
print len(t)
print 'get:', t['']
print 'get:', t['hel']
t.debug_print()
stream = t.dumps()
f = open('dump.trie', 'wb')
f.write(stream)
f.close()
d = FrozenCharTrie()