from chartrie import CharTrie, FrozenCharTrie t = CharTrie() print len(t) t['hello'] = 49 print len(t) t['hello1'] = 50 print len(t) t['hello2'] = 51 print len(t) t['hello3'] = 52 print len(t) t['hello4'] = 53 print len(t) t['help'] = 54 print len(t) t['abc'] = 55 print len(t) t['v'] = 56 print len(t) t['a'] = 57 print len(t) t[''] = 58 print len(t) print 'get:', t[''] print 'get:', t['hel'] t.debug_print() stream = t.dumps() f = open('dump.trie', 'wb') f.write(stream) f.close() d = FrozenCharTrie()
def convert_and_save(self, data_obj): print "Filling lemmas tree..." enc = self.encoding lemmas = CharTrie() paradigm_list = [] paradigm_dict = {} for lemma, paradigms in data_obj.lemmas.iteritems(): if lemma == "#": lemma = "" paradigms.sort() tupar = tuple(paradigms) try: lemma_id = paradigm_dict[tupar] except KeyError: paradigm_list.append(tupar) lemma_id = paradigm_dict[tupar] = len(paradigm_list) - 1 lemmas[lemma.encode(enc)] = lemma_id if DEBUG: print "Lemma trie size:", len(lemmas) print "Paradigm count:", len(paradigm_list) print "Filling suffixes tree..." from collections import defaultdict suffix_dict = defaultdict(list) suffix_count = 0 suffixes = CharTrie() initial_forms = {} for paradigm_id, rules in data_obj.rules.iteritems(): initial_forms[paradigm_id] = rules[0][0] for rule_suffix, rule_ancode, rule_prefix in rules: enc_suffix = rule_suffix.encode(enc)[::-1] suffix_id = suffixes[enc_suffix] if suffix_id is None: suffix_id = suffixes[enc_suffix] = suffix_count suffix_count += 1 suffix_dict[suffix_id, paradigm_id].append((rule_ancode, rule_prefix)) if DEBUG: print "Done." para_count = sum(map(len, data_obj.lemmas.itervalues())) rule_count = sum(map(len, data_obj.rules.itervalues())) rule_count2 = sum(map(len, suffix_dict.itervalues())) assert rule_count == rule_count2 print "Total number of paradigms: %s" % para_count print "Total number of rules: %s" % rule_count print "Total number of rules2: %s" % rule_count2 print "Suffix trie size:", len(suffixes) print "Comb size:", len(suffix_dict) if False: # ultra-hardcore-debug mode from pymorphy.console import reprint for k, v in data_obj.__dict__.iteritems(): if isinstance(v, dict) and len(v) >= 3: print "Length of %s = %s" % (k, len(v)), k2 = v.keys()[0] print "Sample:", reprint((k2, "->", v[k2])) elif isinstance(v, (list, tuple, set)) and len(v) >= 3: print "Length of %s = %s" % (k, len(v)), print "Sample 0:", reprint(list(v)[0]) else: print "Value of %s =" % k, reprint(v) self._save_file("lemmas", lemmas.dumps()) self._save_file("suffixes", suffixes.dumps()) endings_shelve = self._get_shelf("endings", "c", "unicode") for end, value in data_obj.endings.iteritems(): endings_shelve[end] = value endings_shelve.close() rules_shelve = self._get_shelf("rules", "c", "int", "pickle") for rule in data_obj.rules: rules_shelve[rule] = data_obj.rules[rule] rules_shelve.close() misc_shelve = self._get_shelf("misc", "c", "unicode", "pickle") misc_shelve["encoding"] = enc misc_shelve["gramtab"] = data_obj.gramtab misc_shelve["prefixes"] = list(data_obj.prefixes) misc_shelve["possible_rule_prefixes"] = list(data_obj.possible_rule_prefixes) misc_shelve["paradigm_list"] = paradigm_list misc_shelve["suffix_dict"] = suffix_dict misc_shelve["initial_forms"] = initial_forms misc_shelve.close()