def load_dawg_caches(self, correct_cache_fn, correction_indeces_fn, corrections_list_fn):
    self.correct_cache = dawg.DAWG().load(correct_cache_fn)
    self.correction_indeces_cache = dawg.IntDAWG().load(correction_indeces_fn)
    self.corrections_list = cPickle.load(open(corrections_list_fn))
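# The writer side is not shown in this excerpt; a minimal sketch of producing
# the three files that load_dawg_caches() expects. The attribute names and the
# layout of the corrections data ({misspelling: list of candidate corrections})
# are assumptions, not the original format.
def save_dawg_caches(self, correct_cache_fn, correction_indeces_fn, corrections_list_fn):
    # self.correct_words (hypothetical): iterable of known-correct words
    # self.corrections (hypothetical): dict {misspelling: [candidate corrections]}
    keys = list(self.corrections)
    dawg.DAWG(self.correct_words).save(correct_cache_fn)
    dawg.IntDAWG((k, i) for i, k in enumerate(keys)).save(correction_indeces_fn)
    cPickle.dump([self.corrections[k] for k in keys], open(corrections_list_fn, "w"))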
def init():
    global surname_list, place_list, org_list, exclude_list
    global place_trie, org_trie, person_trie, custom_trie, ne_trie
    person_trie = load_ne_from_file(nr_file, nr_save, "PER")
    place_trie = load_ne_from_file(ns_file, ns_save, "LOC")
    org_trie = load_ne_from_file(nt_file, nt_save, "ORG")
    custom_trie = load_ne_from_file(custom_file, custom_save, "ALL")
    ne_list = {}
    get_ne(nr_file, "PER", ne_list)
    get_ne(ns_file, "LOC", ne_list)
    get_ne(nt_file, "ORG", ne_list)
    ne_trie = dawg.IntDAWG(zip(ne_list.keys(), ne_list.values()))
    if os.path.exists(surname_save):
        surname_list = cPickle.load(open(surname_save, "r"))
    else:
        f = open(surname_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            surname = seg[0]
            if len(surname) == 1:
                surname_list["single"].append(surname)
            else:
                surname_list["double"].append(surname)
        f.close()
        cPickle.dump(surname_list, open(surname_save, "w"))
    place_list = load_list_from_file(place_file, place_save)
    org_list = load_list_from_file(org_file, org_save)
    exclude_list = load_list_from_file(exclude_file, exclude_save)
def train_unigram(corpus):
    """
    Train the unigram model
    """
    freqs = defaultdict(int)  # Frequency dictionary
    # Read from the corpus file
    with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
        for line in fin:
            line = line.strip()
            if line == u"<S>":
                freqs[u"*START*"] += 1
                continue
            if line == u"</S>":
                freqs[u"*STOP*"] += 1
                continue
            if line == "" or len(line.split("\t")) <= 2:
                continue
            lem = line.split("\t")[1]
            if lem != "":
                freqs[lem] += 1
    d = dawg.IntDAWG([(word, freq) for word, freq in freqs.iteritems()])
    d.save(pl(corpus + "_freqs_1.dawg"))
    # Serialization: remember the total number of words in the corpus
    with open(pl(corpus + "_1_sum.pkl"), "wb") as fout:
        pickle.dump(sum(freqs.values()), fout, pickle.HIGHEST_PROTOCOL)
    print corpus, "unigram saved"
    return True
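# A hedged sketch of reading the model back (not part of the original code):
# the file names follow train_unigram's output, and pl() is the same path
# helper used above.
def unigram_prob(corpus, lemma):
    d = dawg.IntDAWG()
    d.load(pl(corpus + "_freqs_1.dawg"))
    with open(pl(corpus + "_1_sum.pkl"), "rb") as fin:
        total = pickle.load(fin)  # total word count saved by train_unigram
    return float(d[lemma]) / total if lemma in d else 0.0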
def load_from_files(pickle_fn, dawg_fn, prefix_dawg_fn):
    entity_db = cPickle.load(open(pickle_fn, "rb"))
    entity_db.dawg = dawg.IntCompletionDAWG()
    entity_db.dawg.load(dawg_fn)
    entity_db.long_entities = dawg.IntDAWG()
    entity_db.long_entities.load(prefix_dawg_fn)
    return entity_db
def train_collocations(corpus):
    """
    Collect collocations over the corpus (over non-adjacent word bigrams
    not separated by any punctuation except dashes).
    (Training of the collocation model)
    """
    bigrams = unpkl_1layered_i(pl(corpus + ".bigrams.pkl"))
    unigrams = unpkl_1layered_i(pl(corpus + ".unigrams.pkl"))
    collocs = defaultdict(int)
    N = sum(unigrams.values())
    for bigram, freq in bigrams.iteritems():
        try:
            collocs[bigram] = MI(freq, unigrams[bigram[0]], unigrams[bigram[1]], N)
        except Exception:
            print freq, unigrams[bigram[0]], unigrams[bigram[1]], N
            continue
    # Test output to a file
    with codecs.open(pl(corpus + ".collocs"), "w", encoding="UTF8") as fout:
        fout.write("Collocate_1\tCollocate_2\tFreqs\tMI\n")
        for bigram, mi in sorted(collocs.iteritems(), key=lambda x: x[1], reverse=True):
            fout.write(u"{0}\t{1:d}\t{2:.4f}\n".format(
                "\t".join(bigram), bigrams[bigram], mi))
    dcollocs = dawg.IntDAWG([("|".join(bigram), int(freq * 100))
                             for bigram, freq in collocs.iteritems()])
    dcollocs.save(pl(corpus + ".collocs.dawg"))
    print "Collocations saved"
    return True
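# MI() is not defined in this excerpt; a minimal sketch, assuming it computes
# pointwise mutual information over raw corpus counts:
#   PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) )
#               = log2( pair_freq * N / (freq1 * freq2) )
import math

def MI(pair_freq, freq1, freq2, N):
    return math.log(float(pair_freq) * N / (freq1 * freq2), 2)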
def build_test_data():
    dawg.CompletionDAWG(['f', 'bar', 'foo', 'foobar']).save('dev_data/small/completion.dawg')
    dawg.CompletionDAWG([]).save('dev_data/small/completion-empty.dawg')

    bytes_data = (('foo', b'data1'), ('bar', b'data2'),
                  ('foo', b'data3'), ('foobar', b'data4'))
    dawg.BytesDAWG(bytes_data).save('dev_data/small/bytes.dawg')

    record_data = (('foo', (3, 2, 256)), ('bar', (3, 1, 0)),
                   ('foo', (3, 2, 1)), ('foobar', (6, 3, 0)))
    dawg.RecordDAWG(str(">3H"), record_data).save('dev_data/small/record.dawg')

    int_data = {'foo': 1, 'bar': 5, 'foobar': 3}
    dawg.IntDAWG(int_data).save('dev_data/small/int_dawg.dawg')
    dawg.IntCompletionDAWG(int_data).save('dev_data/small/int_completion_dawg.dawg')

    dawg.DAWG(TestPrediction.DATA).save('dev_data/small/prediction.dawg')
    dawg.RecordDAWG(str("=H"), [
        (k, (len(k),)) for k in TestPrediction.DATA
    ]).save('dev_data/small/prediction-record.dawg')

    create_dawg().save('dev_data/large/dawg.dawg')
    create_bytes_dawg().save('dev_data/large/bytes_dawg.dawg')
    create_record_dawg().save('dev_data/large/record_dawg.dawg')
    create_int_dawg().save('dev_data/large/int_dawg.dawg')
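# A quick round-trip check for the small fixtures above (a sketch, not part of
# the original suite; the paths match build_test_data's output):
def check_test_data():
    d = dawg.IntDAWG().load('dev_data/small/int_dawg.dawg')
    assert d['bar'] == 5
    assert 'foobar' in d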
def load_ne_from_file(fname, sname, tag):
    if os.path.exists(sname):
        temp_trie = cPickle.load(open(sname, "r"))
    else:
        temp_ne = get_ne(fname, tag)
        temp_trie = dawg.IntDAWG(zip(temp_ne.keys(), temp_ne.values()))
        cPickle.dump(temp_trie, open(sname, "w"))
    return temp_trie
def finalize_long_entities(self):
    logging.info("Creating prefix dawg...")
    self.long_values = {}
    self.long_entities = dawg.IntDAWG(
        (p, self.long_values.setdefault(frozenset(full), len(self.long_values)))
        for p, full in self.long_entities.iteritems())
    self.long_values = [k for k, _ in sorted(self.long_values.iteritems(),
                                             key=lambda x: x[1])]
def train_ngram(corpus, n):
    """
    Train the N-gram model (2- or 3-gram)
    """
    if n == 3:
        freqs = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))  # Frequency dictionary
    elif n == 2:
        freqs = defaultdict(lambda: defaultdict(int))
    else:
        raise ValueError("Wrong n parameter for N-gram model training!")
    # Read from the corpus file
    with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
        for line in fin:
            line = line.strip()
            if line == u"<S>":
                buff = ["*START*"] * n
                continue
            if line == u"</S>":  # End of sentence
                buff.append(u"*STOP*")
                Synonymizer.add_counts(freqs, buff, n)
            if line == "" or len(line.split("\t")) <= 2:  # Punctuation mark
                continue
            # Accumulate lemma frequencies
            lem = line.split("\t")[1]
            if lem != "":
                buff.append(lem)
    if n == 3:
        d = dawg.IntDAWG([("|".join((word1, word2, word3)), freq)
                          for word1, v1 in freqs.iteritems()
                          for word2, v2 in v1.iteritems()
                          for word3, freq in v2.iteritems()])
    elif n == 2:
        d = dawg.IntDAWG([("|".join((word1, word2)), freq)
                          for word1, v1 in freqs.iteritems()
                          for word2, freq in v1.iteritems()])
    d.save(pl(corpus + "_freqs_" + str(n) + ".dawg"))  # Serialization
    print corpus, n, "gram saved"
    return True
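# A hedged sketch of querying the saved model (not part of the original code):
# keys are lemmas joined by "|" exactly as built in train_ngram, and pl() is
# the same path helper used above.
def ngram_freq(corpus, words):
    d = dawg.IntDAWG()
    d.load(pl(corpus + "_freqs_" + str(len(words)) + ".dawg"))
    key = "|".join(words)
    return d[key] if key in d else 0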
def test_contains():
    d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3})
    assert 'foo' in d
    assert 'bar' in d
    assert 'foobar' in d
    assert 'fo' not in d
    assert 'x' not in d
    assert b'foo' in d
    assert b'x' not in d
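# A companion test (a sketch, not from the original suite) exercising value
# lookup alongside membership; IntDAWG indexing raises KeyError for non-keys:
def test_getitem():
    d = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3})
    assert d['foo'] == 1
    assert d['foobar'] == 3
    try:
        d['fo']  # a prefix of a key is not itself a key
        assert False, 'expected KeyError'
    except KeyError:
        pass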
def buildDAWG(filename):
    my_list = []
    with open(filename, 'r') as fp:
        for each in fp:
            line = each.strip().split(' ')
            print(line)
            my_list.append((line[0], int(line[1])))
    my_dawg = dawg.IntDAWG(my_list)
    with open(PINYINDAWG, 'wb') as f:
        my_dawg.write(f)
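# A hedged sketch of reading the DAWG back (not part of the original module);
# PINYINDAWG is the module-level output path used by buildDAWG above:
def loadDAWG():
    my_dawg = dawg.IntDAWG()
    my_dawg.load(PINYINDAWG)
    return my_dawg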
def buildDAWG(filename, savename):
    with open(filename, 'r') as fp:
        result = json.load(fp)
    my_dict = {}
    for each in result:
        if "W" in each:
            my_dict.update(result[each])
    my_list = []
    index = 0
    for each in my_dict:
        my_list.append((each, my_dict[each]))
        index += 1
    my_list.append(("__total__", index))
    my_dawg = dawg.IntDAWG(my_list)
    with open(savename, 'wb') as f:
        my_dawg.write(f)
def train_bayes(corpus, n):
    """
    Train the context model for the naive Bayes classifier
    """
    syns = {}
    with open(pl("total.pkl"), "rb") as fin:  # Load the synonym dictionary
        syns = pickle.load(fin)
    all_syns = set(syns.keys())
    all_syns = all_syns.union(
        set(itertools.chain.from_iterable(syns.values())))  # All synonyms
    contexts = defaultdict(lambda: defaultdict(int))  # Context dictionary
    sentence = []
    # Read from the corpus file and write to the output file
    with codecs.open(pl(corpus), "r", encoding="UTF8") as fin:
        for line in fin:
            line = line.strip()
            if line == u"<S>":
                sentence = [u"*START*"]
                continue
            if line == u"</S>":
                sentence.append(u"*STOP*")
                Synonymizer.add_contexts(all_syns, contexts, sentence, n)
                sentence = []
            if line == "" or len(line.split("\t")) <= 2:
                continue
            sentence.append(line.split("\t")[1])
    d = dawg.IntDAWG([("|".join((word1, word2)), freq)
                      for word1, v1 in contexts.iteritems()
                      for word2, freq in v1.iteritems()])
    d.save(pl(corpus + "_contexts_" + str(n) + ".dawg"))  # Serialize the dictionaries
    print corpus, "bayes saved"
    return True
def get_freqs(self, freq_file_path):
    if self.freq_struct_is_dawg:
        self.freqs = dawg.IntDAWG().load(freq_file_path)
    else:
        self.freqs = cPickle.load(open(freq_file_path))
def __init__(self, mode='one', debug='no'):
    if debug == 'no':
        logging.basicConfig()
    else:
        logging.basicConfig(level=logging.DEBUG)
    self.logger = logging.getLogger()
    self.logger.debug('Setting up the Accentor...')
    self.mode = mode
    self.__all_russian_letters = {
        'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л',
        'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш',
        'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'
    }
    self.__russian_vowels = {'а', 'о', 'у', 'э', 'ы', 'и', 'я', 'ё', 'ю', 'е'}
    self.__homonyms = None
    self.__simple_words_dawg = None
    self.__function_words = None
    self.__new_homonyms = {}
    self.__new_simple_words = set()
    self.__bad_words = []
    self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
    self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
    assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
    assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
    homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
    assert os.path.isfile(homograph_dictionary_name), \
        'File `{0}` does not exist!'.format(homograph_dictionary_name)
    simple_words_dawg_name = os.path.join(os.path.dirname(__file__), 'data', 'simple_words.dawg')
    assert os.path.isfile(simple_words_dawg_name), \
        'File `{0}` does not exist!'.format(simple_words_dawg_name)
    function_words_name = os.path.join(os.path.dirname(__file__), 'data', 'Function_words.json')
    assert os.path.isfile(function_words_name), \
        'File `{0}` does not exist!'.format(function_words_name)
    data = None
    try:
        d = dawg.IntDAWG()
        self.__simple_words_dawg = d.load(simple_words_dawg_name)
        ###
        with codecs.open(homograph_dictionary_name, mode='r', encoding='utf-8',
                         errors='ignore') as fp:
            data = json.load(fp)
        error_message_homographs = 'File `{0}` contains incorrect data!'.format(
            homograph_dictionary_name)
        assert isinstance(data, dict), error_message_homographs
        self.__homonyms = dict()
        for cur_wordform in data:
            assert self.check_source_wordform(cur_wordform), \
                error_message_homographs + ' Word `{0}` is inadmissible!'.format(cur_wordform)
            assert (cur_wordform not in self.__homonyms) and \
                (cur_wordform.lower() not in self.__simple_words_dawg), \
                error_message_homographs + ' Word `{0}` is repeated!'.format(cur_wordform)
            assert isinstance(data[cur_wordform], dict), \
                error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
            for cur_key in data[cur_wordform]:
                assert self.check_morphotag(cur_key), \
                    error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
                assert self.check_accented_wordform(data[cur_wordform][cur_key]), \
                    error_message_homographs + ' Word `{0}` has incorrect description of accents!'.format(cur_wordform)
            values = [data[cur_wordform][it] for it in data[cur_wordform]]
            self.__homonyms[cur_wordform] = copy.deepcopy(data[cur_wordform])
        ###
        self.__function_words = None
        with codecs.open(function_words_name, mode='r', encoding='utf-8',
                         errors='ignore') as fp:
            function_words = json.load(fp)
        error_message_function_words = 'File `{0}` contains incorrect data!'.format(
            function_words_name)
        assert isinstance(function_words, list), error_message_function_words
        assert isinstance(function_words[0], str), error_message_function_words
        self.__function_words = function_words
    finally:
        if data is not None:
            del data
def init():
    global ne_list, surname_list, place_list, org_list, trie, place_trie, org_trie, person_trie
    global kw_list1, kw_list2, rule_list
    if os.path.exists(ne_save):
        ne_list = cPickle.load(open(ne_save, "r"))
    else:
        get_ne(ns_file, "ns")
        get_ne(nr_file, "nr")
        get_ne(nt_file, "nt")
        ne_list = collections.OrderedDict(sorted(ne_list.items()))
        # save NEs:
        cPickle.dump(ne_list, open(ne_save, "w"))
    print len(ne_list.keys())
    trie = dawg.IntDAWG(zip(ne_list.keys(), ne_list.values()))
    # trie = dawg.IntCompletionDAWG(zip(ne_list.keys(), ne_list.values()))
    place_ne = {k: v for k, v in ne_list.items() if v == 1}
    place_trie = dawg.IntDAWG(zip(place_ne.keys(), place_ne.values()))
    org_ne = {k: v for k, v in ne_list.items() if v == 2}
    org_trie = dawg.IntDAWG(zip(org_ne.keys(), org_ne.values()))
    person_ne = {k: v for k, v in ne_list.items() if v == 0}
    person_trie = dawg.IntDAWG(zip(person_ne.keys(), person_ne.values()))
    # load second-phase dicts.
    if os.path.exists(surname_save):
        surname_list = cPickle.load(open(surname_save, "r"))
    else:
        f = open(surname_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            surname = seg[0]
            if len(surname) == 1:
                surname_list["single"].append(surname)
            else:
                surname_list["double"].append(surname)
        f.close()
        cPickle.dump(surname_list, open(surname_save, "w"))
    if os.path.exists(place_save):
        place_list = cPickle.load(open(place_save, "r"))
        # print place_list
    else:
        f = open(place_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            place = seg[0]
            place_list.append(place)
        f.close()
        cPickle.dump(place_list, open(place_save, "w"))
    if os.path.exists(org_save):
        org_list = cPickle.load(open(org_save, "r"))
    else:
        f = open(org_file, "r")
        for line in f.readlines():
            seg = line.strip().decode("utf8").split()
            if not seg:
                continue
            org = seg[0]
            org_list.append(org)
        f.close()
        cPickle.dump(org_list, open(org_save, "w"))
    sw = open(stopword_file, "r")
    for line in sw.readlines():
        stopwords_list.append(line.strip())
    sw.close()
    kw = open(kw_file1, "r")
    for line in kw.readlines():
        kw_list1.append(line.strip().decode("utf8"))
    kw.close()
    kw = open(kw_file2, "r")
    for line in kw.readlines():
        kw_list2.append(line.strip().decode("utf8"))
    kw.close()
    return trie
def __enter__(self):
    self.lookup = dawg.IntDAWG()
    self.lookup.load(self.path)
    return self
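# The matching __exit__ is not shown in this excerpt; a minimal sketch,
# assuming nothing needs releasing beyond dropping the loaded lookup:
def __exit__(self, exc_type, exc_value, traceback):
    self.lookup = None
    return False  # do not suppress exceptions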
def loaddict(f_index='pyindex.dawg', f_essay='essay.dawg'):
    global essay, p_index
    p_index = dawg.BytesDAWG()
    p_index.load(f_index)
    essay = dawg.IntDAWG()
    essay.load(f_essay)
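# A hedged usage sketch (not from the original module): after loaddict(),
# p_index is a BytesDAWG, so p_index[word] returns a list of bytes payloads,
# while essay is an IntDAWG mapping a word to a single int. What the payloads
# and scores mean is an assumption here.
def lookup(word):
    payloads = p_index[word] if word in p_index else []
    score = essay[word] if word in essay else 0
    return payloads, score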
def __init__(self, morph, params):
    """
    Initialize the synonymizer
    """
    args = set(params)
    self.morph = morph  # pymorphy morphology
    self.UseIdioms = "-i" in args
    self.UseBayes = "-b" in args
    self.UseDisambig = "-disamb" in args
    self.UseNGram = "-n" in args
    self.UseDetails = "-d" in args
    self.UseOnlySyn = "-os" in args
    self.UseViterbi = "-v" in args
    self.UseCollocs = "-col" in args
    # Print the parameters the synonymizer was launched with (if required)
    if self.UseDetails:
        print "<font color='blue'>" + " ".join(params) + "</font><br><br>"
    # Synonym selection "level": 1 - exact, 2 - medium, 3 - loose, with maximal synonymization
    Level = 3
    if "-level" in args:
        try:
            Level = int(sys.argv[sys.argv.index("-level") + 1])
        except Exception:
            Level = 3
    # Name of the synonym dictionary file
    if "-dict" in args:
        try:
            synfile = sys.argv[sys.argv.index("-dict") + 1]
        except Exception:
            synfile = "base.pkl"
    else:
        raise IOError("Synonyms dictionary not found!")
    # Probability cutoff thresholds for the N-gram model, depending on the "level"
    SMALL_PROBS = {1: 0.5, 2: 0.2, 3: 0}
    # Maximum allowed number of equally ranked synonym variants, depending on the "level"
    VARS_COUNT = {1: 3, 2: 5, 3: 10}
    self.vars_count = VARS_COUNT[Level]
    self.small_prob = SMALL_PROBS[Level]
    (self.samplefile, corpusfile) = params[1:3]
    # Parameter check: exactly one synonymization method must be selected
    if sum(map(bool, (self.UseBayes, self.UseNGram, self.UseViterbi, self.UseCollocs))) != 1:
        raise ValueError(
            "Choose exactly one of the methods: Bayes, N-gram, Viterbi or collocations!")
    self.actions = {
        self.UseNGram: self.calc_ngram_sent,
        self.UseBayes: self.calc_bayes_sent,
        self.UseViterbi: self.calc_viterbi_sent,
        self.UseCollocs: self.calc_colloc_sent
    }
    # Load the serialized synonym dictionary. Format: {lexeme: set of synonymous lexemes}.
    self.syns = unpkl_1layered_sets(pl(synfile))
    # Load the idiom dictionary if required. Format: a set.
    if self.UseIdioms:
        self.idioms = set()
        with codecs.open(pl("idioms.txt.lemma"), "r", encoding="UTF8") as fin:
            for line in fin:
                self.idioms.add(line.strip())
    # Load collocations if required
    if self.UseCollocs:
        self.collocs = dawg.IntDAWG()
        self.collocs.load(pl(corpusfile + ".collocs.dawg"))
        self.posfreqs = unpkl_3layered_f(pl(corpusfile + ".pos.pkl"))
    # Load unigram frequencies and the total number of words in the corpus
    if not self.UseCollocs:
        self.freqs = dawg.IntDAWG()
        self.freqs.load(pl(corpusfile + "_freqs_1.dawg"))
        self.f_sum = 0
        with open(pl(corpusfile + "_1_sum.pkl"), "rb") as fin:
            self.f_sum = pickle.load(fin)
    # Load the context list if required
    if self.UseBayes:
        self.N = 5
        self.contexts = dawg.IntDAWG()
        self.contexts.load(pl(corpusfile + "_contexts_5.dawg"))
    # Load n-gram frequencies if required
    if self.UseNGram or self.UseViterbi:
        self.N = 3
        (self.freqs2, self.freqs3) = (dawg.IntDAWG(), dawg.IntDAWG())
        self.freqs2.load(pl(corpusfile + "_freqs_2.dawg"))
        self.freqs3.load(pl(corpusfile + "_freqs_3.dawg"))
    if self.UseCollocs:
        self.indexed_wsyns = defaultdict(str)  # Dictionary of the form {token index: synonym}
    else:
        self.indexed_syns = defaultdict(str)  # Dictionary of the form {token index: synonym}
def create_int_dawg():
    words = words100k()
    values = [len(word) for word in words]
    return dawg.IntDAWG(zip(words, values))
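# A quick sanity check (a sketch, not from the original suite; words100k() is
# the same fixture loader used by create_int_dawg above): every key should map
# to its own length.
def check_int_dawg():
    d = create_int_dawg()
    word = words100k()[0]
    assert d[word] == len(word)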
def __init__(self, mode='one', debug='no', exception_for_unknown=False, use_wiki=True):
    """Create an object for adding accents.

    Args:
        mode (str, optional): [description]. Defaults to 'one'.
        debug (str, optional): Print extra info useful for debugging. Defaults to 'no'.
        exception_for_unknown (bool, optional): Raise an exception in case of an unknown word.
            Defaults to False.
        use_wiki (bool, optional): Look for a word in Wiktionary. Defaults to True.
    """
    if debug == 'no':
        logging.basicConfig()
    else:
        logging.basicConfig(level=logging.DEBUG)
    self.logger = logging.getLogger()
    self.logger.debug('Setting up the Accentor...')
    self.mode = mode
    self.__rus_letters = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    self.__rus_vowels = set('аоуэыияёюе')
    self.exception_for_unknown = exception_for_unknown
    self.use_wiki = use_wiki
    self.__homonyms, self.__new_homonyms = {}, {}
    self.__simple_words_dawg = None  # maps unicode key -> int index of accented vowel
    self.__function_words = None
    self.__new_simple_words = set()
    self.__bad_words = []  # Too many accent variants, or unknown accent
    self.__re_for_morphosplit = re.compile(r'[\,\s\|]+', re.U)
    self.__re_for_morphotag = re.compile(r'^(\w+|\w+[\-\=]\w+)$', re.U)
    assert mode in ('one', 'many'), 'Set either "one" or "many" variant mode!'
    assert debug in ('yes', 'no'), 'Set either "yes" or "no" variant mode!'
    homograph_dictionary_name = os.path.join(os.path.dirname(__file__), 'data', 'homographs.json')
    assert os.path.isfile(homograph_dictionary_name), \
        f'File `{homograph_dictionary_name}` does not exist!'
    simple_words_dawg_name = os.path.join(os.path.dirname(__file__), 'data', 'simple_words.dawg')
    assert os.path.isfile(simple_words_dawg_name), \
        f'File `{simple_words_dawg_name}` does not exist!'
    function_words_name = os.path.join(os.path.dirname(__file__), 'data', 'Function_words.json')
    assert os.path.isfile(function_words_name), \
        f'File `{function_words_name}` does not exist!'
    data = None
    try:
        d = dawg.IntDAWG()
        self.__simple_words_dawg = d.load(simple_words_dawg_name)
        with codecs.open(homograph_dictionary_name, mode='r', encoding='utf-8',
                         errors='ignore') as fp:
            data = json.load(fp)
        error_message_homographs = f'File `{homograph_dictionary_name}` contains incorrect data!'
        assert isinstance(data, dict), error_message_homographs
        for cur_wordform in data:
            assert self.check_source_wordform(cur_wordform), \
                error_message_homographs + f' Word `{cur_wordform}` is inadmissible!'
            assert (cur_wordform not in self.__homonyms) and \
                (cur_wordform.lower() not in self.__simple_words_dawg), \
                error_message_homographs + f' Word `{cur_wordform}` is repeated!'
            assert isinstance(data[cur_wordform], dict), \
                error_message_homographs + \
                f' Word `{cur_wordform}` has incorrect description of accents!'
            for cur_key in data[cur_wordform]:
                assert self.check_morphotag(cur_key), \
                    error_message_homographs + \
                    f' Word `{cur_wordform}` has incorrect description of accents!'
                assert self.check_accented_wordform(data[cur_wordform][cur_key]), \
                    error_message_homographs + \
                    f' Word `{cur_wordform}` has incorrect description of accents!'
            # values = [data[cur_wordform][it] for it in data[cur_wordform]]
            # FIXME: variable `values` is unused
            self.__homonyms[cur_wordform] = copy.deepcopy(data[cur_wordform])
        self.__function_words = None
        with codecs.open(function_words_name, mode='r', encoding='utf-8',
                         errors='ignore') as fp:
            function_words = json.load(fp)
        error_message_function_words = f'File `{function_words_name}` contains incorrect data!'
        assert isinstance(function_words, list), error_message_function_words
        assert isinstance(function_words[0], str), error_message_function_words
        self.__function_words = function_words
    finally:
        if data is not None:
            del data