def convert_from_conllu(input_filename, output_filename, with_forth_column=False,
                        with_punct=True, add_number=False):
    with open(input_filename, "r", encoding='utf-8') as r, \
            open(output_filename, "w", encoding='utf-8') as w:
        i = 0
        for line in r:
            # Skip comment and metadata lines.
            if line[0] == "#" or line[0] == "=":
                continue
            # An empty line separates sentences; reset the token counter.
            if line == "\n":
                w.write("\n")
                i = 0
                continue
            records = line.split("\t")
            pos = records[3]
            # Grammemes come either from the sixth column (FEATS) or the fifth one.
            if with_forth_column:
                gram = records[5]
            else:
                gram = records[4]
            gram = process_gram_tag(gram)
            if pos == "PUNCT" and not with_punct:
                continue
            if add_number:
                i += 1
                w.write("\t".join([str(i), records[1], records[2].lower(), pos, gram]) + "\n")
            else:
                w.write("\t".join([records[1], records[2].lower(), pos, gram]) + "\n")
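# A usage sketch for convert_from_conllu; "example.conllu" and "example.txt"
# are hypothetical file names. With with_forth_column=True the grammemes are
# taken from the standard CoNLL-U FEATS column, so an input token line like
#   1	Мама	мама	NOUN	_	Case=Nom|Gender=Fem|Number=Sing	...
# becomes (modulo process_gram_tag) the numbered five-column line
#   1	Мама	мама	NOUN	Case=Nom|Gender=Fem|Number=Sing
if __name__ == "__main__":
    convert_from_conllu("example.conllu", "example.txt",
                        with_forth_column=True, with_punct=False, add_number=True)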
def add_grammemes(self, pos_tag: str, gram: str) -> int:
    """
    Add a new grammatical value to the set of known ones.
    """
    gram = process_gram_tag(gram)
    vector_name = pos_tag + '#' + gram
    if vector_name not in self.name_to_index:
        self.name_to_index[vector_name] = len(self.name_to_index)
        self.all_grammemes["POS"].add(pos_tag)
        grammemes = gram.split("|") if gram != "_" else []
        for grammeme in grammemes:
            category, value = grammeme.split("=")
            self.all_grammemes[category].add(value)
    return self.name_to_index[vector_name]
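# A minimal sketch of the naming scheme add_grammemes relies on: every known
# tag is keyed as "<POS>#<grammemes>" and its values are accumulated per
# category. name_to_index and all_grammemes below are toy stand-ins for the
# vectorizer's attributes; the gram string is assumed to be already processed
# by process_gram_tag.
from collections import defaultdict

name_to_index = {}
all_grammemes = defaultdict(set)

gram = "Case=Nom|Number=Sing"
vector_name = "NOUN" + "#" + gram
name_to_index.setdefault(vector_name, len(name_to_index))
all_grammemes["POS"].add("NOUN")
for grammeme in gram.split("|"):
    category, value = grammeme.split("=")
    all_grammemes[category].add(value)

print(name_to_index)        # {'NOUN#Case=Nom|Number=Sing': 0}
print(dict(all_grammemes))  # {'POS': {'NOUN'}, 'Case': {'Nom'}, 'Number': {'Sing'}}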
def __process_line(self, line: str) -> None:
    """
    Process one line of a morphologically annotated corpus.
    :param line: tab-separated line: word, lemma, POS tag, grammemes.
    """
    text, lemma, pos_tag, grammemes = line.strip().split("\t")[0:4]
    # Collect the set of possible output tags.
    self.grammeme_vectorizer_output.add_grammemes(pos_tag, grammemes)
    # Collect the set of possible input tags.
    for parse in self.morph.parse(text):
        pos, gram = convert_from_opencorpora_tag(self.converter, parse.tag, text)
        gram = process_gram_tag(gram)
        self.grammeme_vectorizer_input.add_grammemes(pos, gram)
def __get_lemma(self, word: str, pos_tag: str, gram: str, word_forms=None,
                enable_normalization: bool = True):
    """
    Get the lemma.
    :param word: the word.
    :param pos_tag: part of speech.
    :param gram: grammatical value.
    :param enable_normalization: whether to normalize as in the GIKRYA corpus.
    :return: the lemma.
    """
    if '_' in word:
        return word
    if self.language == "ru":
        if word_forms is None:
            word_forms = self.morph.parse(word)
        # Pick the parse whose grammemes overlap most with the target tag.
        guess = None
        max_common_tags = 0
        for word_form in word_forms:
            word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
                self.converter, word_form.tag, word)
            word_form_gram = process_gram_tag(word_form_gram)
            common_tags_len = len(set(word_form_gram.split("|")) & set(gram.split("|")))
            if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
                max_common_tags = common_tags_len
                guess = word_form
        if guess is None:
            guess = word_forms[0]
        if enable_normalization:
            lemma = self.__normalize_for_gikrya(guess)
        else:
            lemma = guess.normal_form
        return lemma
    elif self.language == "en":
        lemmatizer = nltk.stem.WordNetLemmatizer()
        # Map universal POS tags to WordNet POS codes; default to noun.
        pos_map = defaultdict(lambda: 'n')
        pos_map.update({'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'})
        return lemmatizer.lemmatize(word, pos=pos_map[pos_tag])
    else:
        raise ValueError("Unsupported language: " + self.language)
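# A self-contained illustration of the parse-selection heuristic used in
# __get_lemma: among candidate analyses, prefer the one sharing the most
# grammemes with the target tag. The candidate tag strings here are made up.
target_gram = "Case=Nom|Gender=Fem|Number=Sing"
candidates = [
    ("мама", "Case=Nom|Gender=Fem|Number=Sing"),
    ("мам", "Case=Gen|Gender=Fem|Number=Plur"),
]

def overlap(gram_a: str, gram_b: str) -> int:
    return len(set(gram_a.split("|")) & set(gram_b.split("|")))

best_lemma, _ = max(candidates, key=lambda pair: overlap(pair[1], target_gram))
print(best_lemma)  # мама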
import numpy as np
import pymorphy2
from typing import List
from russian_tagsets import converters

# CHAR_SET, GrammemeVectorizer, process_gram_tag and convert_from_opencorpora_tag
# are assumed to be provided by the surrounding package.


def get_sample(sentence: List[str], morph: pymorphy2.MorphAnalyzer,
               grammeme_vectorizer: GrammemeVectorizer, max_word_len: int):
    """
    Build features for a single sentence.
    :param sentence: the sentence.
    :param morph: morphological analyzer.
    :param grammeme_vectorizer: grammeme dictionary.
    :param max_word_len: number of characters of a word to process.
    :return: grammeme vectors, character indices.
    """
    to_ud = converters.converter('opencorpora-int', 'ud14')
    word_char_vectors = []
    word_gram_vectors = []
    for word in sentence:
        char_indices = np.zeros(max_word_len)
        gram_value_indices = np.zeros(grammeme_vectorizer.grammemes_count())
        # Character indices of the word, right-aligned in a fixed-size vector;
        # unknown characters map to len(CHAR_SET).
        word_char_indices = [CHAR_SET.index(ch) if ch in CHAR_SET else len(CHAR_SET)
                             for ch in word][:max_word_len]
        char_indices[-min(len(word), max_word_len):] = word_char_indices
        word_char_vectors.append(char_indices)
        # Grammeme vector of the word: sum all possible parses element-wise.
        for parse in morph.parse(word):
            pos, gram = convert_from_opencorpora_tag(to_ud, parse.tag, word)
            gram = process_gram_tag(gram)
            gram_value_indices += np.array(grammeme_vectorizer.get_vector(pos + "#" + gram))
        # Normalize within each grammatical category separately
        # (guard against empty categories to avoid division by zero).
        sorted_grammemes = sorted(grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
        index = 0
        for category, values in sorted_grammemes:
            mask = gram_value_indices[index:index + len(values)]
            s = sum(mask)
            if s != 0:
                gram_value_indices[index:index + len(values)] = mask / s
            index += len(values)
        word_gram_vectors.append(gram_value_indices)
    return word_gram_vectors, word_char_vectors
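# The per-category renormalization step of get_sample in isolation: after all
# parses are summed, each category's slice is rescaled to a probability
# distribution. Category names and sizes here are toy values; the zero-sum
# guard mirrors the one above.
import numpy as np

categories = {"Case": 3, "Number": 2}
vec = np.array([2.0, 1.0, 1.0, 3.0, 1.0])  # summed counts over all parses
index = 0
for category, size in sorted(categories.items()):
    block = vec[index:index + size]
    s = block.sum()
    if s != 0:
        vec[index:index + size] = block / s
    index += size
print(vec)  # [0.5  0.25 0.25 0.75 0.25]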
def convert_from_conllu(input_filename, output_filename, with_forth_column=False, with_punct=True):
    with open(input_filename, "r") as r:
        with open(output_filename, "w") as w:
            for line in r:
                if line[0] == "#" or line[0] == "=":
                    continue
                if line == "\n":
                    w.write("\n")
                    continue
                records = line.split("\t")
                pos = records[3]
                if with_forth_column:
                    gram = records[5]
                else:
                    gram = records[4]
                gram = process_gram_tag(gram)
                if pos == "PUNCT" and not with_punct:
                    continue
                w.write("\t".join([records[1], records[2].lower(), pos, gram]) + "\n")
def __process_line(self, line: str):
    """
    Process one line of a morphologically annotated corpus.
    :param line: tab-separated line: word, lemma, POS tag, grammemes.
    """
    text, lemma, pos_tag, grammemes = line.strip().split("\t")[0:4]
    # Fill the word vocabulary.
    self.word_vocabulary.add_word(text.lower())
    # Fill the character set.
    self.char_set |= set(text)
    # Collect the set of possible output tags.
    self.grammeme_vectorizer_output.add_grammemes(pos_tag, grammemes)
    # Collect the set of possible input tags.
    if self.language == "ru":
        for parse in self.morph.parse(text):
            pos, gram = convert_from_opencorpora_tag(self.converter, parse.tag, text)
            gram = process_gram_tag(gram)
            self.grammeme_vectorizer_input.add_grammemes(pos, gram)
    elif self.language == "en":
        _, tags = zip(*nltk.pos_tag([text], tagset='universal'))
        pos = tags[0]
        self.grammeme_vectorizer_input.add_grammemes(pos, "_")
def __get_lemma(self, word: str, pos_tag: str, gram: str,
                enable_gikrya_normalization: bool = True):
    """
    Get the lemma.
    :param word: the word.
    :param pos_tag: part of speech.
    :param gram: grammatical value.
    :param enable_gikrya_normalization: whether to normalize as in the GIKRYA corpus.
    :return: the lemma.
    """
    if '_' in word:
        return word
    to_ud = converters.converter('opencorpora-int', 'ud14')
    word_forms = self.morph.parse(word)
    # Pick the parse whose grammemes overlap most with the target tag.
    guess = None
    max_common_tags = 0
    for word_form in word_forms:
        word_form_pos_tag, word_form_gram = convert_from_opencorpora_tag(
            to_ud, word_form.tag, word)
        word_form_gram = process_gram_tag(word_form_gram)
        common_tags_len = len(set(word_form_gram.split("|")) & set(gram.split("|")))
        if common_tags_len > max_common_tags and word_form_pos_tag == pos_tag:
            max_common_tags = common_tags_len
            guess = word_form
    if guess is None:
        guess = word_forms[0]
    if enable_gikrya_normalization:
        lemma = self.__normalize_for_gikrya(guess)
    else:
        lemma = guess.normal_form
    return lemma
def get_index_by_name(self, name):
    pos, gram = name.split("#")[0:2]
    gram = process_gram_tag(gram)
    return self.name_to_index[pos + "#" + gram]
def get_sample(sentence: List[str], language: str, converter, morph: MorphAnalyzer,
               grammeme_vectorizer: GrammemeVectorizer, max_word_len: int,
               word_vocabulary: WordVocabulary, word_count: int, char_set: str):
    """
    Build features for a single sentence.
    :param sentence: the sentence.
    :param language: language.
    :param converter: tag converter to UD.
    :param morph: morphological analyzer.
    :param grammeme_vectorizer: grammeme dictionary.
    :param max_word_len: number of characters of a word to process.
    :param word_vocabulary: word list.
    :param word_count: maximal word index.
    :param char_set: characters for which embeddings exist.
    :return: word indices, grammeme vectors, character indices.
    """
    word_char_vectors = []
    word_gram_vectors = []
    word_indices = []
    for word in sentence:
        char_indices = np.zeros(max_word_len)
        gram_value_indices = np.zeros(grammeme_vectorizer.grammemes_count())
        # Indices of the last max_word_len characters, right-aligned;
        # unknown characters map to len(char_set).
        word_char_indices = [char_set.index(ch) if ch in char_set else len(char_set)
                             for ch in word][-max_word_len:]
        char_indices[-min(len(word), max_word_len):] = word_char_indices
        word_char_vectors.append(char_indices)
        # Word index; out-of-vocabulary words get the index word_count.
        word_index = word_vocabulary.word_to_index[word.lower()] \
            if word_vocabulary.has_word(word) else word_count
        word_index = min(word_index, word_count)
        word_indices.append(word_index)
        # Grammeme vector of the word.
        if language == "ru":
            # Sum all possible parses element-wise.
            for parse in morph.parse(word):
                pos, gram = convert_from_opencorpora_tag(converter, parse.tag, word)
                gram = process_gram_tag(gram)
                gram_value_indices += np.array(grammeme_vectorizer.get_vector(pos + "#" + gram))
        elif language == "en":
            _, tags = zip(*nltk.pos_tag([word], tagset='universal'))
            pos = tags[0]
            gram_value_indices += np.array(grammeme_vectorizer.get_vector(pos + "#_"))
        # Normalize within each grammatical category separately
        # (guard against empty categories to avoid division by zero).
        sorted_grammemes = sorted(grammeme_vectorizer.all_grammemes.items(), key=lambda x: x[0])
        index = 0
        for category, values in sorted_grammemes:
            mask = gram_value_indices[index:index + len(values)]
            s = sum(mask)
            if s != 0:
                gram_value_indices[index:index + len(values)] = mask / s
            index += len(values)
        word_gram_vectors.append(gram_value_indices)
    return word_indices, word_gram_vectors, word_char_vectors
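# The character-index layout used by both get_sample variants, shown in
# isolation; char_set, word and max_word_len below are toy values. Indices are
# right-aligned in a fixed-size vector, so short words are zero-padded on the
# left and unknown characters map to len(char_set).
import numpy as np

char_set = "амш"
max_word_len = 6
word = "мама"
indices = [char_set.index(ch) if ch in char_set else len(char_set)
           for ch in word][-max_word_len:]
char_vector = np.zeros(max_word_len)
char_vector[-min(len(word), max_word_len):] = indices
print(char_vector)  # [0. 0. 1. 0. 1. 0.]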