Example #1
 def __init__(self):
     self.grammeme_vectorizer = GrammemeVectorizer()
     self.word_form_vocabulary = WordFormVocabulary()
     self.lemma_to_word_forms = defaultdict(set)  # type: Dict[str, Set[WordForm]]
     self.lemma_case = {}
     self.lemma_counter = Counter()  # type: Counter
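
The constructor above accumulates per-lemma state: a defaultdict(set) that collects the distinct word forms of each lemma, and a Counter that tracks lemma frequencies. A minimal standalone sketch of that accumulation pattern, with hypothetical data and plain tuples standing in for WordForm:

from collections import Counter, defaultdict

lemma_to_word_forms = defaultdict(set)  # lemma -> set of (gram_index, text) pairs
lemma_counter = Counter()               # lemma -> corpus frequency

# Hypothetical annotated tokens: (text, lemma, gram_vector_index).
tokens = [("коты", "кот_NOUN", 3), ("кота", "кот_NOUN", 5), ("коты", "кот_NOUN", 3)]
for text, lemma, gram_index in tokens:
    lemma_to_word_forms[lemma].add((gram_index, text))  # sets deduplicate repeated forms
    lemma_counter[lemma] += 1                           # counts every occurrence

print(len(lemma_to_word_forms["кот_NOUN"]))  # 2 distinct forms
print(lemma_counter.most_common(1))          # [('кот_NOUN', 3)]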
Example #2
 def prepare(
         self,
         filenames: List[str] = list(),
         word_form_vocab_dump_path: str = GENERATOR_WORD_FORM_VOCAB_PATH,
         gram_dump_path: str = GENERATOR_GRAM_VECTORS) -> None:
     """
     Подготовка векторизатора грамматических значений и словаря словоформ по корпусу.
     
     :param filenames: имена файлов с морфоразметкой.
     :param word_form_vocab_dump_path: путь к дампу словаря словоформ.
     :param gram_dump_path: путь к векторам грамматических значений.
     """
     self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
     self.word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
     if self.grammeme_vectorizer.is_empty() or self.word_form_vocabulary.is_empty():
         loader = CorporaInformationLoader()
         self.word_form_vocabulary, self.grammeme_vectorizer = loader.parse_corpora(filenames)
         self.grammeme_vectorizer.save()
         self.word_form_vocabulary.save()
     if self.recalculate_softmax:
         self.softmax_size = self.word_form_vocabulary.get_softmax_size_by_lemma_size(self.embedding_size)
         print("Recalculated softmax: ", self.softmax_size)
Example #3
File: loader.py Project: che1974/rupo
class Loader(object):
    """
    Класс для построения GrammemeVectorizer и WordFormVocabulary по корпусу
    """
    def __init__(self, gram_dump_path, word_dump_path):
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_vocabulary = WordVocabulary(word_dump_path)
        self.morph = pymorphy2.MorphAnalyzer()

    def parse_corpora(self, filenames: List[str]) -> Tuple[GrammemeVectorizer, WordVocabulary]:
        """
        Построить WordFormVocabulary, GrammemeVectorizer по корпусу

        :param filenames: пути к файлам корпуса.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.grammeme_vectorizer.init_possible_vectors()
        return self.grammeme_vectorizer, self.word_vocabulary

    def __process_line(self, line: str) -> None:
        text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
        self.word_vocabulary.add_word(text)
        self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
        # Converter from OpenCorpora internal tags to Universal Dependencies 1.4.
        to_ud = converters.converter('opencorpora-int', 'ud14')
        for parse in self.morph.parse(text):
            ud_tag = to_ud(str(parse.tag), text)
            pos = ud_tag.split()[0]
            gram = ud_tag.split()[1].split("|")
            # Drop grammatical categories that are not modelled.
            dropped = ["Animacy", "Aspect", "NumType"]
            gram = [grammem for grammem in gram
                    if not any(drop in grammem for drop in dropped)]
            gram = "|".join(gram)
            self.grammeme_vectorizer.add_grammemes(pos, gram)
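
A hedged usage sketch of this Loader, assuming pymorphy2 and the russian-tagsets converters are installed; the dump and corpus paths are hypothetical. Note that the return order matches the annotation: GrammemeVectorizer first, then WordVocabulary.

loader = Loader("dumps/gram_vectors.json", "dumps/word_vocab.pickle")
grammeme_vectorizer, word_vocabulary = loader.parse_corpora(["corpus/train_annotated.txt"])
print(grammeme_vectorizer.grammemes_count())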
Example #4
File: loader.py Project: che1974/rupo
 def __init__(self, gram_dump_path, word_dump_path):
     self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
     self.word_vocabulary = WordVocabulary(word_dump_path)
     self.morph = pymorphy2.MorphAnalyzer()
Example #5
class CorporaInformationLoader(object):
    """
    Класс для построения GrammemeVectorizer и WordFormVocabulary по корпусу
    """
    def __init__(self):
        self.grammeme_vectorizer = GrammemeVectorizer()
        self.word_form_vocabulary = WordFormVocabulary()
        self.lemma_to_word_forms = defaultdict(set)  # type: Dict[str, Set[WordForm]]
        self.lemma_case = {}
        self.lemma_counter = Counter()  # type: Counter

    def parse_corpora(self, filenames: List[str]) -> Tuple[WordFormVocabulary, GrammemeVectorizer]:
        """
        Построить WordFormVocabulary, GrammemeVectorizer по корпусу

        :param filenames: пути к файлам корпуса.
        """
        for filename in filenames:
            with tqdm_open(filename, encoding="utf-8") as f:
                for line in f:
                    if line == "\n":
                        continue
                    self.__process_line(line)

        self.__add_seq_end()
        self.grammeme_vectorizer.init_possible_vectors()
        self.word_form_vocabulary.init_by_vocabulary(self.lemma_counter,
                                                     self.lemma_to_word_forms,
                                                     self.lemma_case)
        self.word_form_vocabulary.lemma_indices[SEQ_END_WF] = 1
        return self.word_form_vocabulary, self.grammeme_vectorizer

    def __add_seq_end(self):
        self.lemma_to_word_forms[SEQ_END].add(SEQ_END_WF)
        self.lemma_case[SEQ_END] = SEQ_END_WF.case
        self.lemma_counter[SEQ_END] = sys.maxsize

    def __process_line(self, line: str) -> None:
        try:
            text, lemma, pos_tag, grammemes = line.strip().split("\t")[:4]
            lemma = lemma.lower() + '_' + pos_tag
            gram_vector_index = self.grammeme_vectorizer.add_grammemes(pos_tag, grammemes)
            self.lemma_to_word_forms[lemma].add(WordForm(lemma, gram_vector_index, text.lower()))
            self.lemma_counter[lemma] += 1
            self.__update_lemma_case(lemma, text)
        except ValueError:
            # Skip malformed lines without at least four tab-separated fields.
            pass

    def __update_lemma_case(self, lemma: str, text: str) -> None:
        if lemma not in self.lemma_case:
            # First occurrence: classify by the surface form.
            if text.isupper():
                self.lemma_case[lemma] = LemmaCase.UPPER_CASE
            elif text[0].isupper():
                self.lemma_case[lemma] = LemmaCase.PROPER_CASE
            else:
                self.lemma_case[lemma] = LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.UPPER_CASE and not text.isupper():
            # A non-all-caps occurrence rules out UPPER_CASE.
            self.lemma_case[lemma] = LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif self.lemma_case[lemma] == LemmaCase.PROPER_CASE and not text[0].isupper():
            # A lowercase-initial occurrence rules out PROPER_CASE.
            self.lemma_case[lemma] = LemmaCase.NORMAL_CASE
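
A standalone sketch of the case-tracking logic in __update_lemma_case, with a stand-in LemmaCase enum (the project defines the real one elsewhere). A lemma starts in the strictest case class its first occurrence allows and is only ever downgraded:

from enum import Enum

class LemmaCase(Enum):
    UPPER_CASE = 0   # every occurrence is all-caps, e.g. "НАТО"
    PROPER_CASE = 1  # every occurrence starts with a capital, e.g. "Москва"
    NORMAL_CASE = 2  # at least one fully lowercase occurrence

def classify(texts):
    case = None
    for text in texts:
        if case is None:
            case = (LemmaCase.UPPER_CASE if text.isupper()
                    else LemmaCase.PROPER_CASE if text[0].isupper()
                    else LemmaCase.NORMAL_CASE)
        elif case == LemmaCase.UPPER_CASE and not text.isupper():
            case = LemmaCase.PROPER_CASE if text[0].isupper() else LemmaCase.NORMAL_CASE
        elif case == LemmaCase.PROPER_CASE and not text[0].isupper():
            case = LemmaCase.NORMAL_CASE
    return case

print(classify(["НАТО", "НАТО"]))      # LemmaCase.UPPER_CASE
print(classify(["Москва", "МОСКВА"]))  # LemmaCase.PROPER_CASE
print(classify(["Дом", "дом"]))        # LemmaCase.NORMAL_CASE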
Example #6
File: lstm.py Project: DanAnastasyev/rupo
class LSTMGenerator:
    """
    Языковая модель на основе двухуровневой LSTM RNN.
    """
    def __init__(self,
                 embedding_size: int = 30000,
                 softmax_size: int = 60000,
                 external_batch_size: int = 10000,
                 nn_batch_size: int = 768,
                 sentence_maxlen: int = 10,
                 lstm_units=368,
                 embeddings_dimension: int = 150,
                 grammeme_dense_units: List[int] = [35, 15],
                 dense_units: int = 256):
        """
        :param embeddings_size: размер входного слоя (=размер словаря)
        :param softmax_size: размер выхода softmax-слоя (=размер итогового набора вероятностей)
        :param external_batch_size: размер набора семплов для BatchGenerator'а.
        :param nn_batch_size: размер набора семплов для обучения.
        :param sentence_maxlen: маскимальная длина куска предложения.
        """
        self.embedding_size = embedding_size  # type: int
        self.softmax_size = softmax_size  # type: int
        self.external_batch_size = external_batch_size  # type: int
        self.nn_batch_size = nn_batch_size  # type: int
        self.sentence_maxlen = sentence_maxlen  # type: int
        self.word_form_vocabulary = None  # type: WordFormVocabulary
        self.grammeme_vectorizer = None  # type: GrammemeVectorizer
        self.lstm_units = lstm_units  # type: int
        self.embeddings_dimension = embeddings_dimension  # type: int
        self.grammeme_dense_units = grammeme_dense_units  # type: List[int]
        self.dense_units = dense_units  # type: int
        self.model = None  # type: Model

    def prepare(
            self,
            filenames: List[str] = list(),
            word_form_vocab_dump_path: str = GENERATOR_WORD_FORM_VOCAB_PATH,
            gram_dump_path: str = GENERATOR_GRAM_VECTORS) -> None:
        """
        Подготовка векторизатора грамматических значений и словаря словоформ по корпусу.
        
        :param filenames: имена файлов с морфоразметкой.
        :param word_form_vocab_dump_path: путь к дампу словаря словоформ.
        :param gram_dump_path: путь к векторам грамматических значений.
        """
        self.grammeme_vectorizer = GrammemeVectorizer(gram_dump_path)
        self.word_form_vocabulary = WordFormVocabulary(word_form_vocab_dump_path)
        if self.grammeme_vectorizer.is_empty() or self.word_form_vocabulary.is_empty():
            loader = CorporaInformationLoader()
            self.word_form_vocabulary, self.grammeme_vectorizer = loader.parse_corpora(filenames)
            self.grammeme_vectorizer.save()
            self.word_form_vocabulary.save()

    def load(self, model_filename: str) -> None:
        """
        Загрузка модели.
        
        :param model_filename: файл с моделью.
        """
        self.model = load_model(model_filename)

    def load_with_weights(self, json_filename: str,
                          weights_filename: str) -> None:
        """
        Загрузка модели из json описания и файла с весами.
        
        :param json_filename: json описание.
        :param weights_filename: файл с весам.
        """
        json_string = open(json_filename, 'r', encoding='utf8').readline()
        self.model = model_from_json(json_string)
        self.model.load_weights(weights_filename)

    def build(self):
        """
        Описание модели.
        """
        # Lemma input.
        lemmas = Input(shape=(None,), name='lemmas')
        lemmas_embedding = Embedding(self.embedding_size + 1,
                                     self.embeddings_dimension,
                                     name='embeddings')(lemmas)
        lemmas_embedding = SpatialDropout1D(.3)(lemmas_embedding)

        # Grammeme input.
        grammemes_input = Input(
            shape=(None, self.grammeme_vectorizer.grammemes_count()),
            name='grammemes')
        grammemes_layer = Masking(mask_value=0.)(grammemes_input)
        for grammeme_dense_layer_units in self.grammeme_dense_units:
            grammemes_layer = Dense(grammeme_dense_layer_units,
                                    activation='relu')(grammemes_layer)

        # Merge(mode='concat') is Keras 1 API; Concatenate is the Keras 2 equivalent.
        layer = Concatenate(name='LSTM_input')([lemmas_embedding, grammemes_layer])
        layer = LSTM(self.lstm_units,
                     dropout=.2,
                     recurrent_dropout=.2,
                     return_sequences=True,
                     name='LSTM_1')(layer)
        layer = LSTM(self.lstm_units,
                     dropout=.2,
                     recurrent_dropout=.2,
                     return_sequences=False,
                     name='LSTM_2')(layer)

        layer = Dense(self.dense_units)(layer)
        layer = BatchNormalization()(layer)
        layer = Activation('relu')(layer)

        output = Dense(self.softmax_size + 1, activation='softmax')(layer)

        self.model = Model(inputs=[lemmas, grammemes_input], outputs=[output])
        self.model.compile(loss='sparse_categorical_crossentropy',
                           optimizer='adam')
        self.model.summary()  # summary() prints itself; wrapping it in print() would output None

    @staticmethod
    def __get_validation_data(batch_generator, size):
        """
        Берет первые size батчей и batch_generator для валидационной выборки
        """
        lemmas_list, grammemes_list, y_list = [], [], []
        for lemmas, grammemes, y in islice(batch_generator, size):
            lemmas_list.append(lemmas)
            grammemes_list.append(grammemes)
            y_list.append(y)
        return np.vstack(lemmas_list), np.vstack(grammemes_list), np.hstack(y_list)

    def train(self,
              filenames: List[str],
              validation_size: int = 5,
              validation_verbosity: int = 5,
              dump_model_freq: int = 10) -> None:
        """
        Обучение модели.
        
        :param filenames: имена файлов с морфоразметкой.
        """
        batch_generator = BatchGenerator(
            filenames,
            batch_size=self.external_batch_size,
            embedding_size=self.embedding_size,
            softmax_size=self.softmax_size,
            sentence_maxlen=self.sentence_maxlen,
            word_form_vocabulary=self.word_form_vocabulary,
            grammeme_vectorizer=self.grammeme_vectorizer)

        lemmas_val, grammemes_val, y_val = LSTMGenerator.__get_validation_data(batch_generator, validation_size)
        for big_epoch in range(0, 1000):
            print('------------Big Epoch {}------------'.format(big_epoch))
            for epoch, (lemmas, grammemes, y) in enumerate(batch_generator):
                if epoch < validation_size:
                    continue
                self.model.fit([lemmas, grammemes],
                               y,
                               batch_size=self.nn_batch_size,
                               epochs=1,
                               verbose=2)

                if epoch != 0 and epoch % validation_verbosity == 0:
                    print(
                        'val loss:',
                        self.model.evaluate([lemmas_val, grammemes_val],
                                            y_val,
                                            batch_size=self.nn_batch_size * 2,
                                            verbose=0))

                indices = [self.word_form_vocabulary.get_sequence_end_index(SEQ_END_WF)]
                for _ in range(10):
                    indices.append(self._sample(self.predict(indices)))
                sentence = [
                    self.word_form_vocabulary.get_word_form_by_index(index)
                    for index in indices
                ]
                print('Sentence', str(big_epoch), str(epoch), end=': ')
                for word in sentence[::-1]:
                    print(word.text, end=' ')
                print()

                if epoch != 0 and epoch % dump_model_freq == 0:
                    self.model.save(GENERATOR_LSTM_MODEL_PATH)

    def predict(self, word_indices: List[int]) -> np.ndarray:
        """
        Predict the probabilities of the next word.

        :param word_indices: indices of the preceding words.
        :return: language model projection (probabilities of the next word).
        """
        if len(word_indices) == 0:
            # No context yet: return a uniform distribution over the softmax vocabulary.
            return np.full(self.softmax_size, 1.0 / self.softmax_size, dtype=np.float64)
        cur_sent = [
            self.word_form_vocabulary.get_word_form_by_index(ind)
            for ind in word_indices
        ]
        x_lemmas = np.zeros((1, len(cur_sent)))
        x_grammemes = np.zeros((1, len(cur_sent), self.grammeme_vectorizer.grammemes_count()))
        for index, word in enumerate(cur_sent):
            x_lemmas[0, index] = self.word_form_vocabulary.get_word_form_index_min(word, self.softmax_size)
            x_grammemes[0, index] = self.grammeme_vectorizer.vectors[word.gram_vector_index]
        prob = self.model.predict([x_lemmas, x_grammemes], verbose=0)[0]
        return prob

    @staticmethod
    def _sample(prob: np.ndarray, temperature: float = 1.0) -> int:
        """
        Sample a word from a probability distribution with the given temperature (Boltzmann distribution).

        :param prob: probabilities.
        :param temperature: temperature.
        :return: index of the sampled word.
        """
        prob = prob[:-1]  # Exclude unknown words.
        prob = np.log(prob) / temperature
        prob = np.exp(prob) / np.sum(np.exp(prob))
        return np.random.choice(len(prob), p=prob)
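
A standalone sketch of the temperature trick used in _sample, on a toy distribution (illustrative values only). Temperatures below 1 sharpen the distribution towards the most likely index; temperatures above 1 flatten it:

import numpy as np

def sample(prob: np.ndarray, temperature: float = 1.0) -> int:
    prob = np.log(prob) / temperature           # rescale in log space
    prob = np.exp(prob) / np.sum(np.exp(prob))  # renormalize (softmax)
    return np.random.choice(len(prob), p=prob)

prob = np.array([0.7, 0.2, 0.1])
print([sample(prob, temperature=0.5) for _ in range(10)])  # mostly index 0
print([sample(prob, temperature=2.0) for _ in range(10)])  # more varied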