Exemplo n.º 1
0
    def __init__(self,
                 dump_filename: str,
                 markup_path: str = None,
                 from_voc: bool = False) -> None:
        """
        Set up the word <-> index mappings and fill them.

        If ``dump_filename`` is an existing file, ``self.load()`` is called and
        nothing else is read. Otherwise, when ``markup_path`` is given, the
        vocabulary is filled from it and ``self.save()`` is called.

        :param dump_filename: file the vocabulary is saved to.
        :param markup_path: file/directory with markups.
        :param from_voc: if true, ``markup_path`` is read with
            ``Reader.read_vocabulary``; otherwise it is read as processed
            XML markups with ``Reader.read_markups``.
        """
        self.dump_filename = dump_filename
        self.word_to_index = {}  # type: Dict[StressedWord, int]
        self.index_to_word = {}  # type: Dict[int, StressedWord]

        # An existing dump takes priority over any markup source.
        if os.path.isfile(self.dump_filename):
            self.load()
            return

        # Neither a dump nor a source: leave the vocabulary empty.
        if markup_path is None:
            return

        if from_voc:
            for entry, position in Reader.read_vocabulary(markup_path):
                self.add_word(entry.to_stressed_word(), position)
        else:
            processed = Reader.read_markups(markup_path,
                                            FileType.XML,
                                            is_processed=True)
            for item in processed:
                self.add_markup(item)
        self.save()
Exemplo n.º 2
0
 def parse(self, markup_path: str, from_voc: bool = False):
     """
     Fill the vocabulary from ``markup_path``.

     :param markup_path: file/directory to read from.
     :param from_voc: if true, the path is read with
         ``Reader.read_vocabulary`` and each word is added together with
         its index; otherwise it is read as processed XML markups and each
         markup is added with ``self.add_markup``.
     """
     if not from_voc:
         processed = Reader.read_markups(markup_path,
                                         FileType.XML,
                                         is_processed=True)
         for item in processed:
             self.add_markup(item)
         return

     for entry, position in Reader.read_vocabulary(markup_path):
         self.add_word(entry.to_stressed_word(), position)
Exemplo n.º 3
0
    def __init__(self,
                 dump_filename: str,
                 vocabulary: StressVocabulary,
                 markup_dump_path: str = None,
                 n_poems: int = None,
                 n_grams: int = 2):
        """
        Set up the transition counters and fill them.

        If ``dump_filename`` is an existing file, ``self.load()`` is called.
        Otherwise processed XML markups are read from ``markup_dump_path``,
        each one is passed to ``self.add_markup``, and ``self.save()`` is
        called afterwards.

        :param dump_filename: path of the model dump.
        :param vocabulary: vocabulary stored on the instance.
        :param markup_dump_path: path with processed XML markups; read only
            when no dump file exists.
        :param n_poems: stop after this many markups; ``None`` reads all.
        :param n_grams: stored as ``self.n_grams``.
        """
        self.n_grams = n_grams
        self.transitions = defaultdict(Counter)  # type: Dict[Tuple, Counter]
        self.vocabulary = vocabulary
        self.dump_filename = dump_filename

        # The model is dumped to speed up subsequent loading.
        if os.path.isfile(self.dump_filename):
            self.load()
            return

        processed = Reader.read_markups(markup_dump_path,
                                        FileType.XML,
                                        is_processed=True)
        for count, item in enumerate(processed, start=1):
            self.add_markup(item)
            # The limit check comes before the progress print, so hitting
            # the limit on a multiple of 500 prints nothing.
            if n_poems is not None and count == n_poems:
                break
            if count % 500 == 0:
                print(count)
        self.save()
Exemplo n.º 4
0
    def test_read(self):
        """
        Check the first markup read from each example source.

        Covers processed XML, unprocessed XML (which needs the stress
        predictor) and processed JSON. Each reader is closed explicitly once
        its first markup has been checked, so it is not left open until
        garbage collection.
        """
        sources = (
            (MARKUP_XML_EXAMPLE, FileType.XML,
             {"is_processed": True}),
            (TEXT_XML_EXAMPLE, FileType.XML,
             {"is_processed": False,
              "stress_predictor": self.stress_predictor}),
            (MARKUP_JSON_EXAMPLE, FileType.JSON,
             {"is_processed": True}),
        )
        for path, file_type, options in sources:
            markups = Reader.read_markups(path, file_type, **options)
            try:
                self.__assert_markup_is_correct(next(markups))
            finally:
                # Only the first item is consumed; close the reader so the
                # rest is released even when the assertion fails.
                markups.close()
Exemplo n.º 5
0
    def test_write(self):
        """
        Round-trip a markup through Writer and Reader for XML and RAW files.

        The XML round trip must give back an equal markup; the RAW round trip
        must give back a ``Markup`` instance. The temporary files are removed
        and the readers closed even when a step fails.
        """
        markup = MARKUP_EXAMPLE

        def write_and_read_first(file_type, filename):
            # Write the markup to a temporary file and return the first
            # markup read back from it, always cleaning up afterwards.
            temp_file = os.path.join(EXAMPLES_DIR, filename)
            try:
                Writer.write_markups(file_type, [markup], temp_file)
                markups = Reader.read_markups(temp_file,
                                              file_type,
                                              is_processed=True)
                try:
                    return next(markups)
                finally:
                    # Close before the file is removed.
                    markups.close()
            finally:
                # The file may be missing if writing itself failed.
                if os.path.isfile(temp_file):
                    os.remove(temp_file)

        self.assertEqual(write_and_read_first(FileType.XML, "temp.xml"),
                         markup)
        self.assertIsInstance(write_and_read_first(FileType.RAW, "temp.txt"),
                              Markup)
Exemplo n.º 6
0
Arquivo: api.py Projeto: che1974/rupo
    def generate_markups(self, input_path: str, input_type: FileType,
                         output_path: str, output_type: FileType) -> None:
        """
        Generate markups from texts.

        :param input_path: path to the folder/file with the text.
        :param input_type: type of the text files.
        :param output_path: path to the file for the resulting markups.
        :param output_type: type of the resulting file.
        """
        markups = Reader.read_markups(input_path, input_type, False,
                                      self.get_stress_predictor())
        writer = Writer(output_type, output_path)
        writer.open()
        try:
            for markup in markups:
                writer.write_markup(markup)
        finally:
            # Close the writer even if reading or writing a markup fails.
            writer.close()