def test_char_map(self):
        """Character conversion of the fixture ID map yields 46 count entries.

        Reads the ID map stored at ``self.id_path``, converts it with
        ``SubwordMap.convert_to_characters`` and checks the number of keys
        in the resulting ``word_to_count`` table.
        """
        source_map = IDMap()
        source_map.read(self.id_path)

        specials = [END_OF_SENTENCE_MARKER, UNKOWN_WORD]
        splitter = SubwordMap(special_list=specials)
        char_counts = splitter.convert_to_characters(source_map).word_to_count

        self.assertEqual(len(char_counts), 46)
    def test_morph_map(self):
        """Morpheme conversion must produce fewer entries than the word vocabulary.

        Both vocabulary sizes are printed (word-level first, morpheme-level
        second) before the comparison is asserted.
        """
        source_map = IDMap()
        source_map.read(self.id_path)

        # Size of the word-level vocabulary before segmentation.
        original_vocab_size = len(source_map.word_to_count)
        print(original_vocab_size)

        specials = [END_OF_SENTENCE_MARKER, UNKOWN_WORD]
        segmenter = SubwordMap(special_list=specials)
        segmenter.train_morph_parser(source_map)
        morph_counts = segmenter.convert_to_morphemes(source_map).word_to_count

        # Size of the morpheme-level vocabulary after segmentation.
        new_vocab_size = len(morph_counts)
        print(new_vocab_size)

        self.assertGreater(original_vocab_size, new_vocab_size)
    def convert_to_characters(self, id_map):
        """Split every word of ``id_map`` into characters and count them.

        For each ID returned by ``id_map.get_IDs()`` the corresponding word
        is looked up in ``id_map.ID_to_word``. Words listed in
        ``self.special_list`` are kept whole; every other word is split into
        its individual characters. The resulting segmentation is recorded in
        ``self.word_to_subwords``.

        Args:
            id_map: Word-level map exposing ``get_IDs()`` and ``ID_to_word``.

        Returns:
            A new ``IDMap`` (constructed with ``self.special_list``) whose
            ``word_to_count`` holds the number of times each character or
            special token was seen, with IDs assigned and then reordered by
            ``re_ID_by_Freq()``.
        """
        char_id_map = IDMap(self.special_list)
        counts = char_id_map.word_to_count

        for word_id in id_map.get_IDs():
            word = id_map.ID_to_word[word_id]
            # Special tokens stay atomic; ordinary words become characters.
            subwords = [word] if word in self.special_list else list(word)
            self.word_to_subwords[word] = subwords
            for subword in subwords:
                # The first sighting counts as 1 (it was previously stored
                # as 0, leaving every count one too low and inconsistent
                # with convert_to_morphemes).
                counts[subword] = counts.get(subword, 0) + 1

        # set IDs
        for new_id, subword in enumerate(counts):
            char_id_map.ID_to_word[new_id] = subword
            char_id_map.word_to_ID[subword] = new_id
        char_id_map.re_ID_by_Freq()

        return char_id_map
    def test_read_write(self):
        """A SubwordMap written to disk and read back gives the same vocabulary size.

        The morpheme vocabulary is built once from a freshly trained map,
        the map and its model are written under ``self.write_out``, and a
        second ``SubwordMap`` is populated from those files. The vocabulary
        is then rebuilt from the reloaded map so that the assertion actually
        exercises the read path (previously the size was recomputed from the
        pre-write ID map, making the comparison a tautology).
        """
        id_map = IDMap()
        id_map.read(self.id_path)

        morph_map = SubwordMap(special_list = [END_OF_SENTENCE_MARKER, UNKOWN_WORD])
        morph_map.train_morph_parser(id_map)
        morph_id_map = morph_map.convert_to_morphemes(id_map)
        old_vocab_size = len(morph_id_map.word_to_count)

        morph_map.write(self.write_out+'morph_map.txt')
        morph_map.write_morph_model(self.write_out+'morph_model.model')

        reloaded_map = SubwordMap(special_list = [END_OF_SENTENCE_MARKER, UNKOWN_WORD])
        reloaded_map.read(self.write_out+'morph_map.txt')
        reloaded_map.read_morph_model(self.write_out+'morph_model.model')
        # NOTE(review): assumes read_morph_model() installs the loaded model
        # on the instance; otherwise convert_to_morphemes() would retrain
        # instead of using the persisted model -- confirm.
        reloaded_id_map = reloaded_map.convert_to_morphemes(id_map)
        new_vocab_size = len(reloaded_id_map.word_to_count)

        self.assertEqual(old_vocab_size, new_vocab_size)
    def convert_to_morphemes(self, id_map, morph_model = None, train_params = None):
        """Segment every word of ``id_map`` with the morph model and count the pieces.

        For each ID returned by ``id_map.get_IDs()`` the corresponding word
        is looked up in ``id_map.ID_to_word``. Words listed in
        ``self.special_list`` are kept whole; every other word is segmented
        with ``self.model.viterbi_segment(word)[0]``. The resulting
        segmentation is recorded in ``self.word_to_subwords``.

        Args:
            id_map: Word-level map exposing ``get_IDs()`` and ``ID_to_word``.
            morph_model: Optional model to use; when given it replaces
                ``self.model``.
            train_params: Parameters forwarded to ``train_morph_parser`` when
                no model is available. Defaults to ``{'count_func': 'log'}``;
                a fresh dict is created per call so the default is never
                shared between calls.

        Returns:
            A new ``IDMap`` (constructed with ``self.special_list``) whose
            ``word_to_count`` holds the number of times each subword or
            special token was seen, with IDs assigned and then reordered by
            ``re_ID_by_Freq()``.
        """
        if train_params is None:
            train_params = {'count_func': 'log'}

        morph_id_map = IDMap(self.special_list)
        counts = morph_id_map.word_to_count

        if morph_model is not None:
            self.model = morph_model

        # trains the model if none is already given
        if self.model is None:
            trained = self.train_morph_parser(id_map, train_params)
            # NOTE(review): callers elsewhere use train_morph_parser() purely
            # for its side effect, so it may return nothing; only overwrite
            # self.model when a model is actually returned -- confirm.
            if trained is not None:
                self.model = trained

        for word_id in id_map.get_IDs():
            word = id_map.ID_to_word[word_id]
            if word in self.special_list:
                # Special tokens stay atomic.
                subwords = [word]
            else:
                # viterbi_segment returns a sequence whose first element is
                # the list of segments.
                subwords = self.model.viterbi_segment(word)[0]
            self.word_to_subwords[word] = subwords
            for subword in subwords:
                counts[subword] = counts.get(subword, 0) + 1

        # set IDs
        for new_id, subword in enumerate(counts):
            morph_id_map.ID_to_word[new_id] = subword
            morph_id_map.word_to_ID[subword] = new_id
        morph_id_map.re_ID_by_Freq()

        return morph_id_map