def test_char_map(self):
    """Check the size of the character vocabulary built from the fixture ID map.

    Reads the ID map stored at ``self.id_path``, converts it to characters
    with a ``SubwordMap`` and expects 46 entries in the resulting
    ``word_to_count``.
    """
    word_ids = IDMap()
    word_ids.read(self.id_path)

    special_tokens = [END_OF_SENTENCE_MARKER, UNKOWN_WORD]
    splitter = SubwordMap(special_list=special_tokens)
    char_ids = splitter.convert_to_characters(word_ids)

    char_vocab = char_ids.word_to_count.keys()
    self.assertEqual(len(char_vocab), 46)
def test_morph_map(self):
    """Check that morpheme segmentation yields a smaller vocabulary.

    Reads the ID map stored at ``self.id_path``, trains the morph parser on
    it, converts the map to morphemes and asserts that the original
    vocabulary is strictly larger than the morpheme vocabulary.  Both sizes
    are printed.
    """
    word_ids = IDMap()
    word_ids.read(self.id_path)

    size_before = len(word_ids.word_to_count.keys())
    print(size_before)

    special_tokens = [END_OF_SENTENCE_MARKER, UNKOWN_WORD]
    segmenter = SubwordMap(special_list=special_tokens)
    segmenter.train_morph_parser(word_ids)
    morph_ids = segmenter.convert_to_morphemes(word_ids)

    size_after = len(morph_ids.word_to_count.keys())
    print(size_after)

    self.assertGreater(size_before, size_after)
def convert_to_characters(self, id_map):
    """Split every word of ``id_map`` into characters and build a new IDMap.

    For each ID returned by ``id_map.get_IDs()`` the word is looked up in
    ``id_map.ID_to_word``.  Words listed in ``self.special_list`` are kept
    whole; every other word is split into its individual characters.  The
    resulting subword list is stored in ``self.word_to_subwords[word]`` and
    each subword occurrence is tallied in the new map's ``word_to_count``.

    Args:
        id_map: Word-level map exposing ``get_IDs()`` and ``ID_to_word``.

    Returns:
        A new ``IDMap`` constructed from ``self.special_list`` whose
        ``word_to_count``, ``ID_to_word`` and ``word_to_ID`` have been
        filled with the character-level vocabulary.
    """
    char_id_map = IDMap(self.special_list)
    counts = char_id_map.word_to_count

    for word_id in id_map.get_IDs():
        word = id_map.ID_to_word[word_id]
        # Special tokens are never split; everything else becomes a list
        # of single characters.
        subwords = [word] if word in self.special_list else list(word)
        self.word_to_subwords[word] = subwords
        for subword in subwords:
            # The first occurrence counts as 1, matching
            # convert_to_morphemes (starting at 0 under-counts by one).
            counts[subword] = counts.get(subword, 0) + 1

    # Assign provisional IDs in iteration order of word_to_count, then let
    # the map renumber itself (presumably by frequency, going by the
    # method name -- confirm against IDMap).
    for new_id, subword in enumerate(counts):
        char_id_map.ID_to_word[new_id] = subword
        char_id_map.word_to_ID[subword] = new_id
    char_id_map.re_ID_by_Freq()
    return char_id_map
def test_read_write(self):
    """Check that a SubwordMap survives a write/read round trip.

    Trains a morph parser on the fixture ID map, records the morpheme
    vocabulary size, writes the subword map and the morph model under
    ``self.write_out``, loads both into a fresh ``SubwordMap`` and checks
    that segmenting the same ID map with the reloaded object yields a
    vocabulary of the same size.
    """
    id_map = IDMap()
    id_map.read(self.id_path)

    morph_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    morph_map.train_morph_parser(id_map)
    morph_id_map = morph_map.convert_to_morphemes(id_map)
    old_vocab_size = len(morph_id_map.word_to_count.keys())

    morph_map.write(self.write_out + 'morph_map.txt')
    morph_map.write_morph_model(self.write_out + 'morph_model.model')

    reloaded_map = SubwordMap(special_list=[END_OF_SENTENCE_MARKER, UNKOWN_WORD])
    reloaded_map.read(self.write_out + 'morph_map.txt')
    reloaded_map.read_morph_model(self.write_out + 'morph_model.model')

    # Measure the vocabulary produced by the *reloaded* object.  Reusing
    # morph_id_map here would compare a value with itself and could never
    # fail.
    # NOTE(review): assumes read_morph_model() installs the loaded model on
    # the instance so convert_to_morphemes() uses it -- confirm.
    reloaded_id_map = reloaded_map.convert_to_morphemes(id_map)
    new_vocab_size = len(reloaded_id_map.word_to_count.keys())

    self.assertEqual(old_vocab_size, new_vocab_size)
def convert_to_morphemes(self, id_map, morph_model=None, train_params=None):
    """Segment every word of ``id_map`` into morphemes and build a new IDMap.

    For each ID returned by ``id_map.get_IDs()`` the word is looked up in
    ``id_map.ID_to_word``.  Words listed in ``self.special_list`` are kept
    whole; every other word is segmented with
    ``self.model.viterbi_segment(word)[0]``.  The resulting subword list is
    stored in ``self.word_to_subwords[word]`` and each subword occurrence
    is tallied in the new map's ``word_to_count``.

    Args:
        id_map: Word-level map exposing ``get_IDs()`` and ``ID_to_word``.
        morph_model: Optional segmentation model.  When given it replaces
            ``self.model``.
        train_params: Parameters forwarded to ``train_morph_parser`` when
            no model is available.  ``None`` selects
            ``{'count_func': 'log'}``; a fresh dict is built on every call
            so the default is never shared between calls.

    Returns:
        A new ``IDMap`` constructed from ``self.special_list`` whose
        ``word_to_count``, ``ID_to_word`` and ``word_to_ID`` have been
        filled with the morpheme-level vocabulary.
    """
    if train_params is None:
        train_params = {'count_func': 'log'}

    morph_id_map = IDMap(self.special_list)
    counts = morph_id_map.word_to_count

    if morph_model is not None:
        self.model = morph_model
    # Train a model only when none was supplied or stored earlier.
    # NOTE(review): relies on train_morph_parser() returning the trained
    # model -- confirm against its definition.
    if self.model is None:
        self.model = self.train_morph_parser(id_map, train_params)

    for word_id in id_map.get_IDs():
        word = id_map.ID_to_word[word_id]
        if word in self.special_list:
            # Special tokens are never segmented.
            subwords = [word]
        else:
            # Only the first element of the returned value (the
            # segmentation) is used.
            subwords = self.model.viterbi_segment(word)[0]
        self.word_to_subwords[word] = subwords
        for subword in subwords:
            counts[subword] = counts.get(subword, 0) + 1

    # Assign provisional IDs in iteration order of word_to_count, then let
    # the map renumber itself (presumably by frequency, going by the
    # method name -- confirm against IDMap).
    for new_id, subword in enumerate(counts):
        morph_id_map.ID_to_word[new_id] = subword
        morph_id_map.word_to_ID[subword] = new_id
    morph_id_map.re_ID_by_Freq()
    return morph_id_map