def convert_to_characters(self, id_map):
    """Split each word of ``id_map`` into characters and build a character IDMap.

    Every word that is not in ``self.special_list`` is decomposed into its
    individual characters; words in ``self.special_list`` are kept whole as a
    single subword. The decomposition of each word is stored in
    ``self.word_to_subwords``.

    Counts are accumulated once per entry returned by ``id_map.get_IDs()``
    (i.e. per vocabulary entry, not weighted by any corpus frequency).

    Args:
        id_map: Word-level map providing ``get_IDs()`` and ``ID_to_word``.

    Returns:
        A new ``IDMap`` built from ``self.special_list`` whose
        ``word_to_count``, ``ID_to_word`` and ``word_to_ID`` describe the
        subwords, after ``re_ID_by_Freq()`` has been applied to it.
    """
    char_id_map = IDMap(self.special_list)
    counts = char_id_map.word_to_count

    for word_id in id_map.get_IDs():
        word = id_map.ID_to_word[word_id]

        # Special tokens must survive as atomic units; everything else is
        # broken into single characters.
        if word in self.special_list:
            subwords = [word]
        else:
            subwords = list(word)
        self.word_to_subwords[word] = subwords

        for subword in subwords:
            if subword in counts:
                counts[subword] += 1
            else:
                # First occurrence counts as 1 (previously seeded with 0,
                # which left every count one too low and disagreed with
                # convert_to_morphemes).
                counts[subword] = 1

    # Assign provisional sequential IDs in iteration order of the count
    # table, then let the map renumber itself via re_ID_by_Freq().
    for subword_id, subword in enumerate(counts):
        char_id_map.ID_to_word[subword_id] = subword
        char_id_map.word_to_ID[subword] = subword_id
    char_id_map.re_ID_by_Freq()
    return char_id_map
def convert_to_morphemes(self, id_map, morph_model=None, train_params=None):
    """Segment each word of ``id_map`` into morphemes and build a morpheme IDMap.

    Every word that is not in ``self.special_list`` is segmented with
    ``self.model.viterbi_segment(word)[0]``; words in ``self.special_list``
    are kept whole as a single subword. The segmentation of each word is
    stored in ``self.word_to_subwords``.

    Counts are accumulated once per entry returned by ``id_map.get_IDs()``
    (i.e. per vocabulary entry, not weighted by any corpus frequency).

    Args:
        id_map: Word-level map providing ``get_IDs()`` and ``ID_to_word``.
        morph_model: Optional segmentation model. When given, it replaces
            ``self.model``.
        train_params: Parameters forwarded to ``self.train_morph_parser``
            when a model has to be trained. Defaults to
            ``{'count_func': 'log'}``.

    Returns:
        A new ``IDMap`` built from ``self.special_list`` whose
        ``word_to_count``, ``ID_to_word`` and ``word_to_ID`` describe the
        subwords, after ``re_ID_by_Freq()`` has been applied to it.
    """
    # Build the default per call: a dict literal in the signature would be
    # one shared object that any callee mutation would leak into later calls.
    if train_params is None:
        train_params = {'count_func': 'log'}

    morph_id_map = IDMap(self.special_list)
    counts = morph_id_map.word_to_count

    if morph_model is not None:
        self.model = morph_model
    # Train only when no model was passed in and none is already stored.
    if self.model is None:
        self.model = self.train_morph_parser(id_map, train_params)

    for word_id in id_map.get_IDs():
        word = id_map.ID_to_word[word_id]

        # Special tokens must survive as atomic units and are never handed
        # to the segmenter.
        if word in self.special_list:
            subwords = [word]
        else:
            # Only the first element of the segmenter's result is used as
            # the list of subwords.
            subwords = self.model.viterbi_segment(word)[0]
        self.word_to_subwords[word] = subwords

        for subword in subwords:
            if subword in counts:
                counts[subword] += 1
            else:
                counts[subword] = 1

    # Assign provisional sequential IDs in iteration order of the count
    # table, then let the map renumber itself via re_ID_by_Freq().
    for subword_id, subword in enumerate(counts):
        morph_id_map.ID_to_word[subword_id] = subword
        morph_id_map.word_to_ID[subword] = subword_id
    morph_id_map.re_ID_by_Freq()
    return morph_id_map