def train(self):
    """Train one HMM per initial HMM file and write each result to disk.

    For every file returned by ``self.get_initial_hmm_files()``:

    * the character code is parsed as the integer before the first ``"."``
      in the file's base name;
    * the HMM is loaded with ``MultivariateHmm.from_file``;
    * the sequence set is read from
      ``<TRAIN_FEATURES_ROOT>/<char_code>.sset``;
    * depending on ``self.TRAINING``, the HMM is trained with the Viterbi
      trainer, with ``self.Trainer()`` (reported as Baum-Welch), or with
      both in that order;
    * the HMM is written to ``<TRAIN_HMM_ROOT>/<char_code>.xml``.

    ``TRAIN_HMM_ROOT`` is created if it does not exist yet.

    Raises:
        ModelException: if no initial HMM files are found.
    """
    initial_hmm_files = self.get_initial_hmm_files()
    if not initial_hmm_files:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ModelException("No initial HMM files found.")

    if not os.path.exists(self.TRAIN_HMM_ROOT):
        os.makedirs(self.TRAIN_HMM_ROOT)

    trainer = self.Trainer()
    viterbi_trainer = ViterbiTrainer(self.ViterbiCalculator(),
                                     non_diagonal=self.NON_DIAGONAL)

    # The training mode does not change per file, so decide once up front.
    use_viterbi = self.TRAINING in (self.TRAINING_VITERBI,
                                    self.TRAINING_BOTH)
    use_baum_welch = self.TRAINING in (self.TRAINING_BAUM_WELCH,
                                       self.TRAINING_BOTH)

    # `hmm_file` rather than `file`, which shadows a builtin on Python 2.
    for hmm_file in initial_hmm_files:
        char_code = int(os.path.basename(hmm_file).split(".")[0])

        hmm = MultivariateHmm.from_file(hmm_file)

        sset_file = os.path.join(self.TRAIN_FEATURES_ROOT,
                                 str(char_code) + ".sset")
        sset = self.get_sequence_set(sset_file)

        output_file = os.path.join(self.TRAIN_HMM_ROOT,
                                   "%d.xml" % char_code)

        # When both modes are enabled, Viterbi training runs first and the
        # second trainer continues from the HMM it left behind.
        if use_viterbi:
            self.print_verbose("Viterbi training: " + output_file)
            viterbi_trainer.train(hmm, sset)

        if use_baum_welch:
            self.print_verbose("Baum-Welch training: " + output_file)
            trainer.train(hmm, sset)

        hmm.write(output_file)
def init(self):
    """Build an initial HMM for every training feature file.

    Calls ``self.load_char_dicts()``, then for each file returned by
    ``self.get_train_feature_files()`` loads its sequence set, tags it
    with the character code parsed from the file name, builds an HMM with
    ``self.get_initial_hmm`` and writes it to
    ``<INIT_HMM_ROOT>/<char_code>.xml``.

    ``INIT_HMM_ROOT`` is created if it does not exist yet.

    Raises:
        ModelException: if no feature files are found.
    """
    self.load_char_dicts()

    feature_files = self.get_train_feature_files()
    if not feature_files:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ModelException("No feature files found.")

    if not os.path.exists(self.INIT_HMM_ROOT):
        os.makedirs(self.INIT_HMM_ROOT)

    for sset_file in feature_files:
        # Drop the last five characters of the path before taking the
        # base name; presumably this strips a ".sset" extension -- confirm
        # against get_train_feature_files().
        char_code = int(os.path.basename(sset_file[:-5]))

        sset = self.get_sequence_set(sset_file)
        sset.char_code = char_code

        hmm = self.get_initial_hmm(sset)

        output_file = os.path.join(self.INIT_HMM_ROOT,
                                   "%d.xml" % char_code)
        self.print_verbose(output_file)
        hmm.write(output_file)
def init(self):
    """Build initial HMMs using the global average observations per character.

    Works in two passes over the files returned by
    ``self.get_train_feature_files()``:

    1. Load every sequence set, tag it with the character code parsed
       from its file name, and accumulate the two counts returned by
       ``self.get_n_observations`` (observations and characters).
    2. For each loaded sequence set, build an HMM with
       ``self.get_initial_hmm(sset, avg_n_obs_per_char)`` and write it to
       ``<INIT_HMM_ROOT>/<char_code>.xml``.

    ``INIT_HMM_ROOT`` is created if it does not exist yet. All sequence
    sets are kept in memory between the two passes.

    Raises:
        ModelException: if no feature files are found, or if the feature
            files contain no characters (the average would be undefined).
    """
    self.load_char_dicts()

    feature_files = self.get_train_feature_files()
    if not feature_files:
        # Call form of raise: valid on both Python 2 and Python 3.
        raise ModelException("No feature files found.")

    if not os.path.exists(self.INIT_HMM_ROOT):
        os.makedirs(self.INIT_HMM_ROOT)

    ssets = []

    # Calculate the average number of observations for all characters.
    n_observations = 0
    n_characters = 0

    for sset_file in feature_files:
        # Drop the last five characters of the path before taking the
        # base name; presumably this strips a ".sset" extension -- confirm
        # against get_train_feature_files().
        char_code = int(os.path.basename(sset_file[:-5]))

        sset = self.get_sequence_set(sset_file)
        sset.char_code = char_code
        ssets.append(sset)

        obs, chars = self.get_n_observations(sset)
        n_observations += obs
        n_characters += chars

    if n_characters == 0:
        # Report the problem with the file's own exception type instead of
        # letting the division below fail with a bare ZeroDivisionError.
        raise ModelException("Feature files contain no characters.")

    # float() forces true division on Python 2.
    avg_n_obs_per_char = float(n_observations) / n_characters

    for sset in ssets:
        hmm = self.get_initial_hmm(sset, avg_n_obs_per_char)

        output_file = os.path.join(self.INIT_HMM_ROOT,
                                   "%d.xml" % sset.char_code)
        hmm.write(output_file)