def do_subclass_setup(cls):
    """Configure the two-state "Coin example" fixture on `cls`.

    Sets `cls.name`, `cls.min_frac_equal`, and `cls.generating_hmm`, plus
    `cls.hmm_dict`, which holds the same state priors and transition
    probabilities as shape / row / col / data coordinate lists and an empty
    list of emission probabilities.
    """
    cls.name = "Coin example"
    cls.min_frac_equal = 0.69

    # Factors are built in the same order the original keyword dict listed them.
    priors = ArrayFactor([0.005, 0.995])
    transitions = MatrixFactor(numpy.array([[0.8, 0.2], [0.3, 0.7]]))
    emissions = [ArrayFactor([0.6, 0.4]), ArrayFactor([0.15, 0.85])]
    cls.generating_hmm = FirstOrderHMM(
        state_priors=priors,
        trans_probs=transitions,
        emission_probs=emissions,
    )

    prior_coords = {
        "shape": (1, 2),
        "row": [0, 0],
        "col": [0, 1],
        "data": [0.005, 0.995],
    }
    transition_coords = {
        "shape": (2, 2),
        "row": [0, 0, 1, 1],
        "col": [0, 1, 0, 1],
        "data": [0.8, 0.2, 0.3, 0.7],
    }
    cls.hmm_dict = {
        "emission_probs": [],
        "state_priors": prior_coords,
        "trans_probs": transition_coords,
    }
def setUpClass(cls):
    """Build the generating and naive HMMs, sample data, and retrain.

    Runs `cls.do_subclass_setup()` first, then constructs both models from
    entries of `cls.arrays`. It calls `generate(cls.example_len)` on the
    generating model `cls.num_examples` times, keeping element ``[1]`` of
    each result as a training example, and stores the output of
    `train_baum_welch` on the naive model in `cls.training_results`.
    """
    cls.do_subclass_setup()
    print("Setting up tests for %s" % cls.test_name)

    arrays = cls.arrays

    # Model whose parameters are used to produce the training data.
    cls.generating_hmm = FirstOrderHMM(
        state_priors=ArrayFactor(arrays["state_priors"]),
        trans_probs=MatrixFactor(arrays["transition_probs"]),
        emission_probs=cls.get_emission_probs(),
    )

    # Model that serves as the starting point for training.
    cls.naive_hmm = FirstOrderHMM(
        state_priors=ArrayFactor(arrays["naive_state_priors"]),
        trans_probs=MatrixFactor(arrays["naive_transition_probs"]),
        emission_probs=cls.get_naive_emission_probs(),
    )

    # Draw cls.num_examples training examples from the generating model.
    source_hmm = cls.generating_hmm
    cls.training_examples = [
        source_hmm.generate(cls.example_len)[1] for _ in range(cls.num_examples)
    ]

    # Retrain the naive model on the generated examples.
    cls.training_results = train_baum_welch(
        cls.naive_hmm,
        cls.training_examples,
        state_prior_estimator=cls.state_prior_estimator,
        transition_estimator=cls.transition_estimator,
        emission_estimator=cls.emission_estimator,
        miniter=80,
        maxiter=1000,
        processes=1,
        logfunc=DefaultLoggerFactory(ScreenWriter(), cls.naive_hmm, maxcols=5),
    )
def test_from_dict_with_hmm(self):
    """Yield checks that `ModelReducer._from_dict` restores an attached HMM.

    For every model in `self.models` whose starting order is below 5, a
    first-order HMM with random, normalized state priors and transition
    probabilities is built, packed into a dictionary, and handed to
    `ModelReducer._from_dict`. The result is compared against a freshly
    constructed `ModelReducer` carrying the same HMM.
    """
    # Keys are unique, so sorting keys gives the same order as sorting items.
    for key in sorted(self.models):
        starting_order, num_states = key
        model = self.models[key]
        if starting_order >= 5:
            continue

        n_low = model.low_order_states

        # Random probability vector over the low-order states.
        state_priors = numpy.random.random(n_low)
        state_priors /= state_priors.sum()

        # Random square matrix with each row scaled to sum to one.
        trans_probs = numpy.random.random((n_low, n_low))
        trans_probs = (trans_probs.T / trans_probs.sum(1)).T

        my_hmm = FirstOrderHMM(
            state_priors=ArrayFactor(state_priors),
            trans_probs=MatrixFactor(trans_probs),
            emission_probs=[None] * n_low,
        )

        found = ModelReducer._from_dict({
            "starting_order": starting_order,
            "num_states": num_states,
            "hmm": my_hmm,
        })

        expected = ModelReducer(starting_order, num_states)
        expected.hmm = my_hmm

        for attr in ("starting_order", "high_order_states"):
            yield check_equal, getattr(found, attr), getattr(expected, attr)

        for attr in ("trans_probs", "state_priors"):
            found_factor = getattr(found.hmm, attr)
            source_factor = getattr(my_hmm, attr)
            yield check_array_equal, found_factor.data, source_factor.data
def test_to_dict_with_hmm(self):
    """Yield checks that `_to_dict` serializes a model with an attached HMM.

    For every model in `self.models` whose starting order is below 5, a
    first-order HMM with random, normalized parameters is attached to the
    model, `_to_dict()` is called with warnings suppressed, and each expected
    key is compared against the output. The model's `hmm` attribute is reset
    to `None` before any check is yielded.
    """
    # Keys are unique, so sorting keys gives the same order as sorting items.
    for key in sorted(self.models):
        starting_order, num_states = key
        model = self.models[key]
        if starting_order >= 5:
            continue

        n_low = model.low_order_states

        # Random probability vector over the low-order states.
        state_priors = numpy.random.random(n_low)
        state_priors /= state_priors.sum()

        # Random square matrix with each row scaled to sum to one.
        trans_probs = numpy.random.random((n_low, n_low))
        trans_probs = (trans_probs.T / trans_probs.sum(1)).T

        my_hmm = FirstOrderHMM(
            state_priors=ArrayFactor(state_priors),
            trans_probs=MatrixFactor(trans_probs),
            emission_probs=[None] * n_low,
        )
        model.hmm = my_hmm

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            found = model._to_dict()

        expected = {
            "starting_order": starting_order,
            "num_states": num_states,
            "hmm": my_hmm,
        }

        # Detach the HMM again so the shared model is left without one.
        model.hmm = None

        for name, wanted in expected.items():
            # NOTE(review): "first_order_hmm" is not a key of `expected`, so
            # check_dict_equal is never selected here -- confirm whether the
            # "hmm" entry was meant to take this path.
            checker = check_dict_equal if name == "first_order_hmm" else check_equal
            yield checker, found[name], wanted
def do_subclass_setup(cls):
    """Configure the two-state "Two gaussian example" fixture on `cls`.

    Sets `cls.name`, `cls.min_frac_equal`, and `cls.generating_hmm`, whose
    emissions are two `scipy.stats.norm` distribution factors. Also sets
    `cls.hmm_dict`, which holds the same state priors and transition
    probabilities as shape / row / col / data coordinate lists and an empty
    list of emission probabilities.
    """
    cls.name = "Two gaussian example"
    cls.min_frac_equal = 0.7

    transitions = numpy.matrix([[0.9, 0.1], [0.25, 0.75]])

    # Factors are built in the same order the original keyword dict listed them.
    trans_factor = MatrixFactor(transitions)
    prior_factor = ArrayFactor([0.8, 0.2])
    emission_factors = [
        ScipyDistributionFactor(scipy.stats.norm, loc=0, scale=0.5),
        ScipyDistributionFactor(scipy.stats.norm, loc=5, scale=10),
    ]
    cls.generating_hmm = FirstOrderHMM(
        trans_probs=trans_factor,
        state_priors=prior_factor,
        emission_probs=emission_factors,
    )

    prior_coords = {
        "shape": (1, 2),
        "row": [0, 0],
        "col": [0, 1],
        "data": [0.8, 0.2],
    }
    transition_coords = {
        "shape": (2, 2),
        "row": [0, 0, 1, 1],
        "col": [0, 1, 0, 1],
        "data": [0.9, 0.1, 0.25, 0.75],
    }
    cls.hmm_dict = {
        "emission_probs": [],
        "state_priors": prior_coords,
        "trans_probs": transition_coords,
    }
def do_subclass_setup(cls):
    """Append random `ArrayFactor` fixtures and example observations to `cls`.

    For each length produced by the range below, a random vector of that
    length is normalized to sum to one and wrapped in an `ArrayFactor`
    appended to `cls.factors`. A list of 50 one-element tuples of random
    integers in ``[0, length)`` is appended to `cls.examples`.
    """
    # NOTE(review): range(10, 100, 200) produces only the value 10 because
    # the step is larger than the span -- confirm the intended bounds.
    for my_len in range(10, 100, 200):
        weights = numpy.random.random(my_len)
        weights /= weights.sum()
        cls.factors.append(ArrayFactor(weights))

        draws = numpy.random.randint(0, high=my_len, size=50)
        cls.examples.append([(X, ) for X in draws])
def construct_factors(self, model, reduced_data, noise_weight=0, pseudocount_weight=1e-10):
    """Construct discrete emission factors for an HMM from reduced data

    Parameters
    ----------
    model : :class:`~minihmm.hmm.FirstOrderHMM` or subclass
        Not referenced by this implementation

    reduced_data : numpy.ndarray
        sufficient statistics for observations, from
        :meth:`PseudocountEmissionEstimator.reduce_data`. The entries are
        summed together before any further processing.

    noise_weight : float, optional
        weight of noise to add, relative to the number of observations
        in the data set; passed through to `get_model_noise`. (Default: 0)

    pseudocount_weight : float, optional
        weight of pseudocounts to add, relative to the number of
        observations in the data set (Default: 1e-10)

    Returns
    -------
    list
        list of :class:`~minihmm.factors.ArrayFactor` objects, one per row
        of the row-normalized count matrix, representing emission
        probabilities for each state
    """
    counts = sum(reduced_data)

    # Total is taken before noise or pseudocounts are mixed in, so the
    # pseudocount mass is relative to the raw observations only.
    raw_total = counts.sum()

    noise = get_model_noise(counts, noise_weight, assymetric_weights=self.pseudocount_array)
    counts += noise

    pseudocount_scale = pseudocount_weight * raw_total
    counts += (pseudocount_scale * self.pseudocount_array) / self.pseudocount_array.sum()

    # Scale each row so that it sums to one.
    row_normed = (counts.T / counts.sum(1)).T

    return [ArrayFactor(row_normed[row, :]) for row in range(row_normed.shape[0])]
def construct_factors(self, model, reduced_data, noise_weight=0, pseudocount_weight=1e-10):
    """Construct a discrete state prior factor for an HMM from reduced data

    Parameters
    ----------
    model : :class:`~minihmm.hmm.FirstOrderHMM` or subclass
        Not referenced by this implementation

    reduced_data : numpy.ndarray
        sufficient statistics for observations, from
        :meth:`PseudocountStatePriorEstimator.reduce_data`. The entries are
        summed together before any further processing.

    noise_weight : float, optional
        weight of noise to add, relative to the number of observations
        in the data set; passed through to `get_model_noise`. (Default: 0)

    pseudocount_weight : float, optional
        weight of pseudocounts to add, relative to the number of
        observations in the data set (Default: 1e-10)

    Returns
    -------
    :class:`~minihmm.factors.ArrayFactor`
        State prior factor, normalized to sum to one
    """
    prior_counts = sum(reduced_data)

    # Total is taken before noise or pseudocounts are mixed in, so the
    # pseudocount mass is relative to the raw observations only.
    raw_total = prior_counts.sum()

    noise = get_model_noise(
        prior_counts, noise_weight, assymetric_weights=self.pseudocount_array
    )
    prior_counts += noise

    pseudocount_scale = pseudocount_weight * raw_total
    prior_counts += (pseudocount_scale * self.pseudocount_array) / self.pseudocount_array.sum()

    return ArrayFactor(prior_counts / prior_counts.sum())
def construct_factors(self, model, reduced_data, noise_weight=0, pseudocount_weight=1e-10):
    """Construct a tied state prior factor for an HMM from reduced data

    Counts for states that share an entry in `self.index_map` are pooled,
    perturbed with noise, divided by `self.index_weights`, and copied back
    to every state mapped to that entry. Pseudocounts are then added, the
    result is multiplied by `self.pseudocount_mask`, and the vector is
    normalized.

    Parameters
    ----------
    model : :class:`~minihmm.hmm.FirstOrderHMM` or subclass
        Not referenced by this implementation

    reduced_data : numpy.ndarray
        sufficient statistics for observations, from
        :meth:`TiedStatePriorEstimator.reduce_data`. The entries are
        summed together before any further processing.

    noise_weight : float, optional
        weight of noise to add, relative to the number of observations
        in the data set; passed through to `get_model_noise`. (Default: 0)

    pseudocount_weight : float, optional
        weight of pseudocounts to add, relative to the number of
        observations in the data set (Default: 1e-10)

    Returns
    -------
    :class:`~minihmm.factors.ArrayFactor`
        Tied state priors
    """
    raw_counts = sum(reduced_data)
    raw_total = raw_counts.sum()

    # Pool the raw counts of all states that map to the same tied index.
    tied_counts = numpy.zeros(1 + max(self.index_map))
    for state, count in enumerate(raw_counts):
        tied_counts[self.index_map[state]] += count

    # Add noise.
    # FIXME: THIS WILL ADD NOISE TO FORBIDDEN CELLS
    tied_counts += get_model_noise(
        tied_counts, noise_weight, assymetric_weights=self.index_weights
    )

    # Divide each pooled cell by the corresponding entry of index_weights
    # (the number of destination cells, per the original comment).
    tied_counts /= self.index_weights

    # Copy every pooled value back out to the states tied to it.
    untied = numpy.zeros_like(raw_counts, dtype=float)
    for state in range(len(untied)):
        untied[state] = tied_counts[self.index_map[state]]

    # Add pseudocounts, scaled relative to the raw observation total.
    pseudocount_scale = pseudocount_weight * raw_total
    untied += (pseudocount_scale * self.pseudocount_array) / self.pseudocount_array.sum()

    # Re-zero forbidden cells that received noise during tying.
    untied *= self.pseudocount_mask

    # Normalize.
    untied /= untied.sum()
    return ArrayFactor(untied)
def remap_from_first_order(self, native_hmm):
    """Remap parameters from a native first-order HMM onto a first-order
    translation of a high-order HMM, for example to provide a reasonable
    non-random starting point for refinement training of the high-order HMM.

    Parameters
    ----------
    native_hmm : :class:`minihmm.hmm.FirstOrderHMM`
        Native, first-order HMM, preferably with trained parameters

    Returns
    -------
    :class:`~minihmm.hmm.FirstOrderHMM`
        First-order representation of the high-order HMM structure described
        by `self`, with parameters from `native_hmm` remapped into
        corresponding positions.

    Raises
    ------
    ValueError
        If `native_hmm.num_states` differs from `self.high_order_states`
    """
    state_lookup = self.high_states_to_low

    # Check that the number of states is compatible.
    if self.high_order_states != native_hmm.num_states:
        raise ValueError(
            "Native HMM (%d states), has different number of states than `self` (%d states)" %
            (native_hmm.num_states, self.high_order_states)
        )

    # For transitions:
    #   each high-order state transition `(n-i, ... , n-1) -> (n-i+1 , ... , n)`
    #   should be mapped to appropriate transformations of the parameters (n - 1 , n)
    #
    # For state priors and emission probabilities:
    #   each high-order state (n-i, ..., n) should be given the parameters
    #   matching native state `n`
    #
    # Appropriate state-tying matrices for emissions will be needed as well.
    n_low = self.low_order_states

    native_priors = native_hmm.state_priors.data
    native_trans = native_hmm.trans_probs.data
    native_emissions = native_hmm.emission_probs

    new_priors = numpy.zeros(n_low, dtype=float)
    new_trans = numpy.zeros((n_low, n_low), dtype=float)
    new_emissions = [None] * n_low

    for history, low_state in state_lookup.items():
        # The last element of the tuple is the native state occupied now.
        current_native = history[-1]
        new_priors[low_state] = native_priors[current_native]
        new_emissions[low_state] = copy.deepcopy(native_emissions[current_native])

        for next_native in range(self.high_order_states):
            # Drop the oldest element of the history and append the successor.
            successor = tuple(history)[1:] + (next_native, )
            next_low_state = state_lookup[successor]
            new_trans[low_state, next_low_state] = native_trans[current_native, next_native]

    # Renormalize the state priors.
    new_priors /= new_priors.sum()

    # Shouldn't have to renormalize transitions; check this.
    new_trans = (new_trans.T / new_trans.sum(1)).T

    return FirstOrderHMM(
        state_priors=ArrayFactor(new_priors),
        emission_probs=new_emissions,
        trans_probs=MatrixFactor(new_trans),
    )
def get_naive_emission_probs(cls):
    """Return one `ArrayFactor` per entry of ``cls.arrays["naive_emission_probs"]``."""
    naive_rows = cls.arrays["naive_emission_probs"]
    return [ArrayFactor(row) for row in naive_rows]