def _create_mm(self, motif_num, alphabet):
    """Train a Markov model on the instances of one motif (Baum-Welch).

    The motif is read from ``self.original_motives_list`` when that
    attribute exists and from ``self.motives_list`` otherwise. One hidden
    state is created per position, up to the median instance length
    (rounded up). When ``len(instances) * len(states)`` exceeds 500 the
    instances are randomly under-sampled to ``500 // len(states)`` items
    (at least one) before training.

    Args:
        motif_num: 1-based index of the motif to model.
        alphabet: Emission symbols forwarded to ``MarkovModel.train_bw``.

    Returns:
        The model returned by ``MarkovModel.train_bw``.

    Raises:
        RuntimeError: If training raises ``RuntimeError``; the message is
            prefixed with "Motif data is too large. ".
    """
    try:
        # Only EDeN has original_motives_list
        input_motif = self.original_motives_list[motif_num - 1]
    except AttributeError:
        input_motif = self.motives_list[motif_num - 1]

    # Each motif entry is a (header, instance) pair; only instances are used.
    _headers, instances = [list(column) for column in zip(*input_motif)]
    lengths = [len(instance) for instance in instances]
    median_len = int(math.ceil(np.median(lengths)))

    # Hidden states for Markov Model
    states = [str(position + 1) for position in range(median_len)]

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("original samples: %d" % len(instances))
    print("states: %d" % len(states))

    # Under-sampling: draw exactly once, and only when the budget is
    # exceeded, so `samples` is never referenced while undefined.
    if len(instances) * len(states) > 500:
        # Floor division keeps an int on Python 3; never sample zero items.
        samples = max(1, 500 // len(states))
        print('sample size = %d' % samples)
        instances = random.sample(instances, samples)

    try:
        mm = MarkovModel.train_bw(states=states,
                                  alphabet=alphabet,
                                  training_data=instances)
    except RuntimeError as msg:
        raise RuntimeError("Motif data is too large. " + str(msg))
    # Hand the trained model back instead of silently discarding it.
    return mm
def _create_mm(self, motif_num, alphabet):
    """Train a Markov model on the instances of one motif (Baum-Welch).

    The motif is read from ``self.original_motives_list`` when that
    attribute exists and from ``self.motives_list`` otherwise. One hidden
    state is created per position, up to the median instance length
    (rounded up). When ``len(instances) * len(states)`` exceeds 500 the
    instances are randomly under-sampled to ``500 // len(states)`` items
    (at least one) before training.

    Args:
        motif_num: 1-based index of the motif to model.
        alphabet: Emission symbols forwarded to ``MarkovModel.train_bw``.

    Returns:
        The model returned by ``MarkovModel.train_bw``.

    Raises:
        RuntimeError: If training raises ``RuntimeError``; the message is
            prefixed with "Motif data is too large. ".
    """
    try:
        # Only EDeN has original_motives_list
        input_motif = self.original_motives_list[motif_num - 1]
    except AttributeError:
        input_motif = self.motives_list[motif_num - 1]

    # Each motif entry is a (header, instance) pair; only instances are used.
    _headers, instances = [list(column) for column in zip(*input_motif)]
    lengths = [len(instance) for instance in instances]
    median_len = int(math.ceil(np.median(lengths)))

    # Hidden states for Markov Model
    states = [str(position + 1) for position in range(median_len)]

    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("original samples: %d" % len(instances))
    print("states: %d" % len(states))

    # Under-sampling: draw exactly once, and only when the budget is
    # exceeded, so `samples` is never referenced while undefined.
    if len(instances) * len(states) > 500:
        # Floor division keeps an int on Python 3; never sample zero items.
        samples = max(1, 500 // len(states))
        print('sample size = %d' % samples)
        instances = random.sample(instances, samples)

    try:
        mm = MarkovModel.train_bw(states=states,
                                  alphabet=alphabet,
                                  training_data=instances)
    except RuntimeError as msg:
        raise RuntimeError("Motif data is too large. " + str(msg))
    # Hand the trained model back instead of silently discarding it.
    return mm
def test_train_bw(self):
    """Check MarkovModel.train_bw against reference parameters.

    The random generator is seeded with 0 before training. The trained
    model's states and alphabet must match the inputs, and its initial,
    transition and emission matrices must equal the reference values
    after rounding to three decimals.
    """
    random.seed(0)

    states = ["0", "1", "2", "3"]
    alphabet = ["A", "C", "G", "T"]
    training_data = [
        "AACCCGGGTTTTTTT",
        "ACCGTTTTTTT",
        "ACGGGTTTTTT",
        "ACCGTTTTTTTT",
    ]

    # Reference parameters, keyed by the model attribute they are
    # compared with.
    expected_by_attribute = (
        (
            "p_initial",
            array([0.2275677, 0.29655611, 0.24993822, 0.22593797]),
        ),
        (
            "p_transition",
            array(
                [
                    [5.16919807e-001, 3.65825814e-033, 4.83080193e-001, 9.23220689e-042],
                    [3.65130247e-001, 1.00000000e-300, 6.34869753e-001, 1.00000000e-300],
                    [8.68776164e-001, 1.02254304e-034, 1.31223836e-001, 6.21835051e-047],
                    [3.33333333e-301, 3.33333333e-001, 3.33333333e-301, 6.66666667e-001],
                ]
            ),
        ),
        (
            "p_emission",
            array(
                [
                    [2.02593570e-301, 2.02593570e-301, 2.02593570e-301, 1.00000000e000],
                    [1.00000000e-300, 1.00000000e-300, 1.00000000e000, 1.09629016e-259],
                    [3.26369779e-301, 3.26369779e-301, 3.26369779e-301, 1.00000000e000],
                    [3.33333333e-001, 6.66666667e-001, 3.33333333e-301, 3.33333333e-301],
                ]
            ),
        ),
    )

    markov_model = MarkovModel.train_bw(states, alphabet, training_data)

    self.assertEqual("".join(markov_model.states), "".join(states))
    self.assertEqual("".join(markov_model.alphabet), "".join(alphabet))

    # Same order as the reference table: initial, transition, emission.
    for attribute, expected in expected_by_attribute:
        actual = getattr(markov_model, attribute)
        self.assertTrue(
            array_equal(
                around(actual, decimals=3),
                around(expected, decimals=3),
            )
        )
def test_train_bw(self):
    """Train with Baum-Welch on four sequences and verify the parameters.

    With the random generator seeded to 0, the model returned by
    MarkovModel.train_bw must keep the given states and alphabet, and its
    probability matrices must match the reference values to three
    decimals.
    """

    def round3(values):
        # Both sides of every comparison are rounded to three decimals.
        return around(values, decimals=3)

    random.seed(0)
    states = ["0", "1", "2", "3"]
    alphabet = ["A", "C", "G", "T"]
    training_data = ["AACCCGGGTTTTTTT", "ACCGTTTTTTT",
                     "ACGGGTTTTTT", "ACCGTTTTTTTT"]

    expected_initial = array(
        [0.2275677, 0.29655611, 0.24993822, 0.22593797])
    expected_transition = array([
        [5.16919807e-001, 3.65825814e-033, 4.83080193e-001, 9.23220689e-042],
        [3.65130247e-001, 1.00000000e-300, 6.34869753e-001, 1.00000000e-300],
        [8.68776164e-001, 1.02254304e-034, 1.31223836e-001, 6.21835051e-047],
        [3.33333333e-301, 3.33333333e-001, 3.33333333e-301, 6.66666667e-001],
    ])
    expected_emission = array([
        [2.02593570e-301, 2.02593570e-301, 2.02593570e-301, 1.00000000e+000],
        [1.00000000e-300, 1.00000000e-300, 1.00000000e+000, 1.09629016e-259],
        [3.26369779e-301, 3.26369779e-301, 3.26369779e-301, 1.00000000e+000],
        [3.33333333e-001, 6.66666667e-001, 3.33333333e-301, 3.33333333e-301],
    ])

    model = MarkovModel.train_bw(states, alphabet, training_data)

    self.assertEqual(''.join(model.states), ''.join(states))
    self.assertEqual(''.join(model.alphabet), ''.join(alphabet))

    initial_matches = array_equal(
        round3(model.p_initial), round3(expected_initial))
    self.assertTrue(initial_matches)

    transition_matches = array_equal(
        round3(model.p_transition), round3(expected_transition))
    self.assertTrue(transition_matches)

    emission_matches = array_equal(
        round3(model.p_emission), round3(expected_emission))
    self.assertTrue(emission_matches)