def load_segmentation_model(modeldata):
    model = HiddenMarkovModel('model')
    states = {}
    for s in modeldata:
        if len(s['emission']) == 1:
            emission = NormalDistribution(*s['emission'][0][:2])
        else:
            weights = np.array([w for _, _, w in s['emission']])
            dists = [NormalDistribution(mu, sigma)
                     for mu, sigma, _ in s['emission']]
            emission = GeneralMixtureModel(dists, weights=weights)
        state = State(emission, name=s['name'])
        states[s['name']] = state
        model.add_state(state)
        if 'start_prob' in s:
            model.add_transition(model.start, state, s['start_prob'])
    for s in modeldata:
        current = states[s['name']]
        for nextstate, prob in s['transition']:
            model.add_transition(current, states[nextstate], prob)
    model.bake()
    return model
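# A minimal usage sketch for load_segmentation_model. The schema of
# `example_modeldata` is an assumption inferred from the function above
# (each entry: a name, a list of (mu, sigma, weight) emission tuples,
# outgoing transitions, and an optional start probability); pomegranate
# and numpy imports are assumed from the surrounding context.
example_modeldata = [
    {'name': 'low', 'emission': [(0.0, 1.0, 1.0)], 'start_prob': 0.6,
     'transition': [('low', 0.9), ('high', 0.1)]},
    {'name': 'high', 'emission': [(5.0, 1.0, 0.5), (8.0, 2.0, 0.5)],
     'start_prob': 0.4, 'transition': [('high', 0.8), ('low', 0.2)]},
]
segmentation_model = load_segmentation_model(example_modeldata)
print(segmentation_model.predict([0.1, 0.3, 5.2, 7.9]))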
def train_hmm_tagger(data):
    # HMM
    # Use the tag unigrams and bigrams calculated above to construct a
    # hidden Markov tagger.
    #
    # - Add one state per tag
    #   - The emission distribution at each state should be estimated with
    #     the formula: $P(w|t) = \frac{C(t, w)}{C(t)}$
    # - Add an edge from the starting state `basic_model.start` to each tag
    #   - The transition probability should be estimated with the formula:
    #     $P(t|start) = \frac{C(start, t)}{C(start)}$
    # - Add an edge from each tag to the end state `basic_model.end`
    #   - The transition probability should be estimated with the formula:
    #     $P(end|t) = \frac{C(t, end)}{C(t)}$
    # - Add an edge between _every_ pair of tags
    #   - The transition probability should be estimated with the formula:
    #     $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")
    state_dict = {}
    emission_counts = pair_counts(*list(zip(*data.training_set.stream()))[::-1])
    for tag in emission_counts.keys():
        tag_count = tag_unigrams[tag]
        probs = {}
        for w in emission_counts[tag]:
            probs[w] = emission_counts[tag][w] / tag_count
        emission_p = DiscreteDistribution(probs)
        state = State(emission_p, name=tag)
        basic_model.add_state(state)
        state_dict[tag] = state
    for tag in tag_starts:
        basic_model.add_transition(basic_model.start, state_dict[tag],
                                   tag_starts[tag] / len(data.training_set.Y))
        basic_model.add_transition(state_dict[tag], basic_model.end,
                                   tag_ends[tag] / tag_unigrams[tag])
    for (tag1, tag2) in tag_bigrams:
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])
    # finalize the model
    basic_model.bake()

    assert all(tag in set(s.name for s in basic_model.states)
               for tag in data.training_set.tagset), \
        ("Every state in your network should use the name of the associated "
         "tag, which must be one of the training set tags.")
    assert basic_model.edge_count() == 168, \
        ("Your network should have an edge from the start node to each state, "
         "one edge between every pair of tags (states), and an edge from each "
         "state to the end node.")
    HTML('<div class="alert alert-block alert-success">'
         'Your HMM network topology looks good!</div>')
    return basic_model
class HMMWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()
        self.start = self.model.start
        self.end = self.model.end
        self.states_before_bake = []
        self.states = None

    def add_state(self, state, start_prob=0):
        self.states_before_bake.append((state, start_prob))
        self.model.add_state(state)

    def add_transition(self, start_state, end_state, prob):
        # print('adding from', start_state.name, 'to', end_state.name, prob)
        self.model.add_transition(start_state, end_state, prob)

    def bake(self):
        # States whose name contains 'none' are excluded from the start
        # distribution; whatever start probability was not explicitly
        # assigned is split evenly among the remaining states.
        starter_states_no_prob = []
        free_start_prob = 1.0
        for state, start_prob in self.states_before_bake:
            if 'none' not in state.name:
                if not start_prob:
                    starter_states_no_prob.append(state)
                else:
                    free_start_prob -= start_prob
                    print('assigned ' + str(start_prob) + ' to ' + state.name)
                    self.add_transition(self.start, state, start_prob)
        len_no_prob = len(starter_states_no_prob)
        starter_prob = free_start_prob / len_no_prob
        print(len_no_prob, starter_prob)
        for state in starter_states_no_prob:
            self.add_transition(self.start, state, starter_prob)
        self.model.bake()
        self.states = self.model.states

    def make_states_from_alignment(self, first_state, last_state,
                                   seq_matrix, name):
        columns = column_clasify(seq_matrix)
        zones = create_zones(columns)
        grouped_states = group_states(zones, name)
        add_states(self, grouped_states)
        trans = calculate_transitions(first_state, last_state, grouped_states)
        apply_transitions(self, trans)

    def predict(self, *args, **kwargs):
        return self.model.predict(*args, **kwargs)
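# A minimal sketch of HMMWrapper's start-probability bookkeeping, assuming
# pomegranate's State/DiscreteDistribution are in scope and using illustrative
# state names. 'rainy' gets an explicit 0.7 start probability; the leftover
# 0.3 is handed to the remaining state at bake time.
wrapper = HMMWrapper()
rainy = State(DiscreteDistribution({'walk': 0.1, 'shop': 0.4, 'clean': 0.5}),
              name='rainy')
sunny = State(DiscreteDistribution({'walk': 0.6, 'shop': 0.3, 'clean': 0.1}),
              name='sunny')
wrapper.add_state(rainy, start_prob=0.7)
wrapper.add_state(sunny)
wrapper.add_transition(rainy, rainy, 0.6)
wrapper.add_transition(rainy, sunny, 0.4)
wrapper.add_transition(sunny, sunny, 0.6)
wrapper.add_transition(sunny, rainy, 0.4)
wrapper.bake()  # prints "assigned 0.7 to rainy", then "1 0.3"
print(wrapper.predict(['walk', 'shop', 'clean']))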
def _initialize_new_hmm(hmm, new_states, new_transitions):
    new_hmm = HiddenMarkovModel()
    for state in new_states:
        if state not in (hmm.start, hmm.end):
            new_hmm.add_state(state)
    for source_state, target_state, probability in new_transitions:
        if source_state != hmm.start and target_state != hmm.end:
            new_hmm.add_transition(source_state, target_state, probability)
        elif source_state == hmm.start:
            new_hmm.add_transition(new_hmm.start, target_state, probability)
        elif target_state == hmm.end:
            new_hmm.add_transition(source_state, new_hmm.end, probability)
    new_hmm.bake()
    return new_hmm
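# A hedged usage sketch for _initialize_new_hmm: rebuild a model from edited
# state and transition lists (e.g. after pruning), remapping the old model's
# start/end sentinels onto the new model's. The two states and the transition
# triples below are illustrative, not from the original source.
old = HiddenMarkovModel()
a = State(DiscreteDistribution({'x': 0.9, 'y': 0.1}), name='a')
b = State(DiscreteDistribution({'x': 0.2, 'y': 0.8}), name='b')
old.add_states(a, b)
rebuilt = _initialize_new_hmm(
    old, [a, b],
    [(old.start, a, 1.0), (a, b, 1.0), (b, old.end, 1.0)])
print(rebuilt.log_probability(['x', 'y']))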
def insert_delete_main_hmm(data_matrix):
    v_columns = column_clasify(data_matrix)
    v_zones = create_zones(v_columns)
    v_grouped_states = group_states(v_zones, 'test')
    v_model = HiddenMarkovModel()
    v_first_state = State(None, name='ali_start')
    v_last_state = State(None, name='ali_end')
    v_model.add_state(v_first_state)
    v_model.add_transition(v_model.start, v_first_state, 1)
    v_model.add_state(v_last_state)
    add_states(v_model, v_grouped_states)
    v_trans = calculate_transitions(v_first_state, v_last_state,
                                    v_grouped_states)
    apply_transitions(v_model, v_trans)
    v_model.bake()
    return v_model
class ModelWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()

    def add_state(self, distribution, name):
        state = State(distribution, name=name)
        self.model.add_state(state)
        return state

    def bake(self):
        self.model.bake()

    def viterbi(self, seq):
        return self.model.viterbi(seq)

    def add_transition(self, states, next_state_data):
        # Fan out: add an edge from every source state to each
        # (target state, probability) pair.
        for state in states:
            for next_state, prob in next_state_data:
                self.model.add_transition(state, next_state, prob)
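# A hedged usage sketch for ModelWrapper, assuming pomegranate imports are in
# scope; the coin-flip states are illustrative. add_transition fans out from a
# list of source states to a list of (target, probability) pairs.
wrapper = ModelWrapper()
hot = wrapper.add_state(DiscreteDistribution({'H': 0.8, 'T': 0.2}), 'hot')
cold = wrapper.add_state(DiscreteDistribution({'H': 0.3, 'T': 0.7}), 'cold')
wrapper.add_transition([wrapper.model.start], [(hot, 0.5), (cold, 0.5)])
wrapper.add_transition([hot, cold], [(hot, 0.5), (cold, 0.5)])
wrapper.bake()
logp, path = wrapper.viterbi(list('HHTT'))
print([s.name for _, s in path])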
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """
    Creates a Hidden Markov Model from DataFrames.
    @args:
        - e_df (pd.DataFrame): contains the emission probabilities
        - q_df (pd.DataFrame): contains the transition probabilities
        - start_p_dict (dict): start probability for each state
    """
    model = HiddenMarkovModel(name="Example Model")

    # 1: Create a dict entry for each key in the transition DataFrame
    model_dict = {}
    for key in q_df.keys().values:
        model_dict[key] = {}

    # 2: Create the states
    for key in model_dict:
        # 2.1: Add the emission probabilities to each state, P(observation | state)
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        # 2.2: Add the start probability for each state
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    # 3: Add the transition probability between each pair of states
    for key, item in q_df.to_dict("index").items():
        for item_name, value in item.items():
            print(key, " , ", item_name, ": ", value)
            tmp_origin = model_dict[key]
            tmp_destination = model_dict[item_name]
            model.add_transition(tmp_origin, tmp_destination,
                                 q_df.loc[key, item_name])

    # finally, call the .bake() method to finalize the model
    model.bake()
    return model
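# A hedged usage example with the classic two-state weather model. The
# DataFrame layout (one emission column per state; transition rows indexed by
# source state) is an assumption inferred from how e_df and q_df are read above.
import pandas as pd

e_df = pd.DataFrame({
    'sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
    'rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
})
q_df = pd.DataFrame([[0.7, 0.3], [0.4, 0.6]],
                    index=['sunny', 'rainy'], columns=['sunny', 'rainy'])
start_p = {'sunny': 0.5, 'rainy': 0.5}
weather_model = create_hidden_MarkovModel(e_df, q_df, start_p)
logp, path = weather_model.viterbi(['walk', 'clean', 'shop'])
print([s.name for _, s in path])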
hmmodel = HiddenMarkovModel()
back_state = State(DiscreteDistribution({
    'a': 0.25, 'c': 0.25, 'g': 0.25, 't': 0.25
}), name='back')
fixed_state = State(DiscreteDistribution({
    'a': 0.45, 'c': 0.45, 'g': 0.05, 't': 0.05
}), name='fixed')
hmmodel.add_state(back_state)
hmmodel.add_state(fixed_state)
hmmodel.add_transition(hmmodel.start, back_state, 1)
hmmodel.add_transition(back_state, back_state, 0.9)
hmmodel.add_transition(back_state, fixed_state, 0.1)
hmmodel.add_transition(fixed_state, fixed_state, 0.9)
hmmodel.add_transition(fixed_state, back_state, 0.1)
hmmodel.bake()

seq = list('acgtacgtaaaaccccaaa')
logp, path = hmmodel.viterbi(seq)
print([x[1].name for x in path])
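# The decoded path above should flip from 'back' to 'fixed' over the a/c-rich
# stretch of the sequence. A quick extra check on the overall likelihood:
print('log P(seq) =', hmmodel.log_probability(seq))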
A simple example highlighting how to build a model using states, add
transitions, and then run the algorithms, including showing how training
on a sequence improves the probability of the sequence.
"""

import random

from pomegranate import *
from pomegranate import HiddenMarkovModel as Model

random.seed(0)

model = Model(name="ExampleModel")

distribution = UniformDistribution(0.0, 1.0)
state = State(distribution, name="uniform")
state2 = State(NormalDistribution(0, 2), name="normal")
silent = State(None, name="silent")

model.add_state(state)
model.add_state(state2)

model.add_transition(state, state, 0.4)
model.add_transition(state, state2, 0.4)
model.add_transition(state2, state2, 0.4)
model.add_transition(state2, state, 0.4)
model.add_transition(model.start, state, 0.5)
model.add_transition(model.start, state2, 0.5)
model.add_transition(state, model.end, 0.2)
model.add_transition(state2, model.end, 0.2)

model.bake()
sequence = model.sample()
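# The docstring above promises that training improves the probability of a
# sequence; a minimal sketch of that step, assuming the sampled sequence is
# reused as the training data:
print('before fit:', model.log_probability(sequence))
model.fit([sequence])
print('after fit: ', model.log_probability(sequence))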
fake_back = State(DiscreteDistribution(intron_distribution.p), name='back2')
in0 = State(DiscreteDistribution(intron_distribution.p), name='in0')
in1 = State(DiscreteDistribution(intron_distribution.p), name='in1')
in2 = State(DiscreteDistribution(intron_distribution.p), name='in2')

in0_spacers = spacer_states_maker(64, intron_distribution.p, 'in0 spacer')
in1_spacers = spacer_states_maker(64, intron_distribution.p, 'in1 spacer')
in2_spacers = spacer_states_maker(64, intron_distribution.p, 'in2 spacer')

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

coding_model.add_state(back)
coding_model.add_state(fake_back)
coding_model.add_state(coding_state0)
coding_model.add_state(coding_state1)
coding_model.add_state(coding_state2)
coding_model.add_state(in0)
coding_model.add_state(in1)
coding_model.add_state(in2)
coding_model.add_state(exon3_state)

add_sequence(coding_model, poly_a_states)
add_sequence(coding_model, post_poly_spacer)
add_sequence(coding_model, in0_spacers)
add_sequence(coding_model, in1_spacers)
cat_states = sequence_state_factory(cat_data, 'CAT')
post_cat_var_spacers_tss = spacer_states_maker(151, no_coding.p,
                                               'post cat var spacer tss')
post_cat_spacers_tss = spacer_states_maker(42, no_coding.p,
                                           'post cat spacer tss')
post_cat_var_spacers_tata = spacer_states_maker(151, no_coding.p,
                                                'post cat var spacer tata')
post_cat_spacers_tata = spacer_states_maker(22, no_coding.p,
                                            'post cat spacer tata')

tata_states = sequence_state_factory(tata_data, 'tata')
post_tata_var_spacers = spacer_states_maker(16, no_coding.p,
                                            'post_tata_var_spacer')
post_tata_spacers = spacer_states_maker(4, no_coding.p, 'post_tata_spacer')

inr_states = sequence_state_factory(inr_data, 'inr')
no_inr_states = sequence_state_factory(no_inr_data, 'no inr')

# Add States
promoter_utr_model.add_state(back)

# Add Sequences
# GC
add_sequence(promoter_utr_model, gc_states)
add_sequence(promoter_utr_model, post_gc_spacers_tata)
add_variable_length_sequence(promoter_utr_model, post_gc_var_spacers_tata,
                             post_gc_spacers_tata[0])
add_sequence(promoter_utr_model, post_gc_spacers_tss)
add_variable_length_sequence(promoter_utr_model, post_gc_var_spacers_tss,
                             post_gc_spacers_tss[0])
add_sequence(promoter_utr_model, inr_states)
mdd_states_sequences = []
for index, l_em in enumerate(leaf_emissions):
    sequence = state_sequence_from(l_em, 'donor_' + str(index))
    add_sequence(hm_model, sequence[0])
    set_transition_probabilities(hm_model, sequence[0], sequence[1])
    mdd_states_sequences.append(sequence[0])

background = State(DiscreteDistribution({
    'a': 0.25, 'c': 0.25, 'g': 0.25, 't': 0.25
}), name='background')

hm_model.add_state(background)
hm_model.add_transition(hm_model.start, background, 0.9)
fork_sequence(hm_model, [hm_model.start], mdd_states_sequences,
              [0.025, 0.025, 0.025, 0.025])
hm_model.add_transition(background, background, 0.9)
fork_sequence(hm_model, [background], mdd_states_sequences,
              [0.025, 0.025, 0.025, 0.025])
reunify_sequences(hm_model, mdd_states_sequences, background, [1, 1, 1, 1])
hm_model.bake()

a = 'a'
c = 'c'
g = 'g'
from converter_to import converter_to

c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
matrixStop = numpy.array(matrix_from_exa('../data extractors/new_stops.exa'))

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')
post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)
add_sequence(model, stop_states)
model.add_state(post)

model.add_transition(model.start, coding_state1, 1)
model.add_transition(coding_state0, coding_state1, 1)
model.add_transition(coding_state1, coding_state2, 1)
model.add_transition(coding_state2, coding_state0, 0.6)
model.add_transition(coding_state2, stop_states[0], 0.4)
model.add_transition(stop_states[-1], post, 1)
model.add_transition(post, post, 0.9)
model.add_transition(post, model.end, 0.1)
def crop_type_hmm_model(nn_pobability_matrix, timeseries_steps,
                        n_observed_classes):
    # Class indices: 0 'unknown_plant', 1 'large_grass', 2 'small_grass',
    #                3 'other', 4 'fallow', 5 'no_crop'
    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=0,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=1,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=2,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=3,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=4,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d5 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=5,
        n_samples=timeseries_steps, n_classes=n_observed_classes)

    s0_unk = State(d0, name='unknown_plant')
    s1_large = State(d1, name='large_grass')
    s2_small = State(d2, name='small_grass')
    s3_other = State(d3, name='other')
    s4_fallow = State(d4, name='fallow')
    s5_none = State(d5, name='no_crop')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    states = [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none]
    for s in states:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    # "Sticky" transition rows favor staying in the same crop type;
    # pomegranate normalizes each row's weights on bake.
    model.add_transitions(s0_unk, states, [95., 0., 0., 0., 0., 5.])
    model.add_transitions(s1_large, states, [0., 95., 0., 0., 0., 5.])
    model.add_transitions(s2_small, states, [0., 0., 95., 0., 0., 5.])
    model.add_transitions(s3_other, states, [0., 0., 0., 95., 0., 5.])
    model.add_transitions(s4_fallow, states, [0., 0., 0., 0., 95., 5.])
    model.add_transitions(s5_none, states, [2., 2., 2., 2., 2., 90.])

    model.bake(verbose=False)
    return model
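# A self-contained sketch of the same "sticky diagonal" smoothing idea,
# assuming discrete observations in place of the project-specific
# NeuralNetworkWrapperCustom emissions (not shown here). A noisy label
# sequence gets smoothed toward long runs of a single class:
crop_s = State(DiscreteDistribution({'crop': 0.8, 'bare': 0.2}), name='crop')
bare_s = State(DiscreteDistribution({'crop': 0.2, 'bare': 0.8}), name='bare')
smooth = HiddenMarkovModel()
for s in [crop_s, bare_s]:
    smooth.add_state(s)
    smooth.add_transition(smooth.start, s, 1)
smooth.add_transitions(crop_s, [crop_s, bare_s], [95., 5.])
smooth.add_transitions(bare_s, [crop_s, bare_s], [5., 95.])
smooth.bake(verbose=False)
obs = ['crop', 'crop', 'bare', 'crop', 'crop']  # single 'bare' blip
print([smooth.states[i].name for i in smooth.predict(obs)])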
def train_and_test():
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = []
        for line in in_file:
            no_p_line = line.replace('P', '').lower().replace('\n', '')
            total.append(no_p_line)
    converted_total = [converter_to(x, 2) for x in total]

    matrixDonor0 = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa'))
    c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
    print(c0.p, c1.p, c2.p)

    coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
    coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
    coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

    donor0_data = classify(matrixDonor0, 2)
    donor0_states = sequence_state_factory(donor0_data, 'donor0')
    post = State(DiscreteDistribution(equal_distribution), name='post')

    model = HiddenMarkovModel('coding to donor')
    model.add_state(coding_state0)
    model.add_state(coding_state1)
    model.add_state(coding_state2)
    add_sequence(model, donor0_states)
    model.add_state(post)

    model.add_transition(model.start, coding_state0, 1)
    model.add_transition(coding_state0, coding_state1, 0.6)
    model.add_transition(coding_state0, donor0_states[0], 0.4)
    model.add_transition(coding_state1, coding_state2, 0.6)
    model.add_transition(coding_state1, donor0_states[0], 0.4)
    model.add_transition(coding_state2, coding_state0, 0.6)
    model.add_transition(coding_state2, donor0_states[0], 0.4)
    model.add_transition(donor0_states[-1], post, 1)
    model.add_transition(post, post, 0.9)
    model.add_transition(post, model.end, 0.1)

    model.bake()
    test_model(model)
    model.fit(converted_total, transition_pseudocount=1,
              emission_pseudocount=1, verbose=True)
    test_model(model)

    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(model.to_json())
def crop_status_hmm_model(nn_pobability_matrix, timeseries_steps,
                          n_observed_classes):
    # Class indices: 0 'emergence', 1 'growth', 2 'flowers',
    #                3 'senescing', 4 'senesced', 5 'no_crop'
    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=0,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=1,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=2,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=3,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=4,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d5 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=5,
        n_samples=timeseries_steps, n_classes=n_observed_classes)

    s0_emerge = State(d0, name='emergence')
    s1_growth = State(d1, name='growth')
    s2_fls = State(d2, name='flowers')
    s3_sencing = State(d3, name='senescing')
    s4_senced = State(d4, name='senesced')
    s5_none = State(d5, name='no_crop')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    states = [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none]
    for s in states:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    # Transition rows roughly encode phenological ordering (emergence ->
    # growth -> flowers -> senescing -> senesced); rows are normalized on bake.
    model.add_transitions(s0_emerge, states, [90., 5., 0., 0., 0., 5.])
    model.add_transitions(s1_growth, states, [0., 90., 2.5, 2.5, 0., 5.])
    model.add_transitions(s2_fls, states, [0., 0., 90., 5., 0., 5.])
    model.add_transitions(s3_sencing, states, [0., 0., 0., 90., 5., 5.])
    model.add_transitions(s4_senced, states, [0., 0., 0., 0., 90., 10.])
    model.add_transitions(s5_none, states, [10., 0., 0., 0., 0., 90.])

    model.bake(verbose=False)
    return model
basic_model = HiddenMarkovModel(name="base-hmm-tagger")

states = {}
for tag in data.tagset:
    emission_prob = {}
    for word, number in emission_counts[tag].items():
        emission_prob[word] = number / tag_unigrams[tag]
    tag_distribution = DiscreteDistribution(emission_prob)
    state = State(tag_distribution, name=tag)
    states[tag] = state
    basic_model.add_state(state)

for tag in data.tagset:
    state = states[tag]
    # P(t|start) = C(start, t) / C(start)
    start_probability = tag_starts[tag] / sum(tag_starts.values())
    basic_model.add_transition(basic_model.start, state, start_probability)
    # P(end|t) = C(t, end) / C(t)
    end_probability = tag_ends[tag] / tag_unigrams[tag]
    basic_model.add_transition(state, basic_model.end, end_probability)

for tag1 in data.tagset:
    state_1 = states[tag1]
    for tag2 in data.tagset:
        # P(t2|t1) = C(t1, t2) / C(t1)
        state_2 = states[tag2]
        basic_model.add_transition(
            state_1, state_2, tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])
def dominant_cover_hmm_model(nn_pobability_matrix, timeseries_steps,
                             n_observed_classes):
    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=0,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=1,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=2,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=3,
        n_samples=timeseries_steps, n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix, i=4,
        n_samples=timeseries_steps, n_classes=n_observed_classes)

    s0_veg = State(d0, name='vegetation')
    s1_residue = State(d1, name='residue')
    s2_soil = State(d2, name='soil')
    s3_snow = State(d3, name='snow')
    s4_water = State(d4, name='water')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    states = [s0_veg, s1_residue, s2_soil, s3_snow, s4_water]
    for s in states:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    model.add_transitions(s0_veg, states, [95., 1.0, 1.0, 1.0, 1.0])
    model.add_transitions(s1_residue, states, [1.0, 95., 1.0, 1.0, 1.0])
    model.add_transitions(s2_soil, states, [1.0, 1.0, 95., 1.0, 1.0])
    model.add_transitions(s3_snow, states, [1.0, 1.0, 1.0, 95., 1.0])
    model.add_transitions(s4_water, states, [1.0, 1.0, 1.0, 1.0, 95.])

    model.bake(verbose=False)
    return model
from model_maker_utils import add_sequence
from model_maker_utils import equal_distribution
from matrix_from_aln import matrix_from_exa

matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

intron = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p), name='in')
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

model.add_transition(model.start, intron, 1)
model.add_transition(intron, intron, 0.9)
model.add_transition(intron, acceptor0_states[0], 0.1)
model.add_transition(acceptor0_states[-1], post, 1)
model.add_transition(post, post, 0.5)
model.add_transition(post, model.end, 0.5)

model.bake()

test_l = 'GTAACACTGAATACTCAGGAACAATTAATGGATGGTAACATATGAGGAATATCTAGGAGGCACACCCTCTCTGGCATCTATGATGGGCCAAAAACCCGCATTCGCTTGGCCACAGTATGTGAAATATAACCCAGCTTAGACACAGGGTGCGGCAGCTGTCATGTTTCTCTGTGTGTGCCGAGTGTCATGTCTGCACCGTACAGGGATAGCTGAGTCTTCATCCTCCTCAGCTCCTATCTGTCCAGTGCAATGAACAGCAGCTGCTCTCTTCCTCTCTGGTTCCCATGGCAGCCATGCTCTGTTGCAGAGAGAACAGGATTGCATGTTCCCTCTTAATGGGAACGTCCATTTTGCTTTCTGGGACCACTCTCTTAATGCCGCCTGTCAAAACCAGCTAGGACTCCCTGGGGTCCAATCCCTCTGTGTTTAATCTTCTGTCATCTCTGTCCCACCTGGCTCATCAGGGAGATGCAGAAGGCTGAAGAAAAGGAAGTCCCTGAGGACTCACTGGAGGAATGTGCCATCACTTGTTCAAATAGCCATGGCCCTTATGACTCCAACCATGACTCCAACC'
converted = converter_to(test_l.lower().replace(' ', '').replace('p', ''))
#logp, path = model.viterbi(converted)
        oks += 1
    else:
        not_ok += 1
print(oks / (oks + not_ok))

back = State(DiscreteDistribution(equal_distribution), name='back')
back2 = State(DiscreteDistribution(equal_distribution), name='back2')

matrixZE = numpy.array(matrix_from_exa('../data extractors/starts.exa'))
start_states_data = classify(matrixZE, 2)
start_states = sequence_state_factory(start_states_data, 'start zone')

model = HiddenMarkovModel()
model.add_state(back)
model.add_state(back2)
add_sequence(model, start_states)

model.add_transition(model.start, back, 1)
model.add_transition(back, back, 0.55)
model.add_transition(back, start_states[0], 0.45)
model.add_transition(start_states[-1], back2, 1)
model.add_transition(back2, back2, 0.5)

model.bake()


def train_and_test():
    test(model)
tag_starts = starting_counts(data.training_set.Y)
# Calculate the count of each tag ending a sequence
tag_ends = ending_counts(data.training_set.Y)

basic_model = HiddenMarkovModel(name="base-hmm-tagger")

# Create states with emission probability distributions P(word | tag)
# and add to the model
tag_states = {}
for tag in data.training_set.tagset:
    tag_emissions = DiscreteDistribution({
        word: emission_counts[tag][word] / tag_unigrams[tag]
        for word in emission_counts[tag]
    })
    tag_states[tag] = State(tag_emissions, name=tag)
    basic_model.add_state(tag_states[tag])

# Add edges between states for the observed transition frequencies
# P(tag_i | tag_i-1)
for tag in data.training_set.tagset:
    # P(t|start) = C(start, t) / C(start), where C(start) is the
    # number of training sequences
    basic_model.add_transition(basic_model.start, tag_states[tag],
                               tag_starts[tag] / sum(tag_starts.values()))
    for tag1 in data.training_set.tagset:
        basic_model.add_transition(
            tag_states[tag], tag_states[tag1],
            tag_bigrams[(tag, tag1)] / tag_unigrams[tag])
    basic_model.add_transition(tag_states[tag], basic_model.end,
                               tag_ends[tag] / tag_unigrams[tag])

# finalize the model
basic_model.bake()
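# A hedged decoding sketch for the baked tagger: Viterbi over one sentence.
# The example words are illustrative and must appear in the training
# vocabulary, otherwise every state assigns them zero emission probability
# and decoding fails.
logp, path = basic_model.viterbi(['the', 'dog', 'barks'])
print([state.name for _, state in path[1:-1]])  # strip the start/end states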
acceptor0_data = classify(matrixAcceptor0, 2)
no_coding_dist = calculator.intron_calculator('cuts_intron.txt').p

donor_states = sequence_state_factory(donor0_data, 'donor0')
acceptor_states = sequence_state_factory(acceptor0_data, 'acceptor0')
intron_spacer_states = spacer_states_maker(10, no_coding_dist, 'intron spacer')

utr_model = HiddenMarkovModel('utr_model')

# States
exon_state = State(DiscreteDistribution(
    calculator.utr_exon_5('mcutsa.txt').p), name='utr exon')
intron_state = State(DiscreteDistribution(no_coding_dist), name='utr intron')

utr_model.add_model(promoter_model)
utr_model.add_state(exon_state)
utr_model.add_state(intron_state)
add_sequence(utr_model, donor_states)
add_sequence(utr_model, acceptor_states)
add_sequence(utr_model, intron_spacer_states)

utr_model.add_transition(utr_model.start, get_state(promoter_model, 'back'), 1)
utr_model.add_transition(get_state(promoter_model, 'inr7'), exon_state, 1)
utr_model.add_transition(get_state(promoter_model, 'no inr7'), exon_state, 1)
utr_model.add_transition(exon_state, exon_state, 0.7)
utr_model.add_transition(exon_state, donor_states[0], 0.2)
utr_model.add_transition(exon_state, utr_model.end, 0.1)
utr_model.add_transition(donor_states[-1], intron_state, 1)