def test_pickling(self):
    model_builder = hmm.pssm.ModelBuilder(3)
    model = model_builder.create_background_mosaic_model(3, .01, 1.0)
    model_builder.dump_background_mosaic_model(model, 'model.pickle')
    model_copy = model_builder.load_background_mosaic_model('model.pickle')
    converted_model = hmm.model_states_2_model(model)
    converted_model_copy = hmm.model_states_2_model(model_copy)
    # compare the arrays element-wise; `a.all() == b.all()` would only compare two booleans
    assert (converted_model.A == converted_model_copy.A).all()
    assert (converted_model.B == converted_model_copy.B).all()
    assert (converted_model.pi == converted_model_copy.pi).all()
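# Why the element-wise comparison above matters -- a minimal illustration, not part
# of the original test suite.  `a.all() == b.all()` only compares two booleans
# ("is every entry non-zero?"), so it passes even for completely different matrices:
import numpy

a = numpy.array([[.1, .9], [.5, .5]])
b = numpy.array([[.7, .3], [.2, .8]])
assert a.all() == b.all()       # passes although a and b differ everywhere
assert not (a == b).all()       # the element-wise check catches the difference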
def test_order_0_states(self):
    model_builder = hmm.pssm.ModelBuilder(2)
    model = model_builder.create_background_mosaic_model(3, .01, 1.0)
    positive = model_builder.add_order_0_parameterised_state(
        model, emission_dist=[.1, .2, .3, .4])
    negative = model_builder.add_order_0_rev_comp_state(model, positive)
    # emission rows for the two added states (skip the 3 background states);
    # the reverse complement state's emissions should mirror the positive state's
    B = hmm.model_states_2_model(model).B[3:]
    assert infpy.check_is_close_2(B[0, 0], B[1, 3])
    assert infpy.check_is_close_2(B[0, 1], B[1, 2])
    assert infpy.check_is_close_2(B[0, 2], B[1, 1])
    assert infpy.check_is_close_2(B[0, 3], B[1, 0])
def test_traits(self):
    from hmm.pssm import create_background_model, PssmTraits, seq_to_numpy
    from infpy import check_is_close_2

    p_binding_site = .01
    num_background_states = 2
    emission_dists = [
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
    ]
    K = len(emission_dists)
    test_seq = 'accagtttgcact'  # matches dist above
    test_seq_order_0 = seq_to_numpy(test_seq)

    # for various different orders
    for order in [1, 2]:
        # build a model of distribution above
        traits = PssmTraits(K, p_binding_site, order, num_background_states,
                            create_background_model, emission_dists=emission_dists)
        model = traits.new_model()
        converted = hmm.model_states_2_model(model)
        B = converted.B

        # check the reverse complement states are correct
        for n in xrange(model.N):
            for o in xrange(model.M):
                rev_comp_state, rev_comp_obs = traits.get_non_reverse_complement(n, o)
                assert check_is_close_2(B[rev_comp_state, rev_comp_obs], B[n, o]), (
                    '%d,%d %d,%d: %f %f' % (rev_comp_state, rev_comp_obs, n, o,
                                            B[rev_comp_state, rev_comp_obs], B[n, o]))

        # check viterbi gives correct result
        test_seq_order_n = converted.converter.to_order_n(test_seq_order_0)
        LL, states = converted.viterbi(test_seq_order_n)
        for i, state in enumerate(states):
            assert state == num_background_states + i
def learn_model(self):
    """
    Creates several models, trains them using Baum-Welch and returns the best.
    """
    self.logger.info('Learning initial model')

    # work out the parameters for Baum-Welch and run it
    self.known_bases = sum(self.converter.num_known_bases_order_n(seq)
                           for seq in self.order_n_seqs)
    self.default_tolerance = 1e-6 * self.known_bases

    # create models
    models = [hmm.model_states_2_model(self.pssm_traits.new_model())
              for i in xrange(self.num_models)]
    LLs = [(model, self.train_model(model)[0]) for model in models]

    # which had the best log likelihood?
    best = max(LLs, key=lambda x: x[1])
    # print best, LLs
    return best
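# A hedged sketch of what train_model (called above) might look like -- the real
# implementation is not shown in this fragment.  It assumes baum_welch accepts the
# same keyword arguments used elsewhere in this code base and that the first element
# of the returned tuple is the final log likelihood, as learn_model expects.
def train_model(self, model, tolerance=None):
    """Run Baum-Welch on one model and return (LL, model)."""
    if tolerance is None:
        tolerance = self.default_tolerance
    model.baum_welch(self.order_n_seqs, tolerance=tolerance)
    # score the training sequences under the trained model
    LL = sum(model.forward(seq)[0] for seq in self.order_n_seqs)
    return LL, model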
except:
    print 'Could not set process priority'


def bw_callback(LL):
    pass  # print 'LL: %.3f' % LL


order = 0
num_mosaics = 1
cache = BackgroundModelCache()
print ' fragment order # mosaics LL/base'
for fragment in all_fragments:
    seqs = seqs_for_fragment(fragment)
    model_builder = hmm.pssm.ModelBuilder(order)
    training_sequences = [model_builder.converter.to_order_n(s) for s in seqs]
    known_bases = sum(model_builder.converter.num_known_bases_order_n(s)
                      for s in training_sequences)
    print '%10s %10d %10d' % (fragment, order, num_mosaics),
    try:
        model = cache.get_model(order, num_mosaics, fragment)
    except:
        # no cached model for this fragment: train a fresh background mosaic model
        model_by_states = model_builder.create_background_mosaic_model(num_mosaics, 0.01, 100.0)
        model = hmm.model_states_2_model(model_by_states)
        tolerance = 1e-4 * known_bases
        model.baum_welch(training_sequences, tolerance=tolerance, callback=bw_callback)
        cache.save_model(model, order, num_mosaics, fragment)
    # report the per-base log likelihood under the trained model
    LL = sum(model.forward(s)[0] for s in training_sequences)
    print LL / known_bases
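# A minimal sketch of what BackgroundModelCache (used above) might do -- the real
# class is defined elsewhere.  It assumes the converted models can be pickled
# directly (as the pickling test above suggests) and keys them on disk by
# (order, num_mosaics, fragment); the directory and filename scheme are made up.
import cPickle
import os


class BackgroundModelCache(object):
    def __init__(self, directory='background-model-cache'):
        self.directory = directory
        if not os.path.exists(directory):
            os.makedirs(directory)

    def _filename(self, order, num_mosaics, fragment):
        return os.path.join(
            self.directory,
            '%s-order-%d-mosaics-%d.pickle' % (fragment, order, num_mosaics))

    def get_model(self, order, num_mosaics, fragment):
        # raises IOError when no cached model exists, matching the try/except usage above
        with open(self._filename(order, num_mosaics, fragment), 'rb') as f:
            return cPickle.load(f)

    def save_model(self, model, order, num_mosaics, fragment):
        with open(self._filename(order, num_mosaics, fragment), 'wb') as f:
            cPickle.dump(model, f)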
TRANSITION:3;5;.9
TRANSITION:4;5;1
TRANSITION:5;6;1
TRANSITION:6;7;1
TRANSITION:7;8;1
# EMISSIONS:i;b1,b2,b3,b4 - where p(state i emits base j) = bj
EMISSIONS:0;.5,.5,0,0
EMISSIONS:1;0,0,.5,.5
EMISSIONS:2;1,0,0,0
EMISSIONS:3;0,0,0,1
EMISSIONS:4;0,0,1,0
EMISSIONS:5;0,1,0,0
EMISSIONS:6;1,0,0,0
EMISSIONS:7;1,0,0,0
EMISSIONS:8;0,0,1,0
"""
model = build_model(to_parse.split('\n'))
m = hmm.model_states_2_model(model)
if False:
    hmm.graph_as_svg(
        m, 'test', 'model_io_test',
        graphing_keywords={'show_dists': lambda l: True, 'state_labels': None},
        neato_properties={'-Elen': '2'})
with open('model.mdl', 'w') as f:
    write_model(model, f)
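# A hedged follow-up, not part of the original test: assuming write_model and
# build_model are symmetric, the file just written can be parsed back and should
# convert to an equivalent model.  This reuses the list-of-lines input style above.
with open('model.mdl') as f:
    reparsed = build_model(f.read().split('\n'))
m2 = hmm.model_states_2_model(reparsed)
assert (m.A == m2.A).all()
assert (m.B == m2.B).all()
assert (m.pi == m2.pi).all()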