def test_hmm(tag, pwm):
    """
    Build an HMM from a PWM, writing a logo image and a state graph on the way.

    tag -- label used as the prefix of the output names and of every log
        message.
    pwm -- a pair that unpacks into (frequencies, gaps); both halves are
        handed unchanged to L.pssm_as_image and build_hmm_model.

    Side effects: saves the logo to '<tag>-logo.png' and graphs the model
    under the name '<tag>-states' via hmm.graph_as_svg.

    Returns the model produced by build_hmm_model.
    """
    frequencies, gap_probs = pwm

    # Render the PWM as a logo image and save it next to the other outputs.
    image = L.pssm_as_image(frequencies, size=None, transparencies=gap_probs)
    logo_path = '%s-logo.png' % tag
    image.save(logo_path)
    logging.info('%s: Created logo: %s', tag, logo_path)

    # NOTE(review): the third argument is presumably the binding probability
    # (other callers in this file pass a value named p_binding) - confirm.
    hmm_model = build_hmm_model(frequencies, gap_probs, .1)
    logging.debug('%s: Created model', tag)

    graph_name = '%s-states' % tag
    hmm.graph_as_svg(hmm_model, graph_name, neato_properties={'-Elen': 3.})
    logging.debug('%s: Graphed model', tag)

    return hmm_model
def test_hmm(tag, pwm):
    """
    Create the logo, the HMM and the state graph for one tagged PWM.

    tag -- string-formatted into the output names ('<tag>-logo.png' and
        '<tag>-states') and into each log message.
    pwm -- unpacked as (frequencies, gaps).

    Returns the model built by build_hmm_model(frequencies, gaps, .1).
    """
    pssm_freqs, pssm_gaps = pwm

    # Logo first: draw it, then save it under a tag-derived file name.
    logo_image = L.pssm_as_image(
        pssm_freqs,
        size=None,
        transparencies=pssm_gaps,
    )
    logo_file = '%s-logo.png' % tag
    logo_image.save(logo_file)
    logging.info('%s: Created logo: %s', tag, logo_file)

    # Then the model itself.
    built_model = build_hmm_model(pssm_freqs, pssm_gaps, .1)
    logging.debug('%s: Created model', tag)

    # Finally a picture of the model's states.
    hmm.graph_as_svg(
        built_model,
        '%s-states' % tag,
        neato_properties={'-Elen': 3.},
    )
    logging.debug('%s: Graphed model', tag)

    return built_model
def run_pwm_viterbi(tag, freqs, gaps, positive_seqs, negative_seqs):
    """
    Run the PWM using Viterbi algorithm to classify sequences.

    A logo for the PWM is saved to '<tag>-logo.png'. Then, for every value
    in the module-level p_binding_params, a model is built, graphed under
    the name '<tag>-states' (the same name on every iteration), and run over
    both sequence collections with run_on_seqs.

    positive_seqs / negative_seqs -- sized collections (len() is taken of
        each) passed as-is to run_on_seqs.

    Returns a list holding one roc.RocCalculator per p_binding value, where
    a sequence counts as a hit when run_on_seqs reports it as having a site.
    """
    logging.info('Running PWM: %s', tag)

    # Write the logo once; it does not depend on p_binding.
    image = L.pssm_as_image(freqs, size=None, transparencies=gaps)
    logo_path = '%s-logo.png' % tag
    image.save(logo_path)
    logging.info('%s: Created logo: %s', tag, logo_path)

    results = []
    for p_binding in p_binding_params:
        # Build and graph the model for this binding probability.
        hmm_model = build_hmm_model(freqs, gaps, p_binding)
        hmm.graph_as_svg(
            hmm_model,
            '%s-states' % tag,
            neato_properties={'-Elen': 1.4},
        )
        logging.debug('%s: Graphed model', tag)

        # Positive set.
        pos_sites, pos_non_sites, pos_hits = run_on_seqs(hmm_model, positive_seqs)
        logging.debug(
            '%s: p(binding)=%.1e: Positive sequences: Over all sequences: found %4d positive sites and %4d negative sites in %4d/%4d sequences',
            tag,
            p_binding,
            pos_sites,
            pos_non_sites,
            pos_hits,
            len(positive_seqs),
        )

        # Negative set.
        neg_sites, neg_non_sites, neg_hits = run_on_seqs(hmm_model, negative_seqs)
        logging.debug(
            '%s: p(binding)=%.1e: Negative sequences: Over all sequences: found %4d positive sites and %4d negative sites in %4d/%4d sequences',
            tag,
            p_binding,
            neg_sites,
            neg_non_sites,
            neg_hits,
            len(negative_seqs),
        )

        # Sequence-level confusion matrix: a sequence is "called" when at
        # least one site was reported in it.
        misses = len(positive_seqs) - pos_hits
        correct_rejections = len(negative_seqs) - neg_hits
        point = roc.RocCalculator(
            tp=pos_hits,
            fp=neg_hits,
            tn=correct_rejections,
            fn=misses,
        )
        logging.info(
            '%s: p(binding)=%.1e; Specificity=%.3f; Sensitivity=%.3f',
            tag,
            p_binding,
            point.specificity(),
            point.sensitivity(),
        )
        results.append(point)

    return results
def run_pwm_forward_backward(tag, freqs, gaps, positive_seqs, negative_seqs):
    """
    Run the PWM using forward-backward.

    Saves a logo to '<tag>-logo.png', builds a single model with
    build_hmm_model(freqs, gaps, .001), graphs it under '<tag>-states', and
    scores the values of both sequence mappings with
    test_hmm_forward_backward.

    positive_seqs / negative_seqs -- objects exposing .values(); only the
        values are scored.

    Returns whatever roc.picked_rocs_from_thresholds yields for the positive
    and negative scores.
    """
    logging.info('Running PWM: %s', tag)

    # Logo for the PWM.
    image = L.pssm_as_image(freqs, size=None, transparencies=gaps)
    logo_path = '%s-logo.png' % tag
    image.save(logo_path)
    logging.info('%s: Created logo: %s', tag, logo_path)

    # Model plus a graph of its states.
    hmm_model = build_hmm_model(freqs, gaps, .001)
    graph_name = '%s-states' % tag
    hmm.graph_as_svg(hmm_model, graph_name, neato_properties={'-Elen': 1.4})
    logging.debug('%s: Graphed model', tag)

    # Score the positives first, then the negatives.
    pos_scores = test_hmm_forward_backward(hmm_model, positive_seqs.values())
    neg_scores = test_hmm_forward_backward(hmm_model, negative_seqs.values())

    return roc.picked_rocs_from_thresholds(pos_scores, neg_scores)
def run_pwm_forward_backward(tag, freqs, gaps, positive_seqs, negative_seqs):
    """
    Run the PWM using forward-backward.

    tag -- prefix for the logo file ('<tag>-logo.png'), the state graph
        ('<tag>-states') and the log messages.
    freqs, gaps -- forwarded to L.pssm_as_image and build_hmm_model.
    positive_seqs, negative_seqs -- mappings; their .values() are scored.

    Returns the result of roc.picked_rocs_from_thresholds applied to the
    positive scores and the negative scores, in that order.
    """
    logging.info('Running PWM: %s', tag)

    logo_image = L.pssm_as_image(freqs, size=None, transparencies=gaps)
    logo_file = '%s-logo.png' % tag
    logo_image.save(logo_file)
    logging.info('%s: Created logo: %s', tag, logo_file)

    # build model
    built_model = build_hmm_model(freqs, gaps, .001)
    hmm.graph_as_svg(
        built_model,
        '%s-states' % tag,
        neato_properties={'-Elen': 1.4},
    )
    logging.debug('%s: Graphed model', tag)

    # Score both collections in a fixed order: positives, then negatives.
    score_sets = [
        test_hmm_forward_backward(built_model, seq_map.values())
        for seq_map in (positive_seqs, negative_seqs)
    ]
    return roc.picked_rocs_from_thresholds(*score_sets)
def run_pwm_viterbi(tag, freqs, gaps, positive_seqs, negative_seqs):
    """
    Run the PWM using Viterbi algorithm to classify sequences.

    tag -- prefix for the logo file ('<tag>-logo.png'), the state graph
        ('<tag>-states') and the log messages.
    freqs, gaps -- forwarded to L.pssm_as_image and build_hmm_model.
    positive_seqs, negative_seqs -- sized collections given to run_on_seqs.

    One model is built per entry of the module-level p_binding_params, and
    the per-sequence hit counts from run_on_seqs are turned into a
    roc.RocCalculator. The list of those calculators, in p_binding_params
    order, is returned.
    """
    logging.info('Running PWM: %s', tag)

    logo_image = L.pssm_as_image(freqs, size=None, transparencies=gaps)
    logo_file = '%s-logo.png' % tag
    logo_image.save(logo_file)
    logging.info('%s: Created logo: %s', tag, logo_file)

    calculators = []
    for p_binding in p_binding_params:
        # build model
        built_model = build_hmm_model(freqs, gaps, p_binding)
        graph_name = '%s-states' % tag
        hmm.graph_as_svg(built_model, graph_name, neato_properties={'-Elen': 1.4})
        logging.debug('%s: Graphed model', tag)

        # run_on_seqs hands back (positive sites, negative sites,
        # number of sequences with a site) for the collection it is given.
        positive_result = run_on_seqs(built_model, positive_seqs)
        pos_total_pos, pos_total_neg, pos_with_site = positive_result
        logging.debug(
            '%s: p(binding)=%.1e: Positive sequences: Over all sequences: found %4d positive sites and %4d negative sites in %4d/%4d sequences',
            tag, p_binding,
            pos_total_pos, pos_total_neg,
            pos_with_site, len(positive_seqs))

        negative_result = run_on_seqs(built_model, negative_seqs)
        neg_total_pos, neg_total_neg, neg_with_site = negative_result
        logging.debug(
            '%s: p(binding)=%.1e: Negative sequences: Over all sequences: found %4d positive sites and %4d negative sites in %4d/%4d sequences',
            tag, p_binding,
            neg_total_pos, neg_total_neg,
            neg_with_site, len(negative_seqs))

        # Confusion counts at the sequence level.
        counts = {
            'tp': pos_with_site,
            'fp': neg_with_site,
            'fn': len(positive_seqs) - pos_with_site,
            'tn': len(negative_seqs) - neg_with_site,
        }
        calculator = roc.RocCalculator(**counts)
        logging.info(
            '%s: p(binding)=%.1e; Specificity=%.3f; Sensitivity=%.3f',
            tag, p_binding,
            calculator.specificity(), calculator.sensitivity())
        calculators.append(calculator)

    return calculators
TRANSITION:4;5;1 TRANSITION:5;6;1 TRANSITION:6;7;1 TRANSITION:7;8;1 # EMISSIONS:i;b1,b2,b3,b4 - where p(state i emits base i) = bi EMISSIONS:0;.5,.5,0,0 EMISSIONS:1;0,0,.5,.5 EMISSIONS:2;1,0,0,0 EMISSIONS:3;0,0,0,1 EMISSIONS:4;0,0,1,0 EMISSIONS:5;0,1,0,0 EMISSIONS:6;1,0,0,0 EMISSIONS:7;1,0,0,0 EMISSIONS:8;0,0,1,0 """ model = build_model( to_parse.split('\n') ) m = hmm.model_states_2_model(model) if False: hmm.graph_as_svg( m, 'test', 'model_io_test', graphing_keywords = { 'show_dists' : lambda l: True, 'state_labels' : None }, neato_properties = { '-Elen' : '2' } ) with open('model.mdl', 'w') as f: write_model(model, f)
TRANSITION:3;5;.9 TRANSITION:4;5;1 TRANSITION:5;6;1 TRANSITION:6;7;1 TRANSITION:7;8;1 # EMISSIONS:i;b1,b2,b3,b4 - where p(state i emits base i) = bi EMISSIONS:0;.5,.5,0,0 EMISSIONS:1;0,0,.5,.5 EMISSIONS:2;1,0,0,0 EMISSIONS:3;0,0,0,1 EMISSIONS:4;0,0,1,0 EMISSIONS:5;0,1,0,0 EMISSIONS:6;1,0,0,0 EMISSIONS:7;1,0,0,0 EMISSIONS:8;0,0,1,0 """ model = build_model(to_parse.split('\n')) m = hmm.model_states_2_model(model) if False: hmm.graph_as_svg(m, 'test', 'model_io_test', graphing_keywords={ 'show_dists': lambda l: True, 'state_labels': None }, neato_properties={'-Elen': '2'}) with open('model.mdl', 'w') as f: write_model(model, f)
[ 1., 0., 0., 0. ], [ 0., 1., 0., 0. ], [ 0., 0., 0., 1. ], ] K = 7 test_seq = 'acgtgat' # matches dist above test_seq_order_0 = hmm.pssm.seq_to_numpy(test_seq) # for various different orders for order in [0, 1, 2]: # build a model of distribution above traits = GappedPssmTraits(K, p_binding_site, order, num_background_states, create_background_model, emission_dists=emission_dists) model = traits.new_model() converted = hmm.model_states_2_model(model) B = converted.B hmm.graph_as_svg(converted, 'gapped_pssm') # check the reverse complement states are correct for n in xrange(model.N): for o in xrange(model.M): rev_comp_state, rev_comp_obs = traits.get_non_reverse_complement(n,o) assert check_is_close_2(B[rev_comp_state,rev_comp_obs], B[n,o]), ('%d,%d %d,%d: %f %f' % (rev_comp_state,rev_comp_obs,n,o,B[rev_comp_state,rev_comp_obs],B[n,o])) # check viterbi gives correct result test_seq_order_n = converted.converter.to_order_n(test_seq_order_0) LL, states = converted.viterbi(test_seq_order_n) for i, state in enumerate(states): assert (state-num_background_states)/2 == i
hmm.dirichlet_draw(numpy.ones(builder.M) * .1) for k in xrange(builder.K) ]) emissions[builder.gap_index] = hmm.dirichlet_draw( numpy.ones(builder.M) * .3) model_by_states, in_states, out_states = builder.create( p_gap=.6, emissions=emissions) # create a background model and add the single gapped pssm to it complete_model = add_to_simple_background_model(model_by_states, in_states, out_states, p_binding_site=.01) # convert to other type of model model = hmm.as_model(complete_model) # write as a graph hmm.graph_as_svg(model, 'single-gapped-hmm', graphing_keywords={'include_emissions': False}, neato_properties={'-Elen': 2}) # get the emissions and gap probabilities and write a logo emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities( model, offset=1) assert (emissions_copy - emissions).sum() < 1e-10 import hmm.pssm.logo as logo image = logo.pssm_as_image(emissions, transparencies=gap_probs) image.save("single-gapped-pssm-logo.png", "PNG")
sp1_pssms = all_sp1_pssms() for tag in methods: score_pickle_file = '%s-scores.pickle' % tag try: positive_scores, negative_scores = cPickle.load(open(score_pickle_file)) logging.info('%s: Unpickled ROCs from %s.', tag, score_pickle_file) except: logging.info('%s: Could not ROCs from unpickle %s, calculating from scratch.', tag, score_pickle_file) freqs, gaps = sp1_pssms[tag] freqs = (freqs.T / freqs.sum(axis=1)).T logo = L.pssm_as_image(freqs, size=None, transparencies=gaps) logo_filename = '%s-logo.png' % tag logo.save(logo_filename) logging.info('%s: Created logo: %s', tag, logo_filename) model = build_hmm_model(freqs, gaps, .001) hmm.graph_as_svg(model, '%s-states' % tag, neato_properties={'-Elen':1.4}) logging.debug('%s: Graphed model', tag) positive_scores = test_hmm_forward_backward(model, sequences['positive'].values()) negative_scores = dict( (bg, test_hmm_forward_backward(model, sequences[bg].values())) for bg in backgrounds ) cPickle.dump((positive_scores, negative_scores), open(score_pickle_file, 'wb')) scores[(tag,)] = positive_scores for bg, score in negative_scores.iteritems(): scores[(tag, bg)] = score # # Generate ROCs #
positive_scores, negative_scores = cPickle.load( open(score_pickle_file)) logging.info('%s: Unpickled ROCs from %s.', tag, score_pickle_file) except: logging.info( '%s: Could not ROCs from unpickle %s, calculating from scratch.', tag, score_pickle_file) freqs, gaps = sp1_pssms[tag] freqs = (freqs.T / freqs.sum(axis=1)).T logo = L.pssm_as_image(freqs, size=None, transparencies=gaps) logo_filename = '%s-logo.png' % tag logo.save(logo_filename) logging.info('%s: Created logo: %s', tag, logo_filename) model = build_hmm_model(freqs, gaps, .001) hmm.graph_as_svg(model, '%s-states' % tag, neato_properties={'-Elen': 1.4}) logging.debug('%s: Graphed model', tag) positive_scores = test_hmm_forward_backward( model, sequences['positive'].values()) negative_scores = dict( (bg, test_hmm_forward_backward(model, sequences[bg].values())) for bg in backgrounds) cPickle.dump((positive_scores, negative_scores), open(score_pickle_file, 'wb')) scores[(tag, )] = positive_scores for bg, score in negative_scores.iteritems(): scores[(tag, bg)] = score # # Generate ROCs
emissions[builder.gap_index] = hmm.dirichlet_draw(numpy.ones(builder.M) * .3) model_by_states, in_states, out_states = builder.create( p_gap=.6, emissions=emissions ) # create a background model and add the single gapped pssm to it complete_model = add_to_simple_background_model( model_by_states, in_states, out_states, p_binding_site=.01) # convert to other type of model model = hmm.as_model(complete_model) # write as a graph hmm.graph_as_svg( model, 'single-gapped-hmm', graphing_keywords={'include_emissions':False}, neato_properties={'-Elen':2} ) # get the emissions and gap probabilities and write a logo emissions_copy, gap_probs = builder.get_emissions_and_gap_probabilities(model, offset=1) assert (emissions_copy - emissions).sum() < 1e-10 import hmm.pssm.logo as logo image = logo.pssm_as_image(emissions, transparencies=gap_probs) image.save("single-gapped-pssm-logo.png", "PNG")