#
# Read the sequences and build the data set
#
# The FASTA argument is encoded with stdin's encoding, falling back to
# ASCII when stdin does not report one.
stdin_encoding = sys.stdin.encoding or 'ascii'
num_bases, seqs, ids, index = stempy.read_sequences(
    fasta.encode(stdin_encoding),
    options,
)

# Zero-order frequencies are built from the first four occurrence counts;
# pseudo-counts taken from options.back_dist_prior are then added.
occs = stempy.occurrences_from_index(index)
freqs = stempy.ZeroOrderFrequencies(list(occs[:4]))
freqs_with_pseudo_counts = freqs.add_pseudo_counts(options.back_dist_prior)
data = stempy.Data(index, max_W=W)

#
# Initialise the background
#
# The stempy factory functions are looked up by name, with the Markov
# model order appended as a suffix.
bg_order = options.bg_model_order
markov_model_create_fn = getattr(
    stempy, 'create_markov_model_order_from_index_%d' % bg_order)
bg_model_create_fn = getattr(
    stempy, 'create_bg_model_from_Markov_model_%d' % bg_order)

mm, _ = markov_model_create_fn(data.index, options.back_dist_prior)
lls = mm.calculate_likelihoods(data)

# Two background models are constructed: one from the per-base
# likelihoods and one directly from the Markov model.
base_LL_bg_model = stempy.create_bg_model_from_base_likelihoods(
    W, data, lls, freqs_with_pseudo_counts)
Markov_bg_model = bg_model_create_fn(W, data, mm, freqs_with_pseudo_counts)

#
# Create the model
#
# The Markov background is the one used; assign base_LL_bg_model to bg
# instead to use the base-likelihood background.
bg = Markov_bg_model
uniform_pssm = stempy.initialise_uniform_pssm(W, options.alphabet_size)
bs = stempy.PssmBindingSiteModel(uniform_pssm)
model = stempy.Model(data, bs, bg, _lambda=options.lambda_)

# Overwrite the uniform PSSM's log probabilities in place with log_pwm,
# then ask the binding-site model to recalculate.
model.bs.pssm.log_probs.values()[:] = log_pwm
model.bs.recalculate()
seq = "ACGTACACAC" data = create_data(seq) logging.info('Creating Markov model.') mm, freqs = stempy.create_markov_model_order_3(data, 1.) logging.info('Calculating likelihoods.') lls = mm.calculate_likelihoods(data) base_probs = map(exp, base_lls(lls[0])) logging.info(', '.join('%.5f' % p for p in base_probs)) assert feq(stempy.W_mer_log_likelihood(lls[0], 0, 1), (seq.count('A')+1.)/(len(seq)+4.)) assert feq(base_probs[1], (seq.count('AC')+1.)/(seq.count('A')+4.)) assert feq(base_probs[2], (seq.count('ACG')+1.)/(seq.count('AC')-1.+4.)) # -1. because last 'AC' has no following character assert feq(base_probs[3], (seq.count('ACGT')+1.)/(seq.count('ACG')+4.)) assert feq(base_probs[4], (seq.count('CGTA')+1.)/(seq.count('CGT')+4.)) # check the bg model from likelihoods bg_model = stempy.create_bg_model_from_base_likelihoods(4, data, lls, freqs) seq = "AAAC" data = create_data(seq) mm, freqs = stempy.create_markov_model_order_3(data, 1.) assert feq(freqs.freq(0), .375) assert feq(freqs.freq(1), .125) assert feq(freqs.freq(2), .125) assert feq(freqs.freq(3), .375) seq = "AAACNNNNNNNTCTCTATACGCAGTACGG" data = create_data(seq) mm, freqs = stempy.create_markov_model_order_3(data, 1.) lls = mm.calculate_likelihoods(data) print ', '.join(map(str, lls)) for i, (x, y) in enumerate(pairs(lls[0])):