def calculate_final_probability(ancestor, descendant, final_probs, prob_dict,
                                end_dict):
    """Write expected gain/loss figures for every 8-mer on one branch.

    The count table is read from the file ``quick_out``.  Each row is split
    on whitespace; column 1 is used as the outer key, column 0 as the inner
    key, and the remaining columns are kept as a list of strings.  The first
    remaining column of the ``ancestor`` rows is converted with ``int`` and
    used as the substrate count of each 8-mer.

    For every 8-mer, each sequence returned by ``list_one_offers`` receives
    half of ``eightmer_prob(eightmer, one_offer, prob_dict, end_dict)``:
    the value is added to the 8-mer's running loss probability and recorded
    on the neighbour as a ``[probability, substrate_count]`` gain entry.

    Args:
        ancestor: key selecting the ancestral node in ``final_probs`` and in
            the ``quick_out`` table.
        descendant: key selecting the descendant node in the same tables.
        final_probs: nested dict indexed ``[ancestor][descendant][eightmer]``;
            missing 8-mers are initialised to ``[[], [0.0]]`` and the dict is
            updated in place.
        prob_dict: passed through unchanged to ``eightmer_prob``.
        end_dict: passed through unchanged to ``eightmer_prob``.

    Returns:
        None.  One tab-separated row per 8-mer is written to ``output_file``,
        a name this function does not define and resolves from the
        enclosing module.

    Raises:
        ZeroDivisionError: if the gain substrate counts of an 8-mer sum to
            zero (including the case of no gain entries at all).
    """
    # NOTE(review): the source text had lost its indentation; the nesting
    # below was reconstructed from the data flow -- confirm against history.
    three_number_dict = {}
    # The context manager closes the handle, which the bare open() leaked.
    with open('quick_out') as summary_file:
        for raw_line in summary_file:
            fields = raw_line.strip().split()
            three_number_dict.setdefault(fields[1], {})[fields[0]] = fields[2:]

    eightmer_list = custom.count_in_base('AAAAAAAA', 4, 'ACGTz')
    branch_probs = final_probs[ancestor][descendant]

    for eightmer in eightmer_list:
        branch_probs.setdefault(eightmer, [[], [0.0]])

    for eightmer in eightmer_list:
        # Looked up once per 8-mer instead of once per neighbour.
        substrates = int(three_number_dict[ancestor][eightmer][0])
        loss_entry = branch_probs[eightmer][1]
        for one_offer in list_one_offers(eightmer):
            change_prob = eightmer_prob(
                eightmer, one_offer, prob_dict, end_dict) / 2
            loss_entry[0] += change_prob
            branch_probs[one_offer][0].append([change_prob, substrates])
        loss_entry.append(substrates)

    for eightmer in eightmer_list:
        expected_gains, gain_substrates = 0, 0
        for gain_pair in branch_probs[eightmer][0]:
            expected_gains += gain_pair[0] * gain_pair[1]
            gain_substrates += gain_pair[1]
        avg_gain_prob = expected_gains / gain_substrates

        loss_pair = branch_probs[eightmer][1]
        expected_losses = loss_pair[0] * loss_pair[1]

        gain_columns = [str(avg_gain_prob), str(gain_substrates)]
        loss_columns = [str(value) for value in loss_pair]
        # Presumably the observed figures for this branch -- TODO confirm.
        descendant_columns = three_number_dict[descendant][eightmer][1:]

        # Concatenated exactly as before so an empty column list still
        # yields the same run of tab characters.
        output_file.write(
            eightmer + '\t' + '\t'.join(descendant_columns)
            + '\t' + str(expected_gains) + '\t' + str(expected_losses) + '\t'
            + '\t'.join(gain_columns) + '\t' + '\t'.join(loss_columns) + '\n')
def calculate_final_probability(ancestor, descendant, final_probs, prob_dict,
                                end_dict):
    """Accumulate and print expected gain/loss figures for every 8-mer.

    The count table is read from the file ``quick_out``.  Each row is split
    on whitespace; column 1 is used as the outer key, column 0 as the inner
    key, and the remaining columns are kept as a list of strings.  The first
    remaining column of the ``ancestor`` rows is converted with ``int`` and
    used as the substrate count of each 8-mer.

    For every 8-mer, each sequence returned by ``list_one_offers`` receives
    ``eightmer_prob(eightmer, one_offer, prob_dict, end_dict)``: the value
    is added to the 8-mer's running loss probability and recorded on the
    neighbour as a ``[probability, substrate_count]`` gain entry.

    Args:
        ancestor: key selecting the ancestral node in ``final_probs`` and in
            the ``quick_out`` table.
        descendant: key selecting the descendant node in the same tables.
        final_probs: nested dict indexed ``[ancestor][descendant][eightmer]``;
            missing 8-mers are initialised to ``[[], [0.0]]`` and the dict is
            updated in place.
        prob_dict: passed through unchanged to ``eightmer_prob``.
        end_dict: passed through unchanged to ``eightmer_prob``.

    Returns:
        The same ``final_probs`` object that was passed in, after updating.

    Raises:
        ZeroDivisionError: if the gain substrate counts of an 8-mer sum to
            zero (including the case of no gain entries at all).
    """
    # NOTE(review): the source text had lost its indentation; the nesting
    # below was reconstructed from the data flow -- confirm against history.
    three_number_dict = {}
    # The context manager closes the handle, which the bare open() leaked.
    with open('quick_out') as summary_file:
        for raw_line in summary_file:
            fields = raw_line.strip().split()
            three_number_dict.setdefault(fields[1], {})[fields[0]] = fields[2:]

    eightmer_list = custom.count_in_base('AAAAAAAA', 4, 'ACGTz')
    branch_probs = final_probs[ancestor][descendant]

    for eightmer in eightmer_list:
        branch_probs.setdefault(eightmer, [[], [0.0]])

    for eightmer in eightmer_list:
        # Looked up once per 8-mer instead of once per neighbour.
        substrates = int(three_number_dict[ancestor][eightmer][0])
        loss_entry = branch_probs[eightmer][1]
        for one_offer in list_one_offers(eightmer):
            change_prob = eightmer_prob(
                eightmer, one_offer, prob_dict, end_dict)
            loss_entry[0] += change_prob
            branch_probs[one_offer][0].append([change_prob, substrates])
        loss_entry.append(substrates)

    for eightmer in eightmer_list:
        expected_gains, gain_substrates = 0, 0
        for gain_pair in branch_probs[eightmer][0]:
            expected_gains += gain_pair[0] * gain_pair[1]
            gain_substrates += gain_pair[1]
        avg_gain_prob = expected_gains / gain_substrates
        gain_pair = [avg_gain_prob, gain_substrates]

        loss_pair = branch_probs[eightmer][1]
        expected_losses = loss_pair[0] * loss_pair[1]

        # Each report line is formatted into one string first: a
        # single-argument print(...) emits identical text under Python 2's
        # print statement and Python 3's print function.
        print(eightmer)
        print('actual gains, losses: %s'
              % (three_number_dict[descendant][eightmer][1:],))
        print('expected gains, losses: %s %s'
              % (expected_gains, expected_losses))
        print('gain prob and substrates, loss prob and substrates: %s %s'
              % (gain_pair, loss_pair))
    return final_probs
def end_probs(prob_dict):
    """Average single-letter change probabilities over dinucleotide contexts.

    For each ancestral dinucleotide, ``prob_dict[ancestor][descendant]`` is
    collected for every descendant dinucleotide that matches the ancestor in
    at least one position.  Each value is filed twice: under ``'left'``,
    keyed by the second letters of ancestor and descendant, and under
    ``'right'``, keyed by their first letters.  Non-empty buckets are summed
    per ancestor, and the result holds the mean of those sums over all
    ancestors.

    Args:
        prob_dict: nested mapping indexed as
            ``prob_dict[ancestor_dinucleotide][descendant_dinucleotide]``.

    Returns:
        A dict indexed ``[ancestor_letter][descendant_letter][side]`` over
        the letters ``'ACGT'`` with ``side`` in ``('left', 'right')``; every
        leaf is the mean described above.

    Raises:
        ZeroDivisionError: if some bucket never received a value.
    """
    # NOTE(review): the source text had lost its indentation; the per-ancestor
    # pooling is placed inside the ancestor loop because the scratch table is
    # rebuilt on every pass -- confirm against history.
    all_pairs = custom.count_in_base('AA', 4, 'ACGTz')
    letters = 'ACGT'

    # Skeleton table with an empty bucket for each (letter, letter, side).
    template = dict(
        (anc, dict((desc, {'left': [], 'right': []}) for desc in letters))
        for anc in letters)

    pooled = copy.deepcopy(template)
    for anc_pair in all_pairs:
        scratch = copy.deepcopy(template)
        for desc_pair in all_pairs:
            # Skip descendants that differ from the ancestor at both sites.
            if anc_pair[0] != desc_pair[0] and anc_pair[1] != desc_pair[1]:
                continue
            scratch[anc_pair[1]][desc_pair[1]]['left'].append(
                prob_dict[anc_pair][desc_pair])
            scratch[anc_pair[0]][desc_pair[0]]['right'].append(
                prob_dict[anc_pair][desc_pair])

        # Fold this ancestor's non-empty buckets into the running table.
        for anc_letter, by_desc in scratch.items():
            for desc_letter, by_side in by_desc.items():
                for side, values in by_side.items():
                    if values:
                        pooled[anc_letter][desc_letter][side].append(
                            sum(values))

    # Collapse each list of per-ancestor sums to its mean.
    for by_desc in pooled.values():
        for by_side in by_desc.values():
            for side in list(by_side):
                sums = by_side[side]
                by_side[side] = sum(sums) / len(sums)
    return pooled
import custom

# NOTE(review): the source text had lost its indentation; statement nesting
# below was reconstructed from the data flow -- confirm against history.

# Sequences enumerated by the project helper; iterated once below.
eightmer_list = custom.count_in_base('AAAAAAAA', 4, 'ACGTz')

# Tab-separated results table.  The context manager closes the handle, which
# the bare open() leaked.
with open('enrich_depletion_results_lumped_vert_seeds') as results_file:
    status_list = [line.strip().split('\t') for line in results_file]
status_list = status_list[1:]  # drop the first row (presumably a header)

# Maps each selected seed to the list of 8-mers sharing its 2-mer profile.
real_mirs = {}


def twomer_counts(eightmer):
    """Count the two-character windows of ``eightmer`` at offsets 0 to 6.

    Args:
        eightmer: sequence string; windows are taken by slicing, so a string
            shorter than eight characters yields shorter or empty windows
            instead of raising.

    Returns:
        A dict mapping each window to the number of times it occurs.
    """
    eightmer_dict = {}
    for letter_number in range(7):
        twomer = eightmer[letter_number:letter_number + 2]
        eightmer_dict[twomer] = eightmer_dict.get(twomer, 0) + 1
    return eightmer_dict


# Keep rows with more than four columns that mention one of these tags
# anywhere in the row; the key is column 1 without its first and last
# characters.
for line in status_list:
    joined_line = ''.join(line)
    if len(line) > 4 and ('hsa' in joined_line or 'ptr' in joined_line
                          or 'ggo' in joined_line or 'ppy' in joined_line):
        real_mirs[line[1][1:-1]] = []
print(len(real_mirs))

# A profile depends only on the 8-mer, so build each one once instead of once
# per seed.
eightmer_profiles = [(eightmer, twomer_counts(eightmer))
                     for eightmer in eightmer_list]

for real_mir in real_mirs:
    real_dict = twomer_counts(real_mir)
    for eightmer, test_dict in eightmer_profiles:
        # NOTE(review): the membership test uses eightmer[1:-1] although the
        # keys of real_mirs were already trimmed when stored; if the keys are
        # full 8-mers this never excludes anything -- confirm the intent.
        if test_dict == real_dict and eightmer[1:-1] not in real_mirs:
            real_mirs[real_mir].append(eightmer)
    # Formatted into one string so the output is identical under Python 2's
    # print statement and Python 3's print function.
    print('%s %s' % (real_mir, len(real_mirs[real_mir])))
'3': '2', 'gorilla': '2', '2': '1', 'orangutan': '1', 'gibbon': '1' } descendant_dict = { '1': ['gibbon', 'orangutan', '2'], '2': ['3', 'gorilla'], '3': ['chimp', 'human'] } gene_dict = cPickle.load(open('final_utr_dictionary_nogaps')) species_list = ['3', '2', '1'] good_list = ['A', 'C', 'G', 'T', 'a', 'c', 'g', 't'] probability_dict = {} dinucleotides = custom.count_in_base('AA', 4, 'ACGTz') prob_dict, final_probs = {}, {} for ancestor_di in dinucleotides: prob_dict[ancestor_di] = {} for descendant_di in dinucleotides: prob_dict[ancestor_di][descendant_di] = 0.0 for ancestor in species_list: if ancestor not in final_probs: final_probs[ancestor] = {} descendants = descendant_dict[ancestor] for descendant in descendants: if descendant not in final_probs[ancestor]: final_probs[ancestor][descendant] = {} mutation_dict = {} for gene in gene_dict: for letter_number, junk in enumerate(
import custom import copy import cPickle import time change_coords = ((0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (0, 2), (0, 3)) patterns = set(custom.count_in_base('00000', 2, '01z')) patterns.discard('00000') patterns.discard('11111') def populate_step(current_step, previous_step_dict, previous_ambig_set, patterns): current_step_dict, ambig_set = {}, set([]) patterns_copy = copy.deepcopy(patterns) print len(patterns), current_step for pattern_number, pattern in enumerate(patterns): if pattern_number % 200000 == 0: print pattern_number / float(len(patterns)) for change in change_coords: types = set(pattern[change[0]:change[1]]) if len(types) == 1: if '1' in types: addition = (change[1] - change[0]) * '0' change_new = change + ('g', 'g') elif '0' in types: addition = (change[1] - change[0]) * '1' change_new = change + ('l', 'l') new_pattern = pattern[:change[0]] + addition + pattern[ change[1]:] if current_step == 1: