def calculate_final_probability(ancestor, descendant, final_probs, prob_dict,
                                end_dict):
    """Accumulate expected eightmer gains/losses for one branch and write them.

    Reads the whitespace-delimited file ``quick_out``, fills
    ``final_probs[ancestor][descendant]`` in place, and writes one
    tab-separated row per eightmer to the module-level ``output_file``.

    Args:
        ancestor: key of the ancestral node; indexes ``final_probs`` and the
            table read from ``quick_out``.
        descendant: key of the descendant node; indexes
            ``final_probs[ancestor]`` and the table read from ``quick_out``.
        final_probs: nested dict; ``final_probs[ancestor][descendant]`` must
            already exist. Missing eightmer entries are created as
            ``[[], [0.0]]`` (gain pairs, loss pair) and mutated in place.
        prob_dict: passed through unchanged to ``eightmer_prob``.
        end_dict: passed through unchanged to ``eightmer_prob``.

    Returns:
        None. Results go to ``output_file`` and into ``final_probs``.

    Raises:
        KeyError: if ``quick_out`` has no entry for a needed node/eightmer.
        ZeroDivisionError: if the substrate counts of an eightmer's gain
            pairs sum to zero.
    """
    # quick_out layout: column 0 is the inner key (looked up by eightmer),
    # column 1 the outer key (looked up by ancestor/descendant), the rest are
    # the stored values. The ``with`` block closes the file deterministically.
    three_number_dict = {}
    with open('quick_out') as summary_file:
        for raw_line in summary_file:
            fields = raw_line.strip().split()
            three_number_dict.setdefault(fields[1], {})[fields[0]] = fields[2:]

    # presumably every 8-letter word over ACGT -- confirm against
    # custom.count_in_base.
    eightmer_list = custom.count_in_base('AAAAAAAA', 4, 'ACGTz')
    branch_probs = final_probs[ancestor][descendant]

    # Every eightmer needs an entry before the accumulation pass, because
    # that pass appends to the entries of *other* eightmers (the one-offers).
    for eightmer in eightmer_list:
        if eightmer not in branch_probs:
            branch_probs[eightmer] = [[], [0.0]]

    for eightmer in eightmer_list:
        # First stored value for this eightmer in the ancestor's table.
        substrate_count = int(three_number_dict[ancestor][eightmer][0])
        loss_entry = branch_probs[eightmer][1]
        for one_offer in list_one_offers(eightmer):
            # NOTE(review): each probability is halved here; the reason is
            # not visible in this function.
            change_prob = eightmer_prob(eightmer, one_offer, prob_dict,
                                        end_dict) / 2
            # Leaving `eightmer` is a loss for it and a gain for `one_offer`.
            loss_entry[0] += change_prob
            branch_probs[one_offer][0].append([change_prob, substrate_count])
        loss_entry.append(substrate_count)

    for eightmer in eightmer_list:
        gain_pairs = branch_probs[eightmer][0]
        loss_pair = branch_probs[eightmer][1]
        expected_gains = sum(pair[0] * pair[1] for pair in gain_pairs)
        gain_substrates = sum(pair[1] for pair in gain_pairs)
        avg_gain_prob = expected_gains / gain_substrates
        expected_losses = loss_pair[0] * loss_pair[1]
        # Row: eightmer, the descendant's stored values after the first,
        # expected gains, expected losses, average gain probability, gain
        # substrates, then the loss pair.
        row = [
            eightmer,
            '\t'.join(three_number_dict[descendant][eightmer][1:]),
            str(expected_gains),
            str(expected_losses),
            '\t'.join(map(str, [avg_gain_prob, gain_substrates])),
            '\t'.join(map(str, loss_pair)),
        ]
        output_file.write('\t'.join(row) + '\n')
def calculate_final_probability(ancestor, descendant, final_probs, prob_dict,
                                end_dict):
    three_number_dict = {}
    three_number_summaries = [
        line.strip().split() for line in open('quick_out')
    ]
    for line in three_number_summaries:
        if line[1] not in three_number_dict:
            three_number_dict[line[1]] = {}
        three_number_dict[line[1]][line[0]] = line[2:]
    eightmer_list = custom.count_in_base('AAAAAAAA', 4, 'ACGTz')
    for eightmer in eightmer_list:
        if eightmer not in final_probs[ancestor][descendant]:
            final_probs[ancestor][descendant][eightmer] = [[], [0.0]]
    for eightmer in eightmer_list:
        one_offer_list = list_one_offers(eightmer)
        for one_offer in one_offer_list:
            change_prob = eightmer_prob(eightmer, one_offer, prob_dict,
                                        end_dict)
            final_probs[ancestor][descendant][eightmer][1][0] += change_prob
            final_probs[ancestor][descendant][one_offer][0].append(
                [change_prob,
                 int(three_number_dict[ancestor][eightmer][0])])
        final_probs[ancestor][descendant][eightmer][1].append(
            int(three_number_dict[ancestor][eightmer][0]))
    for eightmer in eightmer_list:
        expected_gains, gain_substrates = 0, 0
        for gain_pair in final_probs[ancestor][descendant][eightmer][0]:
            expected_gains += gain_pair[0] * gain_pair[1]
            gain_substrates += gain_pair[1]
        avg_gain_prob = expected_gains / gain_substrates
        gain_pair = [avg_gain_prob, gain_substrates]
        loss_pair = final_probs[ancestor][descendant][eightmer][1]
        expected_losses = loss_pair[0] * loss_pair[1]
        print eightmer
        print 'actual gains, losses:', three_number_dict[descendant][eightmer][
            1:]
        print 'expected gains, losses:', expected_gains, expected_losses
        print 'gain prob and substrates, loss prob and substrates:', gain_pair, loss_pair
    return final_probs
def end_probs(prob_dict):
    """Average per-letter change probabilities derived from dinucleotide ones.

    For every ancestor dinucleotide, the probabilities
    ``prob_dict[ancestor][descendant]`` of all descendants that share the
    ancestor's first or second letter are summed into two buckets: ``'left'``,
    keyed by the second letters of the pair, and ``'right'``, keyed by the
    first letters. The per-ancestor sums are then averaged over all ancestors
    that contributed to a bucket.

    Args:
        prob_dict: nested mapping indexed as
            ``prob_dict[ancestor_dinucleotide][descendant_dinucleotide]``.

    Returns:
        Nested dict ``result[ancestor_letter][descendant_letter]`` over the
        letters ``'ACGT'``, each holding ``{'left': mean, 'right': mean}``.

    Raises:
        ZeroDivisionError: if some letter pair/side never received a value.
    """
    # presumably all 16 two-letter words over ACGT -- confirm against
    # custom.count_in_base.
    dinucleotides = custom.count_in_base('AA', 4, 'ACGTz')
    bases = 'ACGT'

    # Empty letter-by-letter table; deep-copied wherever a fresh one is needed.
    template = {}
    for anc in bases:
        template[anc] = {}
        for desc in bases:
            template[anc][desc] = {'left': [], 'right': []}

    averaged = copy.deepcopy(template)
    for ancestor in dinucleotides:
        per_ancestor = copy.deepcopy(template)
        for descendant in dinucleotides:
            # Skip descendants that differ from the ancestor at both positions.
            if ancestor[0] != descendant[0] and ancestor[1] != descendant[1]:
                continue
            prob = prob_dict[ancestor][descendant]
            per_ancestor[ancestor[1]][descendant[1]]['left'].append(prob)
            per_ancestor[ancestor[0]][descendant[0]]['right'].append(prob)
        # Fold this ancestor's buckets into the running lists, one sum each.
        for anc_letter, by_desc in per_ancestor.items():
            for desc_letter, by_side in by_desc.items():
                for side, values in by_side.items():
                    if values:
                        averaged[anc_letter][desc_letter][side].append(
                            sum(values))

    # Replace every list of per-ancestor sums with its arithmetic mean.
    for by_desc in averaged.values():
        for by_side in by_desc.values():
            for side in list(by_side):
                totals = by_side[side]
                by_side[side] = sum(totals) / len(totals)
    return averaged
示例#4
0
import custom
# presumably every 8-letter word over ACGT -- confirm against custom.count_in_base.
eightmer_list=custom.count_in_base('AAAAAAAA', 4, 'ACGTz')
# Tab-separated results table; the `with` block closes the file once it is read.
with open('enrich_depletion_results_lumped_vert_seeds') as status_file:
	status_list=[line.strip().split('\t') for line in status_file]
# Drop the first row (treated as a header).
status_list=status_list[1:]
# Maps each selected seed to the list of eightmers matched to it further below.
real_mirs={}

def twomer_counts(eightmer):
	"""Count the overlapping 2-letter substrings (twomers) of a sequence.

	The number of windows is derived from the input length, so an 8-letter
	input yields 7 twomers and shorter inputs no longer produce empty or
	1-letter fragments from slices running past the end.

	Args:
		eightmer: the sequence to scan; any sliceable string.

	Returns:
		dict mapping each twomer to its number of occurrences. Empty for
		inputs shorter than two letters.
	"""
	counts={}
	for start in range(len(eightmer)-1):
		twomer=eightmer[start:start+2]
		counts[twomer]=counts.get(twomer, 0)+1
	return counts


for line in status_list:
	joined_line=''.join(line)
	if len(line)>4 and ('hsa' in joined_line or 'ptr' in joined_line or 'ggo' in joined_line or 'ppy' in joined_line):
		real_mirs[line[1][1:-1]]=[]
print len(real_mirs)

for real_mir in real_mirs:
	real_dict=twomer_counts(real_mir)
	for eightmer in eightmer_list:
		eightmer_dict={}
		test_dict=twomer_counts(eightmer)
		if test_dict==real_dict and eightmer[1:-1] not in real_mirs:
			real_mirs[real_mir].append(eightmer)
	print real_mir, len(real_mirs[real_mir])
    '3': '2',
    'gorilla': '2',
    '2': '1',
    'orangutan': '1',
    'gibbon': '1'
}
# Children of each internal node; numeric strings name internal nodes, the
# other entries name species.
descendant_dict = {
    '1': ['gibbon', 'orangutan', '2'],
    '2': ['3', 'gorilla'],
    '3': ['chimp', 'human']
}
# NOTE(review): unpickling executes arbitrary code embedded in the file; only
# load this from a trusted source. The `with` block closes the file after
# loading.
with open('final_utr_dictionary_nogaps') as gene_file:
    gene_dict = cPickle.load(gene_file)
# Internal nodes, in the order they are processed as ancestors below.
species_list = ['3', '2', '1']
# Upper- and lower-case nucleotide letters.
good_list = ['A', 'C', 'G', 'T', 'a', 'c', 'g', 't']
probability_dict = {}
# presumably all 16 two-letter words over ACGT -- confirm against custom.count_in_base.
dinucleotides = custom.count_in_base('AA', 4, 'ACGTz')
prob_dict, final_probs = {}, {}
# Start every ancestor -> descendant dinucleotide entry at 0.0.
for ancestor_di in dinucleotides:
    prob_dict[ancestor_di] = {}
    for descendant_di in dinucleotides:
        prob_dict[ancestor_di][descendant_di] = 0.0
for ancestor in species_list:
    if ancestor not in final_probs:
        final_probs[ancestor] = {}
    descendants = descendant_dict[ancestor]
    for descendant in descendants:
        if descendant not in final_probs[ancestor]:
            final_probs[ancestor][descendant] = {}
        mutation_dict = {}
        for gene in gene_dict:
            for letter_number, junk in enumerate(
示例#6
0
import custom
import copy
import cPickle
import time

# (start, stop) slice bounds of the pattern segments that a single change
# may flip; used as pattern[start:stop].
change_coords = (
    (0, 1), (1, 2), (2, 3), (3, 4), (4, 5),
    (0, 2), (0, 3),
)
# presumably every 5-character string over '0'/'1' -- confirm against
# custom.count_in_base. The all-'0' and all-'1' patterns are excluded.
patterns = set(custom.count_in_base('00000', 2, '01z'))
patterns.difference_update(('00000', '11111'))


def populate_step(current_step, previous_step_dict, previous_ambig_set,
                  patterns):
    current_step_dict, ambig_set = {}, set([])
    patterns_copy = copy.deepcopy(patterns)
    print len(patterns), current_step
    for pattern_number, pattern in enumerate(patterns):
        if pattern_number % 200000 == 0:
            print pattern_number / float(len(patterns))
        for change in change_coords:
            types = set(pattern[change[0]:change[1]])
            if len(types) == 1:
                if '1' in types:
                    addition = (change[1] - change[0]) * '0'
                    change_new = change + ('g', 'g')
                elif '0' in types:
                    addition = (change[1] - change[0]) * '1'
                    change_new = change + ('l', 'l')
                new_pattern = pattern[:change[0]] + addition + pattern[
                    change[1]:]
                if current_step == 1: