def multiset_banded_genotype_combinations(sample_genotypes, bandwidth):
    """Yield banded combinations of genotypes, one genotype per sample.

    For every multiset of indexes drawn from ``range(bandwidth)`` (as
    produced by ``multiset.multichoose``) and every permutation of that
    multiset (``multiset.permutations``), the i-th index selects a genotype
    from the i-th entry of ``sample_genotypes``.

    Parameters:
        sample_genotypes: sized, re-iterable sequence with one entry per
            sample; each entry must be indexable by every integer in
            ``range(bandwidth)``.  (Presumably each entry is ordered so that
            low indexes are the preferred genotypes -- confirm with caller.)
        bandwidth: number of leading genotypes considered for each sample.

    Yields:
        A list containing one selected genotype per sample.
    """
    # The size of each index combination must match the number of entries
    # being zipped below.  The previous code used len(samples), silently
    # relying on a module-level global instead of the argument.
    sample_count = len(sample_genotypes)
    for index_combo in multiset.multichoose(sample_count, range(bandwidth)):
        for index_permutation in multiset.permutations(index_combo):
            yield [
                genotypes[index]
                for index, genotypes in zip(index_permutation, sample_genotypes)
            ]
def data_likelihood_exact(genotype, observed_alleles):
    """'Exact' data likelihood, sum of sampling probability * joint Q score
    for the observed alleles over all possible underlying 'true allele'
    combinations.

    Parameters:
        genotype: iterable of ``(allele, count)`` pairs; the counts are
            summed to obtain the ploidy.
        observed_alleles: sized collection of allele observation records,
            passed unchanged to ``likelihood_given_true_alleles``.

    Returns:
        The result of ``logsumexp`` over one term per permutation of every
        true-allele combination.  Each term is ``multinomialln(...)`` plus
        ``likelihood_given_true_alleles(...)`` (presumably both are natural
        log probabilities -- confirm against those helpers).
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for allele, count in genotype)
    # Expected frequency of each genotype allele among the observations.
    allele_probs = [count / float(ploidy) for allele, count in genotype]
    candidate_alleles = [allele for allele, count in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(
            observation_count, candidate_alleles):
        for true_allele_permutation in multiset.permutations(
                true_allele_combination):
            # this mapping allows us to use the sampling probability the same
            # way as we do when we use JSON allele observation records
            true_alleles = [{'alt': allele}
                            for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # How many times each genotype allele occurs among the
            # hypothesised true alleles (0 when it does not occur at all).
            # `in` replaces dict.has_key, which does not exist on Python 3.
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele, count in genotype
            ]
            # NOTE: dirichlet_maximum_likelihood_ratio(allele_probs,
            # observations) was once considered here as an alternative.
            lnsampling_prob = multinomialln(allele_probs, observations)
            prob = lnsampling_prob + likelihood_given_true_alleles(
                observed_alleles, true_alleles)
            probs.append(prob)
    # sum the individual probability of all combinations
    return logsumexp(probs)
def data_likelihood_exact(genotype, observed_alleles):
    """'Exact' data likelihood, sum of sampling probability * joint Q score
    for the observed alleles over all possible underlying 'true allele'
    combinations.

    Parameters:
        genotype: iterable of ``(allele, count)`` pairs; the counts are
            summed to obtain the ploidy.
        observed_alleles: sized collection of allele observation records,
            passed unchanged to ``likelihood_given_true_alleles``.

    Returns:
        The result of ``logsumexp`` over one term per permutation of every
        true-allele combination.  Each term is ``multinomialln(...)`` plus
        ``likelihood_given_true_alleles(...)`` (presumably both are natural
        log probabilities -- confirm against those helpers).
    """
    observation_count = len(observed_alleles)
    ploidy = sum(count for allele, count in genotype)
    # Expected frequency of each genotype allele among the observations.
    allele_probs = [count / float(ploidy) for allele, count in genotype]
    candidate_alleles = [allele for allele, count in genotype]
    probs = []
    # for all true allele combinations X permutations
    for true_allele_combination in multiset.multichoose(
            observation_count, candidate_alleles):
        for true_allele_permutation in multiset.permutations(
                true_allele_combination):
            # this mapping allows us to use the sampling probability the same
            # way as we do when we use JSON allele observation records
            true_alleles = [{'alt': allele}
                            for allele in true_allele_permutation]
            allele_groups = group_alleles(true_alleles)
            # How many times each genotype allele occurs among the
            # hypothesised true alleles (0 when it does not occur at all).
            # `in` replaces dict.has_key, which does not exist on Python 3.
            observations = [
                len(allele_groups[allele]) if allele in allele_groups else 0
                for allele, count in genotype
            ]
            # NOTE: dirichlet_maximum_likelihood_ratio(allele_probs,
            # observations) was once considered here as an alternative.
            lnsampling_prob = multinomialln(allele_probs, observations)
            prob = lnsampling_prob + likelihood_given_true_alleles(
                observed_alleles, true_alleles)
            probs.append(prob)
    # sum the individual probability of all combinations
    return logsumexp(probs)
genotypes) in zip(index_permutation, sample_genotypes)] def genotype_str(genotype): return fold(operator.add, [allele * count for allele, count in genotype]) if __name__ == '__main__': ploidy = 2 # assume ploidy 2 for all individuals and all positions potential_alleles = ['A', 'T', 'G', 'C'] # genotypes are expressed as sets of allele frequencies genotypes = list_genotypes_to_count_genotypes( list(multiset.multichoose(ploidy, potential_alleles))) for line in sys.stdin: position = cjson.decode(line) #print position['position'] samples = position['samples'] position['coverage'] = sum([ len(sample['alleles']) for samplename, sample in samples.iteritems() ]) #potential_alleles = ['A','T','G','C'] potential_alleles = set() for samplename, sample in samples.items(): # only process snps and reference alleles
def multiset_banded_genotype_combinations(sample_genotypes, bandwidth):
    """Yield banded combinations of genotypes, one genotype per sample.

    For every multiset of indexes drawn from ``range(bandwidth)`` (as
    produced by ``multiset.multichoose``) and every permutation of that
    multiset (``multiset.permutations``), the i-th index selects a genotype
    from the i-th entry of ``sample_genotypes``.

    Parameters:
        sample_genotypes: sized, re-iterable sequence with one entry per
            sample; each entry must be indexable by every integer in
            ``range(bandwidth)``.  (Presumably each entry is ordered so that
            low indexes are the preferred genotypes -- confirm with caller.)
        bandwidth: number of leading genotypes considered for each sample.

    Yields:
        A list containing one selected genotype per sample.
    """
    # The size of each index combination must match the number of entries
    # being zipped below.  The previous code used len(samples), silently
    # relying on a module-level global instead of the argument.
    sample_count = len(sample_genotypes)
    for index_combo in multiset.multichoose(sample_count, range(bandwidth)):
        for index_permutation in multiset.permutations(index_combo):
            yield [
                genotypes[index]
                for index, genotypes in zip(index_permutation, sample_genotypes)
            ]
for j in range(1, band_depth): # band_depth is the depth to which we explore the bandwith... TODO explain better indexes = j * [i] + (len(sample_genotypes) - j) * [0] for index_permutation in multiset.permutations(indexes): yield [(sample, genotypes[index]) for index, (sample, genotypes) in zip(index_permutation, sample_genotypes)] def genotype_str(genotype): return reduce(operator.add, [allele * count for allele, count in genotype]) if __name__ == '__main__': ploidy = 2 # assume ploidy 2 for all individuals and all positions potential_alleles = ['A','T','G','C'] # genotypes are expressed as sets of allele frequencies genotypes = list_genotypes_to_count_genotypes(list(multiset.multichoose(ploidy, potential_alleles))) for line in sys.stdin: position = cjson.decode(line) #print position['position'] samples = position['samples'] position['coverage'] = sum([len(sample['alleles']) for samplename, sample in samples.iteritems()]) #potential_alleles = ['A','T','G','C'] potential_alleles = set() for samplename, sample in samples.items(): # only process snps and reference alleles alleles = [allele for allele in sample['alleles'] if allele['type'] in ['reference', 'snp']] alleles = alleles_quality_to_lnprob(alleles) sample['alleles'] = alleles