def get_rate_correlation_windowed(dns, dss, aligned_prots, cdna_dicts, spec_orf_list, xfold_only, xfold_degeneracy):
    """Correlate per-site nonsynonymous and synonymous substitution counts.

    Sites are visited in windows of ``window_size`` (currently fixed at 1).
    For every window containing at least one site where both counts are
    present, the summed counts are appended to ``ns`` and ``ss``.

    Args:
        dns: per-site nonsynonymous counts; ``None`` marks a missing value.
        dss: per-site synonymous counts, indexed like ``dns``.
        aligned_prots: aligned protein sequences, one per entry of
            ``spec_orf_list``; each must have ``len(dns)`` positions.
        cdna_dicts: ``cdna_dicts[spec][orf]`` yields the coding sequence; if
            that lookup raises ``KeyError``, ``cdna_dicts[spec + '-mit'][orf]``
            is used instead.
        spec_orf_list: list of ``(spec, orf)`` pairs.
        xfold_only: when true (and the window is a single site), keep only
            sites where every aligned codon passes the degeneracy filter.
        xfold_degeneracy: degeneracy value every codon at a site must have,
            as reported by the module-level ``codon_degeneracy`` table.

    Returns:
        ``(stats.PearsonCorrelation(ns, ss), ns, ss)``.

    Raises:
        KeyError: if a cDNA is found under neither ``spec`` nor
            ``spec + '-mit'``, or a codon is missing from a lookup table.
        AssertionError: if an aligned protein's length differs from ``len(dns)``.
    """
    window_size = 1
    ns = []
    ss = []
    # Examine only codons of a certain degeneracy?
    xfold_ending = 'CTAG'
    xfold_wrong_aas = ''  # currently empty, so this filter is effectively off
    num_sites = len(dns)

    aligned_cdnas = []
    for xi, (spec, orf) in enumerate(spec_orf_list):
        # Guard only the dictionary lookup: a KeyError raised inside the
        # alignment itself must not be mistaken for a missing cDNA entry.
        try:
            cdna = cdna_dicts[spec][orf]
        except KeyError:
            cdna = cdna_dicts[spec + '-mit'][orf]
        aligned_cdnas.append(muscle.align_gene_from_protein(cdna, aligned_prots[xi]))
        assert len(aligned_prots[xi]) == num_sites

    for site in range(0, num_sites, window_size):
        if xfold_only and (window_size == 1):
            codons = [aligned_cdna[3 * site:3 * site + 3] for aligned_cdna in aligned_cdnas]
            wrong_degeneracy = False
            wrong_ending = False
            wrong_aa = False
            # Every codon is checked (no short-circuit) so that an unknown
            # codon still surfaces as an error rather than being skipped.
            for codon in codons:
                if codon_degeneracy[codon] != xfold_degeneracy:
                    wrong_degeneracy = True
                if codon[2] not in xfold_ending:
                    wrong_ending = True
                if codon != '---' and translate._genetic_code[codon] in xfold_wrong_aas:
                    wrong_aa = True
            if wrong_degeneracy or wrong_ending or wrong_aa:
                continue

        # Add up substitutions in window
        syn = 0
        nsyn = 0
        valid_sites = False
        for nsite in range(site, min(num_sites, site + window_size)):
            s = dss[nsite]
            n = dns[nsite]
            # don't consider cases with missing counts
            if s is not None and n is not None:
                syn += s
                nsyn += n
                valid_sites = True
        if valid_sites:
            ns.append(nsyn)
            ss.append(syn)

    return stats.PearsonCorrelation(ns, ss), ns, ss
def get_rate_correlation_windowed(dns, dss, aligned_prots, cdna_dicts, spec_orf_list, xfold_only, xfold_degeneracy):
    """Correlate per-site nonsynonymous and synonymous substitution counts.

    Sites are visited in windows of ``window_size`` (currently fixed at 1).
    For every window containing at least one site where both counts are
    present, the summed counts are appended to ``ns`` and ``ss``.

    Args:
        dns: per-site nonsynonymous counts; ``None`` marks a missing value.
        dss: per-site synonymous counts, indexed like ``dns``.
        aligned_prots: aligned protein sequences, one per entry of
            ``spec_orf_list``; each must have ``len(dns)`` positions.
        cdna_dicts: ``cdna_dicts[spec][orf]`` yields the coding sequence; if
            that lookup raises ``KeyError``, ``cdna_dicts[spec + '-mit'][orf]``
            is used instead.
        spec_orf_list: list of ``(spec, orf)`` pairs.
        xfold_only: when true (and the window is a single site), keep only
            sites where every aligned codon passes the degeneracy filter.
        xfold_degeneracy: degeneracy value every codon at a site must have,
            as reported by the module-level ``codon_degeneracy`` table.

    Returns:
        ``(stats.PearsonCorrelation(ns, ss), ns, ss)``.

    Raises:
        KeyError: if a cDNA is found under neither ``spec`` nor
            ``spec + '-mit'``, or a codon is missing from a lookup table.
        AssertionError: if an aligned protein's length differs from ``len(dns)``.
    """
    window_size = 1
    ns = []
    ss = []
    # Examine only codons of a certain degeneracy?
    xfold_ending = 'CTAG'
    xfold_wrong_aas = ''  # currently empty, so this filter is effectively off
    num_sites = len(dns)

    aligned_cdnas = []
    for xi, (spec, orf) in enumerate(spec_orf_list):
        # Guard only the dictionary lookup: a KeyError raised inside the
        # alignment itself must not be mistaken for a missing cDNA entry.
        try:
            cdna = cdna_dicts[spec][orf]
        except KeyError:
            cdna = cdna_dicts[spec + '-mit'][orf]
        aligned_cdnas.append(muscle.align_gene_from_protein(cdna, aligned_prots[xi]))
        assert len(aligned_prots[xi]) == num_sites

    for site in range(0, num_sites, window_size):
        if xfold_only and (window_size == 1):
            codons = [aligned_cdna[3 * site:3 * site + 3] for aligned_cdna in aligned_cdnas]
            wrong_degeneracy = False
            wrong_ending = False
            wrong_aa = False
            # Every codon is checked (no short-circuit) so that an unknown
            # codon still surfaces as an error rather than being skipped.
            for codon in codons:
                if codon_degeneracy[codon] != xfold_degeneracy:
                    wrong_degeneracy = True
                if codon[2] not in xfold_ending:
                    wrong_ending = True
                if codon != '---' and translate._genetic_code[codon] in xfold_wrong_aas:
                    wrong_aa = True
            if wrong_degeneracy or wrong_ending or wrong_aa:
                continue

        # Add up substitutions in window
        syn = 0
        nsyn = 0
        valid_sites = False
        for nsite in range(site, min(num_sites, site + window_size)):
            s = dss[nsite]
            n = dns[nsite]
            # don't consider cases with missing counts
            if s is not None and n is not None:
                syn += s
                nsyn += n
                valid_sites = True
        if valid_sites:
            ns.append(nsyn)
            ss.append(syn)

    return stats.PearsonCorrelation(ns, ss), ns, ss
def getShortestDistanceHits(hits, queryGeneDict, targetGeneDict, queryProtDict, targetProtDict, distanceCache, alignmentCache):
    """Map each query gene to the candidate hit with the smallest dNML distance.

    For every (query, target) pair the distance tuple is read from
    ``distanceCache``; on a miss the pair is aligned with ``muscle``, the
    distances are computed with ``my_paml.Get_Distance_NS`` and both caches
    are updated in place.

    Args:
        hits: dict mapping a query gene id to an iterable of target gene ids.
        queryGeneDict: query gene id -> gene sequence.
        targetGeneDict: target gene id -> gene sequence.
        queryProtDict: query gene id -> protein sequence.
        targetProtDict: target gene id -> protein sequence.
        distanceCache: dict keyed by ``cacheKey(query, target)`` holding
            ``(dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites,
            numNonsynonymousSites, fracAligned, seqIdentity)``; mutated.
        alignmentCache: dict with the same keys holding the aligned gene and
            protein sequences of both partners; mutated.

    Returns:
        dict mapping each query gene to its closest target gene, or to
        ``None`` when no target other than the query itself had a dNML
        below the initial threshold of 100.0.
    """
    queryToTargetSDHits = {}
    totalHits = len(hits)
    nHits = 0
    for (queryGene, qHits) in hits.items():
        minDist = 100.0
        minHitGene = None
        for targetGene in qHits:
            key = cacheKey(queryGene, targetGene)
            try:
                (dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites,
                 numNonsynonymousSites, fracAligned, seqIdentity) = distanceCache[key]
            except KeyError:
                queryGeneSeq = queryGeneDict[queryGene]
                targetGeneSeq = targetGeneDict[targetGene]
                [alignedQueryProt, alignedTargetProt] = muscle.align_sequences(
                    [queryProtDict[queryGene], targetProtDict[targetGene]])
                alignedQueryGene = muscle.align_gene_from_protein(queryGeneSeq, alignedQueryProt)
                alignedTargetGene = muscle.align_gene_from_protein(targetGeneSeq, alignedTargetProt)
                (dDML, dSML, dNML, dDNG, dSNG, dNNG, numSynonymousSites,
                 numNonsynonymousSites) = my_paml.Get_Distance_NS(
                    alignedQueryGene, alignedTargetGene, 'codon')

                # NOTE(review): the divisions below raise ZeroDivisionError for
                # an empty sequence or a zero site count; left unchanged.
                (seqIdentity, numIdentical, numAligned) = sequenceIdentity(
                    alignedQueryProt, alignedTargetProt)
                fracAligned = numAligned / float(len(queryProtDict[queryGene]))
                if fracAligned == 1.0 and seqIdentity == 1.0:
                    # Obviously no nonsyn. changes if proteins are identical
                    dNML = 0.0
                elif 1.0 / numNonsynonymousSites > dNML:
                    # Minimum possible change
                    dNML = 1.0 / numNonsynonymousSites

                (ntseqIdentity, ntnumIdentical, ntnumAligned) = sequenceIdentity(
                    alignedQueryGene, alignedTargetGene)
                ntfracAligned = ntnumAligned / float(len(queryGeneSeq))
                if ntfracAligned == 1.0 and ntseqIdentity == 1.0:
                    # Obviously no syn. changes if genes are identical
                    dSML = 0.0
                elif 1.0 / numSynonymousSites > dSML:
                    # Minimum possible change
                    dSML = 1.0 / numSynonymousSites

                # Cache this distance and alignment
                distanceCache[key] = (dDML, dSML, dNML, dDNG, dSNG, dNNG,
                                      numSynonymousSites, numNonsynonymousSites,
                                      fracAligned, seqIdentity)
                alignmentCache[key] = ((queryGene, alignedQueryGene, alignedQueryProt),
                                       (targetGene, alignedTargetGene, alignedTargetProt))

            # Check to see if this is a shorter distance
            if dNML < minDist and targetGene != queryGene:
                minDist = dNML
                minHitGene = targetGene

        nHits += 1
        if minHitGene is None:
            # No usable hit: there is no cache entry to report on, so skip the
            # lookup that would otherwise fail on the (queryGene, None) key.
            print("# %d of %d: %s had no usable hit" % (nHits, totalHits, queryGene))
        else:
            # Index 8 of the cached tuple is fracAligned for the winning pair.
            fracAligned = distanceCache[cacheKey(queryGene, minHitGene)][8]
            # Single parenthesised argument: valid as a Python 2 print
            # statement and as a Python 3 call, with identical output.
            print("# %d of %d: %s was %s, dN = %1.6f, fracAlign = %1.6f" % (
                nHits, totalHits, queryGene, minHitGene, minDist, fracAligned))
        queryToTargetSDHits[queryGene] = minHitGene
    return queryToTargetSDHits
xprot for ((xgenome, xorf), xprot) in zip(corr_keys, aligned_prots) if xgenome in tree_species ] all_genes = [] for (xgenome, xorf) in corr_keys: if xgenome in tree_species: try: seq = cdna_dicts[xgenome][xorf] except KeyError: seq = cdna_dicts[xgenome + '-mit'][xorf] all_genes.append(seq) all_species = tree_species #[xgenome for (xgenome, xorf) in corr_keys] assert len(all_genes) == len(all_prots) assert len(all_species) == len(all_genes) all_aligned_genes = [ muscle.align_gene_from_protein(xgene, xprot) for (xgene, xprot) in zip(all_genes, all_prots) ] # Put the genes in the right order sub_gene_dict = dict(zip(all_species, all_aligned_genes)) recon_gene_list = [sub_gene_dict[spec] for spec in tree_species] try: #(rates,ancestor) = (1,2) (dns, dss) = my_paml.Get_dNdS_Per_Codon(recon_gene_list, tree_species, tree, 100) #(rates, ancestor) = my_paml.Get_Site_Rates_And_Ancestor(recon_gene_list, tree_species, tree) (r, p, n) = stats.PearsonCorrelation(dns, dss) print "# %d %s %1.3f" % (n_genes, gene, r) except my_paml.PAMLError, pe:
print "# rejected orf (mfal,msid) %s (%1.2f,%1.2f)" % (gene, mfal, msid) continue all_prots = [xprot for ((xgenome,xorf), xprot) in zip(corr_keys,aligned_prots) if xgenome in tree_species] all_genes = [] for (xgenome, xorf) in corr_keys: if xgenome in tree_species: try: seq = cdna_dicts[xgenome][xorf] except KeyError: seq = cdna_dicts[xgenome+'-mit'][xorf] all_genes.append(seq) all_species = tree_species #[xgenome for (xgenome, xorf) in corr_keys] assert len(all_genes) == len(all_prots) assert len(all_species) == len(all_genes) all_aligned_genes = [muscle.align_gene_from_protein(xgene,xprot) for (xgene,xprot) in zip(all_genes,all_prots)] # Put the genes in the right order sub_gene_dict = dict(zip(all_species, all_aligned_genes)) recon_gene_list = [sub_gene_dict[spec] for spec in tree_species] try: #(rates,ancestor) = (1,2) (dns, dss) = my_paml.Get_dNdS_Per_Codon(recon_gene_list, tree_species, tree, 100) #(rates, ancestor) = my_paml.Get_Site_Rates_And_Ancestor(recon_gene_list, tree_species, tree) (r, p, n) = stats.PearsonCorrelation(dns, dss) print "# %d %s %1.3f" % (n_genes, gene, r) except my_paml.PAMLError, pe: print "#",pe continue #print len(rates), len(ancestor)