def search(self, dnaSeq, pValThresh=0.05, halfAT=0.25, halfGC=0.25):
    """Make sure the cached motility PWM carries a score threshold for pValThresh.

    The PWM is built lazily from ``self._getMotilityMatrix()`` and cached on
    ``self._motility``.  Its ``threshPval`` / ``threshScore`` attributes are
    (re)computed whenever the requested p-value differs from the cached one.

    Parameters:
        dnaSeq:     sequence to search.  NOTE(review): not used by the code
                    below -- the scanning step that would produce the hits
                    is not present in this body.
        pValThresh: target p-value used to derive the score threshold.
        halfAT:     passed to ``weight_sites_over`` as ``AT_bias``.
        halfGC:     passed to ``weight_sites_over`` as ``GC_bias``.

    Returns:
        None.  Only ``self._motility`` and its threshold attributes are
        updated.
    """

    def _setThresh(newPval):
        """Return the lowest stepped score whose p-value is <= newPval.

        Scores are tried from 100% of the PWM's maximum score down to 1% in
        1% steps.  Returns None when even the maximum score has a p-value
        above ``newPval``.
        """
        # The original referred to an undefined name ``mot``; the PWM cached
        # on the instance is the matrix being thresholded.
        pwm = self._motility
        maxScore = pwm.max_score()
        # TODO: use pwm.min_score() as the lower bound of the scan.
        thresh = None
        for pct in range(100, 0, -1):
            score = maxScore * (pct / 100.0)
            pVal = pwm.weight_sites_over(score, AT_bias=halfAT, GC_bias=halfGC)
            print('score: %s pVal: %s ' % (score, pVal))
            if pVal <= newPval:
                thresh = score
            else:
                # Stop at the first score that is too permissive and keep the
                # last acceptable one (previously this path returned None).
                break
        return thresh

    if not self._motility:
        self._motility = motility.make_pwm(self._getMotilityMatrix())
        self._motility.threshPval = pValThresh
        self._motility.threshScore = _setThresh(pValThresh)
    elif self._motility.threshPval != pValThresh:
        # Keep the cached p-value in step with the recomputed score so the
        # next call with the same pValThresh does not recompute it.
        self._motility.threshPval = pValThresh
        self._motility.threshScore = _setThresh(pValThresh)
def test_5():
    """
    Check that an operator's calc_score and calc_energy agree on its motif.
    """
    site = 'ACGG'

    matrix = motility.make_pwm([site])
    op = motility.make_operator([site])

    # Show the PWM score, operator score and operator energy, in that order.
    for scorer in (matrix.calc_score, op.calc_score, op.calc_energy):
        print(scorer(site))

    assert op.calc_score(site) == op.calc_energy(site)
def test_4():
    """
    Check coordinate handling / matched-string extraction across the PWM,
    IUPAC and exact finders.
    """
    site = 'ACGG'
    matrix = motility.make_pwm([site])

    # Sequence identical to the motif.
    from_pwm = matrix.find(site, 4)
    from_iupac = motility.find_iupac(site, site)
    from_exact = motility.find_exact(site, site)

    assert from_pwm == from_iupac
    assert from_pwm == from_exact

    # Sequence that is the reverse complement of the motif.
    rc_site = 'CCGT'

    from_pwm = matrix.find(rc_site, 4)
    from_iupac = motility.find_iupac(rc_site, site)
    # NOTE(review): the exact-match result is computed but never asserted
    # for the reverse-complement case -- confirm whether that is intended.
    from_exact = motility.find_exact(rc_site, site)

    assert from_pwm == from_iupac
def get_consensus_snv_arrary(dic_chr_pos_snv, chr, snv_pos_list,
                             snv_arrary_list):
    """Phase SNVs across long reads and count the reads backing each allele.

    Parameters:
        dic_chr_pos_snv: nested mapping indexed as
            ``dic_chr_pos_snv[chr][int(position)]``; element ``[0]`` and
            element ``[1]`` of the value are the two candidate bases at that
            position.
        chr: key selecting the chromosome in ``dic_chr_pos_snv``.
        snv_pos_list: SNV positions; entry ``i`` corresponds to character
            ``i`` of every SNV array.
        snv_arrary_list: one string per long read holding the base seen at
            each SNV position, with ``"-"`` marking a missing base.

    Returns:
        A tab-joined string with six fields: retained positions
        (comma-joined), major allele, minor allele, number of reads assigned
        to the major allele, to the minor allele, and to neither.  All six
        fields are ``"*"`` when no consensus can be built (no reads, no
        sufficiently covered position, or no read without gaps).
    """
    no_result = "\t".join(["*"] * 6)

    # Without reads there is nothing to phase; this also avoids dividing by
    # zero in the coverage fraction below.
    if not snv_arrary_list:
        return no_result
    read_count = len(snv_arrary_list)

    # first step: keep SNV positions where at least 50% of the reads carry a
    # nucleotide (i.e. drop positions with many missing data)
    retain_snv_idx = []
    for i in range(len(snv_pos_list)):
        covered = sum(1 for snv_arrary in snv_arrary_list
                      if snv_arrary[i] != "-")
        if float(covered) / read_count >= 0.5:
            retain_snv_idx.append(i)
    if not retain_snv_idx:
        return no_result
    retain_snv_pos_list = [str(snv_pos_list[i]) for i in retain_snv_idx]
    retain_snv_pos = ",".join(retain_snv_pos_list)

    # second step: get the consensus (phased major allele)
    # restrict every read to the retained positions
    new_snv_arrary_list = [
        "".join(snv_arrary[idx] for idx in retain_snv_idx)
        for snv_arrary in snv_arrary_list
    ]
    # only gap-free reads take part in building the consensus
    consus_snv_arrary_list = [
        new_snv_arrary for new_snv_arrary in new_snv_arrary_list
        if "-" not in new_snv_arrary
    ]
    if not consus_snv_arrary_list:
        return no_result

    support = collections.Counter(consus_snv_arrary_list)
    top_allele, top_count = support.most_common(1)[0]
    if top_count >= 5:
        # the most common SNV array is supported by >= 5 long reads
        major_allele = top_allele
    else:
        # otherwise take the SNV array with the maximal PWM score
        pwm = motility.make_pwm(consus_snv_arrary_list)
        # Iterating in sorted order makes ties deterministic: max() keeps the
        # first maximal item, i.e. the lexicographically smallest array.
        major_allele = max(
            sorted(support),
            key=lambda snv_uniq: float(pwm.calc_score(snv_uniq)))

    # minor allele: at each position, the candidate base that is not the
    # major one
    pos_to_bases = dic_chr_pos_snv[chr]
    minor_phase_list = []
    for pos_str, major_base in zip(retain_snv_pos_list, major_allele):
        bases = pos_to_bases[int(pos_str)]
        minor_phase_list.append(
            bases[1] if bases[0] == major_base else bases[0])
    minor_allele = "".join(minor_phase_list)

    # quantify allele-specific read count for both alleles: a read goes to
    # an allele when it matches it at more than half of the positions
    major_allele_c, minor_allele_c, uncertain_allele_c = 0, 0, 0
    for new_snv_arrary in new_snv_arrary_list:
        common_count_major = sum(
            1 for a, b in zip(major_allele, new_snv_arrary) if a == b)
        common_count_minor = sum(
            1 for a, b in zip(minor_allele, new_snv_arrary) if a == b)
        if float(common_count_major) / len(major_allele) > 0.5:
            major_allele_c += 1
        elif float(common_count_minor) / len(minor_allele) > 0.5:
            minor_allele_c += 1
        else:
            uncertain_allele_c += 1

    return "\t".join([
        retain_snv_pos, major_allele, minor_allele,
        str(major_allele_c), str(minor_allele_c), str(uncertain_allele_c)
    ])
from TAMO.MotifTools import Motif
import motility

# Build a TAMO motif and a motility PWM from the same k-mers generated for
# the WGATAR motif.
tM = Motif('WGATAR')
sites = tM.bogus_kmers()
tM = Motif(sites)
mM = motility.make_pwm(sites)

# Scan a short sequence with motility at 75% of the PWM's maximum score.
s = 'ATGCATGCTAGCGGCTGATAACGCTTATCATATGC'
mReults = mM.find(s, mM.max_score() * 0.75)
# Gene names to benchmark against, one per line.
targetGenes = "/Users/biggus/Documents/James/Data/ReClustering/kmedsPear33Clus50x_2/Clus2_247genes.genes.txt"
# Read the names inside a context manager so the file handle is closed
# (it was previously left open).
with open(targetGenes, "rU") as geneFile:
    targetGenes = [line.strip() for line in geneFile]
# Keep the first `genes` names and swap each for its entry in `seqs`.
targetGenes = [seqs[name] for name in targetGenes[:genes]]

# Load every motif from the TMO files, then build a TAMO Motif and a
# motility PWM for each from its bogus k-mers.
motifs = []
tMotifs = []
mMotifs = []
for t in tmoFiles:
    motifs.extend(loadTMOs(t))
for motif in motifs:
    tMotifs.append(Motif(motif.bogus_kmers()))
    mMotifs.append(motility.make_pwm(motif.bogus_kmers()))

# Statements timed by timeit: every motif is scanned over every target
# sequence, TAMO with factor=0.75 and motility at 85% of the PWM max score.
tTimeIt = """for m in tMotifs:
    for s in targetGenes:
        m.scan(s,factor=0.75)
"""
mTimeIt = """for m in mMotifs:
    for s in targetGenes:
        m.find(s,m.max_score()*0.85)
"""

tTimer = timeit.Timer(tTimeIt, "from __main__ import tMotifs,targetGenes")
mTimer = timeit.Timer(mTimeIt, "from __main__ import mMotifs,targetGenes")