def getMates(seq, seqs, th, fdis, matches, lim=-1, failsTH=30):
    """Return the indices (and distances) of sequences close to `seq`.

    Every sequence that received a positive hit count from
    ``cclust.counts`` becomes a candidate. Candidates are keyed by the
    negated hit count divided by the shorter of the two sequence lengths,
    placed in a heap, and then tested one by one with `fdis`.

    Args:
        seq: The query sequence.
        seqs: All sequences; candidates are reported as indices into it.
        th: Largest distance that still counts as a match.
        fdis: Callable invoked as ``fdis(seq, k)``, giving the distance
            between the query and the sequence with index ``k``.
        matches: Lookup structure handed unchanged to ``cclust.counts``.
        lim: When positive, stop once this many matches were collected.
        failsTH: Stop after more than this many consecutive misses.

    Returns:
        A pair ``(matched, distances)`` of parallel lists: the matching
        indices and their distances, in the order they were tested.
    """
    # cclust.counts is expected to fill the list in place: its return
    # value is ignored and the list is read right after the call.
    hitCounts = [0] * len(seqs)
    cclust.counts(seq, matches, hitCounts)

    queryLen = len(seq)
    queue = []
    for idx, nHits in enumerate(hitCounts):
        if nHits > 0:
            shorter = min(queryLen, len(seqs[idx]))
            queue.append((-nHits / shorter, idx))
    heapq.heapify(queue)

    matched, distances = [], []
    misses = 0
    # presumably hiter yields the heap entries smallest key first, i.e.
    # the highest normalized hit count first -- verify against hiter.
    for _, idx in hiter(queue):
        if 0 < lim <= len(matched):
            break

        dist = fdis(seq, idx)
        if dist <= th:
            matched.append(idx)
            distances.append(dist)
            misses = 0
            continue

        misses += 1
        if misses > failsTH:
            break

    return matched, distances
def getMates(seq, seqs, lseqs, th, fdis, matches, lim=-1, failsTH=30):
    """Return the indices (and distances) of sequences close to `seq`.

    The candidate queue comes directly from ``cclust.counts``, which is
    called with the query, the match table, the number of sequences and
    their lengths. Candidates are tested with `fdis` in the order that
    ``hiternew`` yields them.

    Args:
        seq: The query sequence.
        seqs: All sequences; only its length is used here.
        lseqs: Sequence lengths, forwarded to ``cclust.counts``.
        th: Largest distance that still counts as a match.
        fdis: Callable invoked as ``fdis(seq, k)``, giving the distance
            between the query and the sequence with index ``k``.
        matches: Lookup structure handed unchanged to ``cclust.counts``.
        lim: When positive, stop once this many matches were collected.
        failsTH: Stop after more than this many consecutive misses.

    Returns:
        A pair ``(matched, distances)`` of parallel lists: the matching
        indices and their distances, in the order they were tested.
    """
    # presumably the queue is already ranked by normalized hit count and
    # hiternew walks it best-first -- verify against cclust/hiternew.
    queue = cclust.counts(seq, matches, len(seqs), lseqs)

    matched, distances = [], []
    misses = 0
    for _, idx in hiternew(queue):
        if 0 < lim <= len(matched):
            break

        dist = fdis(seq, idx)
        if dist <= th:
            matched.append(idx)
            distances.append(dist)
            misses = 0
            continue

        misses += 1
        if misses > failsTH:
            break

    return matched, distances
def getPotentials(iSeq, seqs, lseqs, th, fdis, matches, elem2grps, rs = None) :
    """Find the sequences within distance `th` of sequence number `iSeq`.

    Candidates are all sequences (other than `iSeq` itself) with a positive
    hit count from ``cclust.counts``. They are keyed by the negated hit
    count divided by the shorter of the two lengths, heapified, and tested
    with `fdis` in the order produced by ``hiter``. Once a candidate
    matches, every member of its group (``elem2grps.get(k)``) is skipped
    for the rest of the scan.

    Parameters:
      iSeq      -- index of the query sequence in `seqs`.
      seqs      -- all sequences.
      lseqs     -- pre-computed lengths, ``lseqs[k]`` for ``seqs[k]``.
      th        -- largest distance that still counts as a match.
      fdis      -- callable invoked as ``fdis(iSeq, k)`` (two indices).
      matches   -- lookup structure handed unchanged to ``cclust.counts``.
      elem2grps -- mapping from a sequence index to an iterable of indices
                   that need not be considered once that index matched.
      rs        -- optional mutable 4-element statistics record, updated
                   in place: rs[0] minimal accepted score, rs[1] number
                   accepted, rs[2] total of failure scores, rs[3] number of
                   failure scores recorded.

    Returns ``(matched, distances, (fails, tries, len(scans)))`` where
    `matched` and `distances` are parallel lists, `fails` is the length of
    the trailing run of consecutive misses and `tries` the number of
    `fdis` calls made.
    """
    seq = seqs[iSeq]
    # NOTE(review): `stats` is not read anywhere in this function.
    stats = True

    ## cclust.counts does this counting faster. Pure-python equivalent,
    ## kept for reference:
    ##
    ## cans = [0]*len(seqs)
    ## for i in range(len(seq)-11) :
    ##   p = seq[i:i+11]
    ##   ms = matches.get(p)
    ##   if ms :
    ##     for m in ms:
    ##       cans[m] += 1
    ## cans[iSeq] = 0
    cans = [0]*len(seqs)
    cclust.counts(seq, matches, cans)
    # A sequence is never a candidate for itself.
    cans[iSeq] = 0

    lseq = len(seq)
    # min is expensive (seriously! the volume of calls is staggering)
    # len too, that is why we pre-compute them.
    # Negated score, so that the smallest heap key is the best candidate.
    # NOTE(review): presumably relies on true division -- confirm the module
    # runs under Python 3 or has `from __future__ import division`.
    scans = [(-x/(lseq if lseq <= lseqs[k] else lseqs[k]), k) for k,x in enumerate(cans) if x > 0]
    # heap instead of full sorting, we usually need only a very small fraction of
    # the elements. Should move to faster C code. heapq is pure python
    heapq.heapify(scans)

    if rs:
        # pfails: score of the first miss in the current run of misses.
        pfails,pok = 0,0

    tries = 0
    # Sequences we do not need to consider
    alreadyMatched = set()
    fails = 0
    matched = []
    distances = []

    # should we make the q sub-order on length (same nmatches, bring shorter one first)
    # Or another way to handle multiples from longer sequences?
    for cn,k in hiter(scans):
        if k in alreadyMatched :
            continue
        tries += 1

        djk = fdis(iSeq,k)
        if djk <= th :
            matched.append(k)
            distances.append(djk)
            # A match resets the count of consecutive misses.
            fails = 0
            g = elem2grps.get(k)
            if g :
                for x in g :
                    alreadyMatched.add(x)
            if rs:
                # Track the lowest score that was still accepted.
                pok = cans[k]/min(lseqs[k],lseq)
                if pok < rs[0] :
                    rs[0] = pok
                rs[1] += 1
        else :
            if rs and fails == 0 :
                # Remember the score at which this run of misses started.
                pfails = cans[k]/min(lseqs[k],lseq)
            # rs[0] minimal accepted rs[1] n-accepted rs[2] total fails, rs[3] n-fails
            fails += 1
            # One mismatch can cause such a drop. Don't give up before that.
            # Higher scores are possible due to multiple matchings.
            if cans[k] <= lseq-21 :
                # Hard stop: too many consecutive misses (_failsHardTH is a
                # module-level constant defined outside this block).
                if fails > _failsHardTH :
                    break
                # Soft stop, only once enough statistics were gathered: the
                # score is below 99% of the minimal accepted score and below
                # the midpoint between the mean failure score and that minimum.
                if fails > 1 and rs and rs[1] > 500 and rs[3] > 500 :
                    p = cans[k]/min(lseqs[k],lseq)
                    if p < .99*rs[0] and p < (rs[2]/rs[3] + rs[0])/2 :
                        break

    # NOTE(review): recorded once per call, after the scan; pfails is still 0
    # when no miss followed a match -- confirm this placement is intended.
    if rs :
        rs[2] += pfails
        rs[3] += 1

    # NOTE(review): if hiter consumes the heap, len(scans) here is the number
    # of candidates left untested rather than the total -- verify against hiter.
    return matched,distances, (fails,tries,len(scans))