Пример #1
0
  def getMates(seq, seqs, th, fdis, matches, lim = -1, failsTH = 30) :
    """Return the candidates of ``seq`` whose distance is at most ``th``.

    Candidates are the entries of ``seqs`` that end up with a positive count
    after ``cclust.counts(seq, matches, counts)`` has filled the count list.
    They are tried in the order produced by ``hiter`` over a heap keyed on
    ``-count/min(len(seq), len(candidate))``.
    NOTE(review): presumably ``hiter`` pops the heap, i.e. the candidate with
    the highest normalised count comes first -- confirm against ``hiter``.

    seq     -- query sequence.
    seqs    -- all sequences; only their number and their lengths are used here.
    th      -- largest distance still accepted as a match.
    fdis    -- callable ``fdis(seq, k)`` giving the distance between ``seq``
               and the candidate with index ``k``.
    matches -- lookup structure handed unchanged to ``cclust.counts``.
    lim     -- stop as soon as this many matches were collected; a value
               <= 0 disables the limit.
    failsTH -- stop once more than this many candidates in a row were rejected.

    Returns ``(matched, distances)``, two parallel lists holding the accepted
    indices and their distances.
    """
    hits = [0]*len(seqs)
    cclust.counts(seq, matches, hits)

    seqLen = len(seq)
    # Negated score, so the smallest heap entry has the largest normalised count.
    queue = []
    for idx,nhits in enumerate(hits) :
      if nhits > 0 :
        queue.append((-nhits/min(seqLen, len(seqs[idx])), idx))
    heapq.heapify(queue)

    accepted = []
    dists = []
    streak = 0   # rejections since the last accepted candidate

    for _,idx in hiter(queue) :
      if lim > 0 and len(accepted) >= lim :
        break

      d = fdis(seq, idx)
      if d <= th :
        accepted.append(idx)
        dists.append(d)
        streak = 0
        continue

      streak += 1
      if streak > failsTH :
        break
    return accepted,dists
Пример #2
0
def getMates(seq, seqs, lseqs, th, fdis, matches, lim = -1, failsTH = 30) :
  """Return the candidates of ``seq`` whose distance is at most ``th``.

  The candidate queue is whatever ``cclust.counts(seq, matches, len(seqs),
  lseqs)`` returns; it is walked with ``hiternew``, which yields
  ``(score, index)`` pairs.
  NOTE(review): presumably the pairs arrive best candidate first -- confirm
  against ``cclust.counts`` / ``hiternew``.

  seq     -- query sequence.
  seqs    -- all sequences; only their number is used here.
  lseqs   -- handed unchanged to ``cclust.counts`` (by its name, the
             pre-computed sequence lengths -- TODO confirm).
  th      -- largest distance still accepted as a match.
  fdis    -- callable ``fdis(seq, k)`` giving the distance between ``seq``
             and the candidate with index ``k``.
  matches -- lookup structure handed unchanged to ``cclust.counts``.
  lim     -- stop as soon as this many matches were collected; a value
             <= 0 disables the limit.
  failsTH -- stop once more than this many candidates in a row were rejected.

  Returns ``(matched, distances)``, two parallel lists holding the accepted
  indices and their distances.
  """
  accepted = []
  dists = []
  streak = 0   # rejections since the last accepted candidate

  candidates = cclust.counts(seq, matches, len(seqs), lseqs)

  for _,idx in hiternew(candidates) :
    if lim > 0 and len(accepted) >= lim :
      break

    d = fdis(seq, idx)
    if d <= th :
      accepted.append(idx)
      dists.append(d)
      streak = 0
      continue

    streak += 1
    if streak > failsTH :
      break
  return accepted,dists
Пример #3
0
def getPotentials(iSeq, seqs, lseqs, th, fdis, matches, elem2grps, rs = None) :
  """Collect the sequences whose distance to ``seqs[iSeq]`` is at most ``th``.

  Every sequence with a positive shared-window count (see the reference loop
  in the comments below) is a candidate, scored by
  ``count/min(len(query), len(candidate))``.  Candidates are taken from a heap
  of negated scores through ``hiter``.
  NOTE(review): presumably ``hiter`` pops the heap, so the best score comes
  first -- confirm against ``hiter``.

  iSeq      -- index of the query sequence in ``seqs``; it is never offered
               as its own candidate.
  seqs      -- all sequences.
  lseqs     -- pre-computed lengths, ``lseqs[k]`` is used in place of
               ``len(seqs[k])``.
  th        -- largest distance still accepted as a match.
  fdis      -- callable ``fdis(iSeq, k)`` giving the distance between the two
               sequences identified by index.
  matches   -- lookup structure handed unchanged to ``cclust.counts``.
  elem2grps -- mapping with a ``get`` method; when ``elem2grps.get(k)`` is
               non-empty for an accepted ``k``, all of its members are skipped
               from then on.
  rs        -- optional mutable running statistics, updated in place:
               ``rs[0]`` smallest score accepted so far,
               ``rs[1]`` number of accepted candidates,
               ``rs[2]`` sum over calls of the score of the first rejection
                         in the last run of rejections (0 when a call had
                         no rejection),
               ``rs[3]`` number of calls that updated ``rs[2]``.

  Returns ``(matched, distances, (fails, tries, len(scans)))``: the accepted
  indices, their distances, and a tuple with the length of the trailing run
  of rejections, the number of ``fdis`` evaluations and ``len(scans)`` taken
  after the loop.
  NOTE(review): if ``hiter`` consumes the heap, the last value is the number
  of candidates left untried, not the total -- confirm.
  """
  seq = seqs[iSeq]

  ## cclust.counts does this counting faster
  ##
  ##   cans = [0]*len(seqs)
  ##   for i in range(len(seq)-11) :
  ##     p = seq[i:i+11]
  ##     ms = matches.get(p)
  ##     if ms :
  ##       for m in ms:
  ##         cans[m] += 1
  ##   cans[iSeq] = 0

  cans = [0]*len(seqs)
  cclust.counts(seq, matches, cans)
  cans[iSeq] = 0

  lseq = len(seq)
  # min is expensive (seriously! the volume of calls is staggering)
  # len too, that is why we pre-compute them.
  scans = [(-x/(lseq if lseq <= lseqs[k] else lseqs[k]), k)
           for k,x in enumerate(cans) if x > 0]

  # heap instead of full sorting, we usually need only a very small fraction of
  # the elements. Should move to faster C code. heapq is pure python
  heapq.heapify(scans)

  if rs:
    # Score of the first rejection in the current run; stays 0 when every
    # candidate tried in this call is accepted.
    pfails = 0
  tries = 0

  # Sequences we do not need to consider
  alreadyMatched = set()

  fails = 0
  matched = []
  distances = []

  # should we make the q sub-order on length (same nmatches, bring shorter one first)
  # Or another way to handle multiples from longer sequences?

  for cn,k in hiter(scans):
    if k in alreadyMatched :
      continue

    tries += 1

    djk = fdis(iSeq,k)
    if djk <= th :
      matched.append(k)
      distances.append(djk)

      fails = 0

      # Group mates of an accepted sequence are not tried again.
      g = elem2grps.get(k)
      if g :
        alreadyMatched.update(g)

      if rs:
        pok = cans[k]/min(lseqs[k],lseq)
        if pok < rs[0] :
          rs[0] = pok
        rs[1] += 1

    else :
      if rs and fails == 0 :
        pfails = cans[k]/min(lseqs[k],lseq)
      fails += 1

      # One mismatch can cause such a drop. Don't give up before that.
      # Higher scores are possible due to multiple matchings.
      if cans[k] <= lseq-21 :
        if fails > _failsHardTH :
          break
        # Adaptive stop, used only once both statistics have more than 500
        # samples: the score must be below 99% of the smallest accepted score
        # and below the midpoint between that and the mean first-rejection
        # score.
        if fails > 1 and rs and rs[1] > 500 and rs[3] > 500 :
          p = cans[k]/min(lseqs[k],lseq)
          if p < .99*rs[0] and p < (rs[2]/rs[3] + rs[0])/2 :
            break
  if rs :
    rs[2] += pfails
    rs[3] += 1

  return matched,distances, (fails,tries,len(scans))