Пример #1
0
def mpat(tr, seqs) :
  """Progressively align the sequences of a tree, bottom-up, trimming profiles.

  Every node visited gets ``data.seq`` set to a 2-tuple ``(profile, sequence)``:
  nodes without successors get ``(None, sequence)`` (looked up by
  ``data.taxon``), all other nodes get ``(profile, None)``.

  tr   -- tree; nodes are visited in the order produced by getPostOrder(tr) and
          every node with successors is expected to have exactly two of them.
  seqs -- anything accepted by dict(), mapping taxon name to sequence.

  Returns the first element of ``data.seq`` of the last node visited, which is
  asserted to be the root of the tree.
  """
  taxon2seq = dict(seqs)
  for n in getPostOrder(tr) :
    nodeData = n.data
    if not n.succ :
      # Tip: keep the raw sequence, no profile yet.
      nodeData.seq = (None, taxon2seq[n.data.taxon])
      continue

    left,right = [tr.node(child).data.seq for child in n.succ]

    if left[1] and right[1] :
      # Two raw sequences: pairwise alignment turned into a profile (untrimmed).
      pairAln = calign.globalAlign(left[1], right[1])
      nodeData.seq = (calign.createProfile(pairAln), None)
      continue

    # At least one side is already a profile; lift the other one if needed.
    profLeft = calign.createProfile(left[1:]) if left[1] else left[0]
    profRight = calign.createProfile(right[1:]) if right[1] else right[0]
    merged = calign.prof2profAlign(profLeft, profRight)
    nodeData.seq = (trimp(merged), None)

  assert n.id == tr.root
  return n.data.seq[0]
Пример #2
0
def guesstimateTH(seqs, nMax, thc, matches = None, scores = None, ns = 100) :
  """Estimate a distance threshold from randomly sampled pairwise distances.

  seqs   -- sequences to sample pairs from.
  nMax   -- size parameter; together with len(seqs) it fixes the geometric
            ratio q used to build the target fraction of pairs.
  scores -- passed through to calign.globalAlign.
  thc, matches, ns -- accepted for interface compatibility; not used here.

  Returns None when len(seqs) is too small relative to nMax for a ratio q in
  (0, 1) to exist, otherwise one third of the chosen quantile of the sampled
  distances.
  """
  # q = 1 - 1/denom.  A zero denominator used to raise ZeroDivisionError and a
  # negative one produced q > 1, which slipped past the q <= 0 guard and ended
  # in an out-of-range index further down.  Both mean "too few sequences".
  denom = (len(seqs)/nMax) - .5
  if denom <= 0 :
    return None
  q = 1 - 1./denom
  if q <= 0 :
    return None

  # 4000 corrected distances between randomly drawn pairs of sequences.
  d = []
  for k in range(4000) :
    x1,x2 = random.sample(seqs, 2)
    d.append(calign.globalAlign(x1,x2, report = calign.JCcorrection, scores=scores))

  # Geometrically shrinking sizes; th is the fraction of all pairs that fall
  # inside sets of those sizes.
  p = [int(nMax * q**n) for n in range(0,50)]
  th = sum([nPairs(x) for x in p])/nPairs(len(seqs))

  # Take the th-quantile of 40 random half-samples of the distances ...
  sols = []
  for i in range(40) :
    d1 = random.sample(d,2000)
    sols.append(sorted(d1)[int(th*len(d1))])

  # ... and use the value at position 36 of the 40 sorted estimates.
  sol = sorted(sols)[(40 * 9)//10]
  return sol/3
Пример #3
0
def mpa(tr, seqs, scores = defaultMatchScores, trimEnd = None) :
  """Progressively align the sequences of a tree, bottom-up.

  Every node visited gets ``data.seq`` set to a 2-tuple ``(profile, sequence)``:
  nodes without successors get ``(None, sequence)``, all other nodes get
  ``(profile, None)``.

  tr      -- tree; nodes are visited in the order produced by getPostOrder(tr)
             and every node with successors is expected to have exactly two.
  seqs    -- anything accepted by dict(), mapping taxon name to sequence.  Node
             taxon names are stripped of single quotes before the lookup.
  scores  -- passed to calign.globalAlign and calign.prof2profAlign.
  trimEnd -- when not None, each profile-to-profile result is passed through
             trimendsp(profile, trimEnd) before being stored.

  Returns the first element of ``data.seq`` of the last node visited, which is
  asserted to be the root of the tree.
  """
  taxon2seq = dict(seqs)
  for n in getPostOrder(tr) :
    nodeData = n.data
    if not n.succ :
      # Tip: keep the raw sequence, no profile yet.
      nodeData.seq = (None, taxon2seq[n.data.taxon.strip("'")])
      continue

    left,right = [tr.node(child).data.seq for child in n.succ]

    if left[1] and right[1] :
      # Two raw sequences: pairwise alignment turned into a profile.
      pairAln = calign.globalAlign(left[1], right[1], scores = scores)
      nodeData.seq = (calign.createProfile(pairAln), None)
      continue

    # At least one side is already a profile; lift the other one if needed.
    profLeft = calign.createProfile(left[1:]) if left[1] else left[0]
    profRight = calign.createProfile(right[1:]) if right[1] else right[0]
    merged = calign.prof2profAlign(profLeft, profRight, scores = scores)
    if trimEnd is not None :
      merged = trimendsp(merged, trimEnd)
    nodeData.seq = (merged, None)

  assert n.id == tr.root
  return n.data.seq[0]
Пример #4
0
def seqMultiAlign(seqs, scores = defaultMatchScores, report=False) :
  if len(seqs) < 2:
    return seqs
  
  a = calign.globalAlign(seqs[0], seqs[1], scores=scores)
  ns = 2
  p = calign.createProfile(a)

  a = tuple(iton(x) for x in a)
  
  for kk,s2 in enumerate(seqs[2:]) :
    #print ns
    # assert p == calign.createProfile(a)
    assert len(a[0]) == len(p) and \
           p[0][calign.GAP] != ns and p[-1][calign.GAP] != ns
    
    pad = 20
    if len(p)+2*pad < len(s2) :
      # enough for sequences start to align
      pad = (len(s2) - len(p))  ;                assert len(p)+2*pad >= len(s2)

    pa,extendLeft,extendRight = calign.profileAlign(s2, p, pad=pad, chop=True,
                                                    gapPenalty=defaultMatchScores.gap)

    if extendLeft > 0 or extendRight > 0 :
      gapProfile = [0,0,0,0,0,ns]
      p1 = tuple((list(gapProfile) for k in range(extendLeft))) + p \
           + tuple((list(gapProfile) for k in range(extendRight)))
    else :
      p1 = p
    
    for k,n in enumerate(pa) :
      p1[k][n] += 1
    p = p1
      
    if extendLeft > 0 or extendRight > 0 :
      fr = '-'*extendLeft
      bk = '-'*extendRight
      a = tuple((fr + x + bk for x in a))
      
    a = a + (iton(pa),)
    ns += 1

    if report and (kk+1) % 1000 == 0 :
      import sys
      print kk+1, len(a[0]),
      sys.stdout.flush()
  if report: print
  
  return a
Пример #5
0
  def getDist(i,j) :
    """Distance between clusters i and j, never less than 2*max(mhs[i], mhs[j]).

    A cluster whose mhs value is below lowDiversity is represented by its
    single consensus (getCons); otherwise by its representatives (getReps).
    The distance is the mean corrected alignment distance over all
    representative pairs.  If a consensus was used and the result is close to
    the lower limit, the estimate is refined with the full representative sets.

    The global counter acnt is increased by the number of pairs aligned.
    """
    global acnt

    mi,mj = mhs[i],mhs[j]

    iLow = mi < lowDiversity
    ri = [getCons(i)] if iLow else getReps(i)
    jLow = mj < lowDiversity
    rj = [getCons(j)] if jLow else getReps(j)
    anyCons = iLow or jLow

    nhs = len(ri)*len(rj)
    if nhs == 1 :
      h = calign.globalAlign(ri[0], rj[0], scores = defaultMatchScores,
                             report = calign.JCcorrection)
    else :
      ap = calign.allpairs(ri, rj, align=True, scores = defaultMatchScores,
                           report = calign.JCcorrection)
      h = sum([sum(row) for row in ap])/nhs
    acnt += nhs

    lowLim = 2*max(mi,mj)

    closeToLimit = h < lowLim or (h < refineUpperLimit and h < lowLim*refineFactor)
    if anyCons and closeToLimit :
      # Swap any single-element side for the full set of representatives.
      xri = ri if len(ri) != 1 else getReps(i)
      xrj = rj if len(rj) != 1 else getReps(j)

      if ri != xri or rj != xrj :
        ap1 = calign.allpairs(xri, xrj, align=True, scores = defaultMatchScores,
                              report = calign.JCcorrection)
        h1 = sum([sum(row) for row in ap1])

        xnhs = len(xri)*len(xrj)
        acnt += xnhs

        # Pooled mean over the first pass and the refinement pass.
        h = (h * nhs + h1)/(nhs + xnhs)

    return max(h, lowLim)
Пример #6
0
  def guesstimateTH(seqs, nMax, thc, matches = None, scores = None, ns = 100) :
    """Estimate a distance threshold from randomly sampled pairwise distances.

    seqs   -- sequences to sample pairs from.
    nMax   -- size parameter; together with len(seqs) it fixes the geometric
              ratio q used to build the target fraction th.
    thc    -- fallback: thc/2 is returned when no root can be bracketed.
    scores -- passed through to calign.globalAlign.
    matches, ns -- accepted for interface compatibility; not used here.

    Returns the distance t in [0, 1] at which the fraction of sampled
    distances below t equals th, or thc/2 if that fraction minus th does not
    change sign between 0 and 1.

    Two older implementations used to follow the return statement; they were
    unreachable and have been dropped.
    """
    # 2000 corrected distances between randomly drawn pairs of sequences.
    d = []
    for k in range(2000) :
      x1,x2 = random.sample(seqs, 2)
      d.append(calign.globalAlign(x1,x2, report = calign.JCcorrection, scores = scores))

    q = 1- 1./(len(seqs)/nMax)
    p = [int(nMax * q**n) for n in range(0,100)]
    th = sum([nPairs(x) for x in p])/nPairs(sum(p))/2

    def f(t) :
      # Fraction of sampled distances below t, relative to the target th.
      return (sum([x < t for x in d])/len(d)) - th

    # brentq needs a sign change over the bracket [0, 1].
    if f(0) * f(1) < 0 :
      sol = brentq(f, 0, 1)
    else :
      sol = thc/2
    return sol
Пример #7
0
def deClutter(seqs, th, correction, failsTH = 20, matches = None,
              matchScores = defaultMatchScores, verbose = None) :
  """Split the indices of seqs into singles, mutual pairs and larger groups.

  For every sequence index not yet placed, candidate partners are obtained
  from getPotentials() (presumably the sequences within distance th -- confirm
  against getPotentials).  An index with no candidates becomes a single; two
  indices that are each other's only candidate become a pair; everything else
  is collected into groups, which are merged at the end by mergeGroupings().

  seqs        -- the sequences; results refer to them by index.
  th          -- threshold handed to getPotentials().
  correction  -- if true distances are reported with calign.JCcorrection,
                 otherwise with calign.DIVERGENCE.
  failsTH     -- stored in a local but otherwise unused in this function.
  matches     -- lookup table; built with _buildLookupC(seqs) when None.
  matchScores -- alignment scores passed to calign.globalAlign.
  verbose     -- file-like object for progress output, or None for silence.

  Returns (singles, pairs, groups): a list of indices, a list of
  [index, partner, sd[0]] entries, and the merged groups sorted by
  increasing size.
  """
  if verbose:
    print >> verbose, "declutter",len(seqs),"at", "%g" % th
    # NOTE(review): tmain is never read in this function, and time.clock() no
    # longer exists from Python 3.8 on.
    tmain = time.clock()
    print >> verbose, "building lookup table ..., ",

  # Not used below.
  _failsHardTH = failsTH

  if matches is None :
    matches = _buildLookupC(seqs)
  N = len(seqs)

  if verbose:
    print >> verbose, "done."
    print >> verbose, "n n-matched #singles #pairs #groups #matched-now #new-matched"

  # Duplicate code for speed 
  if correction :
    fdis = lambda i,j : calign.globalAlign(seqs[i], seqs[j], scores = matchScores,
                                           report = calign.JCcorrection)
  else :
    fdis = lambda i,j : calign.globalAlign(seqs[i], seqs[j], scores = matchScores,
                                           report = calign.DIVERGENCE)
    
  # Statistics are always collected; with doStats on, getPotentials() results
  # are unpacked as 3-tuples whose last element holds the counters.
  doStats = True
  if doStats :
    totTries = totFails = 0

  # Not used below.
  emptys = set()
  
  # Maps an index to the group (list) it was last placed in.
  elem2grps = dict()
  lseqs = [len(s) for s in seqs]
  # rs is looked up when getp is called, so defining it after the lambda is fine.
  getp = lambda j : getPotentials(j, seqs, lseqs, th, fdis, matches, elem2grps, rs = rs)
  # 0,1 : min( seq-ident producing a match below th ), count
  # 2,3 - sum of p(first fail before terminating), count
  rs = [2,0,1,0]
  
  grps = []
  
  singles = set()
  pairs = list()
  # Every index that was ever put into a pair (entries are never removed, even
  # when the pair itself is later absorbed into a group).
  pairsassingles = set()
  # Every index already assigned to a pair or a group; these are skipped as
  # starting points in the main loop.
  paired = set()
  for jSeq in range(N) :
    if jSeq in paired :
      continue

    pp = getp(jSeq)
    if doStats :
      iCans,sd,stat = pp
      totTries += stat[1]
      totFails += stat[0]
    else:
      iCans,sd = pp

    done = False
    if len(iCans) == 0 :
      singles.add(jSeq)
      done = True
      
    # Matching a previous single happens
    elif len(iCans) == 1 and iCans[0] not in paired \
             and iCans[0] not in singles :
      # Exactly one fresh candidate: check whether the match is mutual.
      partner = iCans[0]
      pp = getp(partner)
      if doStats :
        jCans,sd,stat1 = pp
        totTries += stat1[1]
        totFails += stat1[0]
        stat = (stat[0],stat[1], stat[2] + stat1[2])
      else :
        jCans,sd = pp
      
      if len(jCans) == 1 and jCans[0] == jSeq :
        pairs.append([jSeq,partner,sd[0]])

        # probably not safe: avoid duplicates but those will not be examined
        # again for possible removal
        paired.add(jSeq)
        paired.add(partner)

        assert all([x not in singles and x not in pairsassingles for x in
                    [jSeq,partner]])
        
        pairsassingles.add(jSeq)
        pairsassingles.add(partner)
        
        done = True
      else :
        # Not mutual: pool both candidate lists (minus jSeq itself) and fall
        # through to the group handling below.
        iCans = set(jCans + iCans)
        iCans.discard(jSeq)
        iCans = list(iCans)
        assert iCans is not None

    if verbose and not done:
      print >> verbose, jSeq, len(paired), "%.4g%%" % (100*(len(paired)/len(seqs))),\
            len(singles), len(pairs), len(grps), len(iCans)+1, \
            sum([x not in paired for x in iCans])+1,\
            ("(%d %d  %d)" % (totTries,totFails,stat[2])) if doStats else ""
        
    if not done:
      # Start a new group from jSeq and its candidates.
      g = [jSeq] + iCans
        
      paired.add(jSeq)
      sRemoved = []
      for x in iCans:
        # A candidate recorded earlier as a single is promoted into the group.
        if x in singles:
          sRemoved.append(x)
        singles.discard(x)
        paired.add(x)

      if verbose and len(sRemoved) > 0:
        print >> verbose, len(sRemoved),"removed from singles",\
              '('+",".join([str(x) for x in sRemoved])+')'

      # Dissolve any pair that has a member in the group and pull both of its
      # members in.  g can grow during this loop; appended members are visited
      # as well.
      for x in g :
        if x in pairsassingles :
          for k,(a,b,d) in enumerate(pairs):
            if a == x or b == x :
              # Deleting while enumerating is safe only because of the break.
              del pairs[k]
              if a not in g :
                g.append(a)
              if b not in g :
                g.append(b)
              break
      if True:
        # Absorb the earlier groups of any member.  g may be rebound to a set
        # inside the loop; the loop keeps iterating the list it started with.
        for x in g :
          if x in elem2grps:
            if not isinstance(g, set):
              g = set(g)
            g.update(elem2grps[x])
        if isinstance(g, set) :
          g = list(g)
        for x in g :
          elem2grps[x] = g
      grps.append(g)

  if verbose:
    if doStats:
      print >> verbose, totTries, "matching alignments,", totFails,"fails."
    print >> verbose, "merging", len(grps), "groups",

  # Disabled debugging aid: dumps the raw groups to /tmp/gg.
  if 0 :
    ww =file("/tmp/gg",'w')
    for g in grps:
      for x in g:
        print >> ww, x,
      print >> ww
    ww.close()
    
  # reverse=0: smallest merged groups first.
  mgrps = sorted(mergeGroupings(grps), key = lambda x : len(x), reverse=0)
  
  if verbose:
    print >> verbose,"done, ", len(mgrps), "groups."
    
  return list(singles),pairs,mgrps
Пример #8
0
  k = len(seqs)//nMax
  if max(k,ns) > len(seqs)//50 :
    return thc/2
  sq = random.sample(seqs, max(k,ns))
  t,ds = treeFromSeqs(sq, matchScores = scores)
  del ds
  th = sorted(nodeHeights(t).values())[-(ns//k)]
  #import pdb; pdb.set_trace()
  return th
  
  if matches is None :
    #print "matches"
    matches = _buildLookupC(seqs)
  #print "done matches"
  # lseqs = [len(x) for x in seqs]
  fdis = lambda s,j : calign.globalAlign(s, seqs[j], scores = scores,
                                         report = calign.JCcorrection)

  nInClade = max((len(seqs)//nMax) + 1, nMax)
  lim = 2*nInClade
  v = []
  li = random.sample(range(len(seqs)), min(ns,len(seqs)))
  for i in li:
    p = getMates(seqs[i], seqs, .3, fdis, matches, lim = lim)
    v.append((i,sorted(p[1])))    

  
  f3 = lambda x : mean(x) if len(x) else 1
  f1 = lambda th : f3([1./x for x in [sum([x < th for x in u[1]]) for u in v] if x > 0])
  #import pdb; pdb.set_trace()
  #f1 = lambda th : mean([1/max(sum([x < th for x in u[1]]),0.0001) for u in v])
  if f1(1) >= 1./nInClade :