示例#1
0
def main():
    """Collapse a multi-fasta file to sequences that do not overlap each other.

    Command-line entry point. The input is aligned against itself with
    ``run_blat`` / ``parse_blat``; contigs are then visited from the longest
    to the shortest and a contig is kept unless ``overlapping`` reports it
    against the contigs kept so far. Kept contigs are written to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` and a summary line
    goes to stderr.

    Exits through ``parser.error`` when the input file is missing.
    """
    usage = "usage: %prog [options]"
    desc = """Parse multi-fasta file and report sequences without overlap
with already reported sequences. Starts from the longest."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m", dest="minIdentity", default=90, type=int,
                      help="min identity           [%default]")
    parser.add_option("-o", dest="overlap", default=0.3, type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # parser.error() prints the message and terminates the program
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)

    # contigs by descending length
    contigs = sorted(fastas.keys(), key=lambda x: len(fastas[x]), reverse=True)

    # remove a stale outfile before blat runs, so a failed run leaves none
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # execute blat vs itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # The output is opened once for the whole run instead of being reopened
    # in append mode per contig; the handle is closed even if a helper raises.
    # NOTE: an input without any sequence now yields an empty output file.
    added = set()
    with open(outfn, "w") as out:
        for i, c in enumerate(contigs, 1):
            if o.verbose:
                sys.stderr.write(" %3s %20s [ %7.2f kb]\n" %
                                 (i, c, len(fastas[c]) / 1000.0))
            # the first (longest) contig is always kept; later ones only if
            # they do not overlap anything already kept
            if added and overlapping(c, added, matches, o.overlap, o.verbose):
                continue
            added.add(c)
            out.write(">%s\n%s\n" % (c, _get_formatted_seq(fastas[c])))

    sys.stderr.write(
        "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n" %
        (len(added), sum([len(fastas[c]) for c in added]) / 10.0**3,
         len(fastas), sum([len(fastas[c]) for c in fastas]) / 10.0**3))
示例#2
0
def sort_hits(matches, query2fasta, ref2fasta, outbase, qOverlapTh, haploid,
              monoploid, verbose):
    """Write query contigs in the order of their best reference alignment.

    For every query the reference with the largest summed aligned length is
    selected. Queries without any match, or whose best reference covers less
    than ``qOverlapTh * len(query)``, are dropped. The remaining queries are
    written to ``<outbase>.sorted.fa`` in the order in which a match to their
    best reference first appears in ``matches``. A query is written as its
    reverse complement (header suffixed with ``|rev``) unless its
    forward-aligned length is strictly larger than its reverse-aligned one.

    matches     -- (ref, rStart, rStop, query, qStart, qStop, identity)
                   tuples; iterated twice, so a one-shot generator will
                   produce an empty output
    query2fasta -- query name -> sequence; every query named in ``matches``
                   must be a key here (KeyError otherwise). Sequences need
                   ``len()`` and, for reverse hits, ``reverse_complement()``
    outbase     -- prefix of the output file name
    qOverlapTh  -- minimal fraction of the query that has to be aligned
    verbose     -- when true, the ranked references of each kept query are
                   reported on stderr
    ref2fasta, haploid, monoploid -- currently unused; kept so existing
                   callers keep working

    Returns None.
    """
    # per query and reference:
    # [aligned length, min rStart, max rStop, fwd aligned, rev aligned]
    q2r = dict((q, {}) for q in query2fasta)
    for r, rStart, rStop, q, qStart, qStop, identity in matches:
        qAligned = abs(qStop - qStart)
        # query coordinates running backwards mark a reverse alignment
        if qStop < qStart:
            fwd, rev = 0, qAligned
        else:
            fwd, rev = qAligned, 0
        stats = q2r[q].get(r)
        if stats is None:
            q2r[q][r] = [qAligned, rStart, rStop, fwd, rev]
            continue
        stats[0] += qAligned
        stats[1] = min(stats[1], rStart)
        stats[2] = max(stats[2], rStop)
        stats[3] += fwd
        stats[4] += rev

    # best reference (largest aligned length) for each query
    q2rBest = {}
    for q in query2fasta:
        refs = sorted(q2r[q].items(), key=lambda x: x[1][0], reverse=True)
        # skip contigs without a match or with too small fraction aligned
        if not refs or refs[0][1][0] < qOverlapTh * len(query2fasta[q]):
            continue
        q2rBest[q] = refs[0]
        if verbose:
            sys.stderr.write("%s %s\n" % (q, refs))

    with open("%s.sorted.fa" % outbase, "w") as qOut:
        for r, rStart, rStop, q, qStart, qStop, identity in matches:
            best = q2rBest.get(q)
            # only the match to the best reference triggers the output
            if best is None or r != best[0]:
                continue
            # forget the query so that it is saved only once
            del q2rBest[q]
            fwd, rev = best[1][3], best[1][4]
            if fwd > rev:
                qOut.write(">%s\n%s\n" %
                           (q, _get_formatted_seq(query2fasta[q])))
            else:
                # mostly (or equally) reverse aligned: save reverse complement
                qOut.write(
                    ">%s|rev\n%s\n" %
                    (q,
                     _get_formatted_seq(query2fasta[q].reverse_complement())))
    # NOTE: an unfinished draft that split the contigs into monoploid sets
    # used to follow an unconditional return here; it could never run and
    # unpacked the 5-item stats list into 3 names, so it has been removed.
    return
示例#3
0
def main():
    """Collapse a multi-fasta file to sequences that do not overlap each other.

    Command-line entry point. The input is aligned against itself with
    ``run_blat`` / ``parse_blat``; contigs are then visited from the longest
    to the shortest and a contig is kept unless ``overlapping`` reports it
    against the contigs kept so far. Kept contigs are written to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` and a summary line
    goes to stderr.

    Exits through ``parser.error`` when the input file is missing.
    """
    usage = "usage: %prog [options]"
    desc = """Parse multi-fasta file and report sequences without overlap
with already reported sequences. Starts from the longest."""
    epilog = ""
    parser = OptionParser(usage=usage,
                          version="%prog 1.0",
                          description=desc,
                          epilog=epilog)

    parser.add_option("-i",
                      dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m",
                      dest="minIdentity",
                      default=90,
                      type=int,
                      help="min identity           [%default]")
    parser.add_option("-o",
                      dest="overlap",
                      default=0.3,
                      type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # parser.error() prints the message and terminates the program
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)

    # contigs by descending length
    contigs = sorted(fastas.keys(), key=lambda x: len(fastas[x]), reverse=True)

    # remove a stale outfile before blat runs, so a failed run leaves none
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # execute blat vs itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # The output is opened once for the whole run instead of being reopened
    # in append mode per contig; the handle is closed even if a helper raises.
    # NOTE: an input without any sequence now yields an empty output file.
    added = set()
    with open(outfn, "w") as out:
        for i, c in enumerate(contigs, 1):
            if o.verbose:
                sys.stderr.write(" %3s %20s [ %7.2f kb]\n" %
                                 (i, c, len(fastas[c]) / 1000.0))
            # the first (longest) contig is always kept; later ones only if
            # they do not overlap anything already kept
            if added and overlapping(c, added, matches, o.overlap, o.verbose):
                continue
            added.add(c)
            out.write(">%s\n%s\n" % (c, _get_formatted_seq(fastas[c])))

    sys.stderr.write(
        "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n" %
        (len(added), sum([len(fastas[c]) for c in added]) / 10.0**3,
         len(fastas), sum([len(fastas[c]) for c in fastas]) / 10.0**3))
示例#4
0
def main():
    """Print blast hits that pass the coverage cut-offs; save unmatched queries.

    Command-line entry point. Hits returned by ``parse_blast`` are grouped by
    (query, target) pair and the per-hit query/target coverages of each pair
    are summed. A pair is printed to stdout as one tab-separated row when the
    summed coverages reach ``-q`` and ``-t``; the per-hit columns of that row
    (identity, alignment length, coordinates, E-value, score) are taken from
    the last hit stored for the pair. Queries without any printed pair are
    listed on stderr and written to ``<query>.nomatch.fa``.

    Exits through ``parser.error`` when an input file is missing.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", default="",
                      help="blast output")
    parser.add_option("-j", dest="query", default="",
                      help="query fasta")
    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0.3, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    #parser.add_option("-s", dest="tsplit", default=3, type=int,
    #                  help="split target name by '|' and print s postition [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # check files; parser.error() prints the message and terminates
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # get sizes of queries and targets
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # get significant matches
    # NOTE(review): the two zeros are presumably per-hit coverage cut-offs,
    # disabled because coverage is filtered per pair below -- TODO confirm.
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    header = "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\tQuery start\tQuery end\tTarget start\tTarget end\tE-value\tScore\tQuery aligned [%]\tTarget aligned [%]\t"
    sys.stdout.write(header + "\n")

    # Group hits by query and target. Identity is stored with every hit:
    # previously the report reused the identity of the last hit of the whole
    # input for every row.
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps, qstart, qend,
         tstart, tend, e, score, qcov, tcov) in matches:
        hsps = matches_collapsed.setdefault(qlocus, {}).setdefault(tlocus, [])
        hsps.append((identity, algLen, mismatches, gaps, qstart, qend,
                     tstart, tend, e, score, qcov, tcov))

    # report pairs whose summed coverage passes both cut-offs
    matched_queries = set()
    row_format = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\t%.1f\n"
    for qlocus in sorted(matches_collapsed):
        for tlocus in sorted(matches_collapsed[qlocus]):
            hsps = matches_collapsed[qlocus][tlocus]
            qCov = sum(hsp[-2] for hsp in hsps)
            tCov = sum(hsp[-1] for hsp in hsps)
            if qCov < o.qcov or tCov < o.tcov:
                continue
            # per-hit columns (all fields but the two coverages) come from
            # the last hit of the pair
            row = (qlocus, tlocus) + hsps[-1][:-2] + (qCov * 100, tCov * 100)
            sys.stdout.write(row_format % row)
            matched_queries.add(qlocus)

    # queries with no valid match
    sys.stderr.write("Queries without valid matches:\n")
    i = 0
    with open(o.query) as handle:
        with open(o.query + ".nomatch.fa", "w") as nomatch:
            for r in SeqIO.parse(handle, "fasta"):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i, r.id, len(r.seq))
                # show the rejected hits of queries that had some
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                # save fasta
                nomatch.write(">%s\n%s\n" %
                              (r.id, _get_formatted_seq(r.seq)))
示例#5
0
def main():
    """Print blast hits that pass the coverage cut-offs; save unmatched queries.

    Command-line entry point. Hits returned by ``parse_blast`` are grouped by
    (query, target) pair and the per-hit query/target coverages of each pair
    are summed. A pair is printed to stdout as one tab-separated row when the
    summed coverages reach ``-q`` and ``-t``; the per-hit columns of that row
    (identity, alignment length, coordinates, E-value, score) are taken from
    the last hit stored for the pair. Queries without any printed pair are
    listed on stderr and written to ``<query>.nomatch.fa``.

    Exits through ``parser.error`` when an input file is missing.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage,
                          version="%prog 1.0",
                          description=desc,
                          epilog=epilog)

    parser.add_option("-i", dest="infile", default="", help="blast output")
    parser.add_option("-j", dest="query", default="", help="query fasta")
    parser.add_option("-k", dest="target", default="", help="target fasta")
    parser.add_option("-e",
                      dest="evalue",
                      default=1e-05,
                      type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q",
                      dest="qcov",
                      default=0.3,
                      type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t",
                      dest="tcov",
                      default=0,
                      type=float,
                      help="target coverage [%default]")
    #parser.add_option("-s", dest="tsplit", default=3, type=int,
    #                  help="split target name by '|' and print s postition [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # check files; parser.error() prints the message and terminates
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # get sizes of queries and targets
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # get significant matches
    # NOTE(review): the two zeros are presumably per-hit coverage cut-offs,
    # disabled because coverage is filtered per pair below -- TODO confirm.
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    header = "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\tQuery start\tQuery end\tTarget start\tTarget end\tE-value\tScore\tQuery aligned [%]\tTarget aligned [%]\t"
    sys.stdout.write(header + "\n")

    # Group hits by query and target. Identity is stored with every hit:
    # previously the report reused the identity of the last hit of the whole
    # input for every row.
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps, qstart, qend,
         tstart, tend, e, score, qcov, tcov) in matches:
        hsps = matches_collapsed.setdefault(qlocus, {}).setdefault(tlocus, [])
        hsps.append((identity, algLen, mismatches, gaps, qstart, qend,
                     tstart, tend, e, score, qcov, tcov))

    # report pairs whose summed coverage passes both cut-offs
    matched_queries = set()
    row_format = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\t%.1f\n"
    for qlocus in sorted(matches_collapsed):
        for tlocus in sorted(matches_collapsed[qlocus]):
            hsps = matches_collapsed[qlocus][tlocus]
            qCov = sum(hsp[-2] for hsp in hsps)
            tCov = sum(hsp[-1] for hsp in hsps)
            if qCov < o.qcov or tCov < o.tcov:
                continue
            # per-hit columns (all fields but the two coverages) come from
            # the last hit of the pair
            row = (qlocus, tlocus) + hsps[-1][:-2] + (qCov * 100, tCov * 100)
            sys.stdout.write(row_format % row)
            matched_queries.add(qlocus)

    # queries with no valid match
    sys.stderr.write("Queries without valid matches:\n")
    i = 0
    with open(o.query) as handle:
        with open(o.query + ".nomatch.fa", "w") as nomatch:
            for r in SeqIO.parse(handle, "fasta"):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i, r.id, len(r.seq))
                # show the rejected hits of queries that had some
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                # save fasta
                nomatch.write(">%s\n%s\n" %
                              (r.id, _get_formatted_seq(r.seq)))
示例#6
0
def sort_hits( matches,query2fasta,ref2fasta,outbase,qOverlapTh,haploid,monoploid,verbose ):
    """Write query contigs in the order of their best reference alignment.

    For every query the reference with the largest summed aligned length is
    selected. Queries without any match, or whose best reference covers less
    than ``qOverlapTh * len(query)``, are dropped. The remaining queries are
    written to ``<outbase>.sorted.fa`` in the order in which a match to their
    best reference first appears in ``matches``. A query is written as its
    reverse complement (header suffixed with ``|rev``) unless its
    forward-aligned length is strictly larger than its reverse-aligned one.

    matches     -- (ref, rStart, rStop, query, qStart, qStop, identity)
                   tuples; iterated twice, so a one-shot generator will
                   produce an empty output
    query2fasta -- query name -> sequence; every query named in ``matches``
                   must be a key here (KeyError otherwise). Sequences need
                   ``len()`` and, for reverse hits, ``reverse_complement()``
    outbase     -- prefix of the output file name
    qOverlapTh  -- minimal fraction of the query that has to be aligned
    verbose     -- when true, the ranked references of each kept query are
                   reported on stderr
    ref2fasta, haploid, monoploid -- currently unused; kept so existing
                   callers keep working

    Returns None.
    """
    #per query and reference:
    #[aligned length, min rStart, max rStop, fwd aligned, rev aligned]
    q2r = dict( (q, {}) for q in query2fasta )
    for r,rStart,rStop,q,qStart,qStop,identity in matches:
        qAligned = abs( qStop-qStart )
        #query coordinates running backwards mark a reverse alignment
        if qStop<qStart:
            fwd, rev = 0, qAligned
        else:
            fwd, rev = qAligned, 0
        stats = q2r[q].get( r )
        if stats is None:
            q2r[q][r] = [qAligned,rStart,rStop,fwd,rev]
            continue
        stats[0] += qAligned
        stats[1]  = min( stats[1],rStart )
        stats[2]  = max( stats[2],rStop )
        stats[3] += fwd
        stats[4] += rev

    #best reference (largest aligned length) for each query
    q2rBest = {}
    for q in query2fasta:
        refs = sorted( q2r[q].items(), key=lambda x: x[1][0], reverse=True )
        #skip contigs without a match or with too small fraction aligned
        if not refs or refs[0][1][0] < qOverlapTh * len(query2fasta[q]):
            continue
        q2rBest[q] = refs[0]
        if verbose:
            sys.stderr.write( "%s %s\n" % ( q,refs ) )

    with open( "%s.sorted.fa" % outbase,"w" ) as qOut:
        for r,rStart,rStop,q,qStart,qStop,identity in matches:
            best = q2rBest.get( q )
            #only the match to the best reference triggers the output
            if best is None or r != best[0]:
                continue
            #forget the query so that it is saved only once
            del q2rBest[q]
            fwd, rev = best[1][3], best[1][4]
            if fwd>rev:
                qOut.write( ">%s\n%s\n" % ( q, _get_formatted_seq( query2fasta[q] ) ) )
            else:
                #mostly (or equally) reverse aligned: save reverse complement
                qOut.write( ">%s|rev\n%s\n" % ( q, _get_formatted_seq( query2fasta[q].reverse_complement() ) ) )
    #NOTE: an unfinished draft that split the contigs into monoploid sets
    #used to follow an unconditional return here; it could never run and
    #unpacked the 5-item stats list into 3 names, so it has been removed.
    return