def main():
    """Collapse a multi-fasta file into a set of non-overlapping sequences.

    Command-line entry point. Contigs are visited from the longest to the
    shortest; a contig is kept when it is the first one or when
    ``overlapping`` reports no overlap with the contigs selected so far,
    using the matches of the input aligned against itself (``run_blat`` /
    ``parse_blat``). Kept contigs are written to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` and a summary line is
    written to stderr.

    The output file is opened once in write mode, so it is created even when
    the input holds no sequences.
    """
    usage = "usage: %prog [options]"
    desc = ("Parse multi-fasta file and report sequences without overlap "
            "with already reported sequences. Starts from the longest.")
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m", dest="minIdentity", default=90, type=int,
                      help="min identity           [%default]")
    parser.add_option("-o", dest="overlap", default=0.3, type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # parser.error() prints the message and exits
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)
    # contigs by descending length
    contigs = sorted(fastas.keys(), key=lambda x: len(fastas[x]),
                     reverse=True)

    # remove a previous result before aligning, so a failed alignment step
    # does not leave an outdated output file behind
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # execute blat vs itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # report contigs without overlap with those already selected
    added = set()
    with open(outfn, "w") as out:
        for i, c in enumerate(contigs, 1):
            if o.verbose:
                sys.stderr.write(" %3s %20s [ %7.2f kb]\n"
                                 % (i, c, len(fastas[c]) / 1000.0))
            # the first contig (the longest) is always accepted
            if added and overlapping(c, added, matches, o.overlap,
                                     o.verbose):
                continue
            added.add(c)
            out.write(">%s\n%s\n" % (c, _get_formatted_seq(fastas[c])))

    sys.stderr.write(
        "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n"
        % (len(added),
           sum([len(fastas[c]) for c in added]) / 10.0**3,
           len(fastas),
           sum([len(fastas[c]) for c in fastas]) / 10.0**3))
def sort_hits(matches, query2fasta, ref2fasta, outbase, qOverlapTh, haploid,
              monoploid, verbose):
    """Write query contigs ordered by their best reference alignment.

    For every query the reference with the largest total aligned query
    length is selected. Queries are then written to ``<outbase>.sorted.fa``
    in the order in which their best (reference, query) pair first appears in
    ``matches``. A query whose reverse-aligned length is at least its
    forward-aligned length is written reverse-complemented, with ``|rev``
    appended to its name.

    Parameters:
      matches      re-iterable collection of tuples
                   (r, rStart, rStop, q, qStart, qStop, identity); it is
                   iterated twice, so a one-shot generator will not work.
                   Every q has to be a key of query2fasta.
      query2fasta  mapping of query name to sequence; sequences have to
                   support len() and .reverse_complement().
      outbase      prefix of the output file name.
      qOverlapTh   minimal fraction of the query length that has to be
                   aligned to the best reference for the query to be
                   reported.
      ref2fasta, haploid, monoploid, verbose
                   kept for interface compatibility; not used.

    Returns None. The per-query candidate list is also printed to stdout.
    """
    # Per query and reference keep:
    # [aligned length, min ref start, max ref stop, fwd length, rev length]
    q2r = {}
    for q in query2fasta:
        q2r[q] = {}

    for r, rStart, rStop, q, qStart, qStop, identity in matches:
        qAligned = abs(qStop - qStart)
        # query coordinates running backwards mark a reverse alignment
        fwd = rev = 0
        if qStop < qStart:
            rev = qAligned
        else:
            fwd = qAligned
        if r not in q2r[q]:
            q2r[q][r] = [qAligned, rStart, rStop, fwd, rev]
            continue
        stats = q2r[q][r]
        stats[0] += qAligned
        if rStart < stats[1]:
            stats[1] = rStart
        if rStop > stats[2]:
            stats[2] = rStop
        stats[3] += fwd
        stats[4] += rev

    # get best match for each query
    q2rBest = {}
    for q in query2fasta:
        refs = sorted(q2r[q].items(), key=lambda x: x[1][0], reverse=True)
        # skip contigs without a match or with too small fraction aligned
        if not refs or refs[0][1][0] < qOverlapTh * len(query2fasta[q]):
            continue
        q2rBest[q] = refs[0]
        sys.stdout.write("%s %s\n" % (q, refs))

    with open("%s.sorted.fa" % outbase, "w") as qOut:
        for r, rStart, rStop, q, qStart, qStop, identity in matches:
            if q not in q2rBest:
                continue
            # report the query only along with its best reference
            if r != q2rBest[q][0]:
                continue
            # pop given q from dictionary so it's saved only once
            fwd, rev = q2rBest.pop(q)[1][3:5]
            if fwd > rev:
                qOut.write(">%s\n%s\n"
                           % (q, _get_formatted_seq(query2fasta[q])))
            else:
                # mostly reverse aligned (or a tie): save reverse complement
                qOut.write(">%s|rev\n%s\n" % (
                    q,
                    _get_formatted_seq(query2fasta[q].reverse_complement())))
def main():
    """Collapse a multi-fasta file into a set of non-overlapping sequences.

    Command-line entry point. Contigs are visited from the longest to the
    shortest; a contig is kept when it is the first one or when
    ``overlapping`` reports no overlap with the contigs selected so far,
    using the matches of the input aligned against itself (``run_blat`` /
    ``parse_blat``). Kept contigs are written to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` and a summary line is
    written to stderr.

    The output file is opened once in write mode, so it is created even when
    the input holds no sequences.
    """
    usage = "usage: %prog [options]"
    desc = ("Parse multi-fasta file and report sequences without overlap "
            "with already reported sequences. Starts from the longest.")
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m", dest="minIdentity", default=90, type=int,
                      help="min identity           [%default]")
    parser.add_option("-o", dest="overlap", default=0.3, type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # parser.error() prints the message and exits
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)
    # contigs by descending length
    contigs = sorted(fastas.keys(), key=lambda x: len(fastas[x]),
                     reverse=True)

    # remove a previous result before aligning, so a failed alignment step
    # does not leave an outdated output file behind
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # execute blat vs itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # report contigs without overlap with those already selected
    added = set()
    with open(outfn, "w") as out:
        for i, c in enumerate(contigs, 1):
            if o.verbose:
                sys.stderr.write(" %3s %20s [ %7.2f kb]\n"
                                 % (i, c, len(fastas[c]) / 1000.0))
            # the first contig (the longest) is always accepted
            if added and overlapping(c, added, matches, o.overlap,
                                     o.verbose):
                continue
            added.add(c)
            out.write(">%s\n%s\n" % (c, _get_formatted_seq(fastas[c])))

    sys.stderr.write(
        "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n"
        % (len(added),
           sum([len(fastas[c]) for c in added]) / 10.0**3,
           len(fastas),
           sum([len(fastas[c]) for c in fastas]) / 10.0**3))
def main():
    """Report query-target BLAST matches and save queries lacking a match.

    Command-line entry point. Matches returned by ``parse_blast`` are grouped
    by query and target. For every pair the query and target coverage of all
    its hits is summed; pairs reaching both the ``-q`` and ``-t`` cut-offs
    are written to stdout as one tab-separated row. The per-hit columns of
    that row (identity, alignment length, coordinates, E-value, score) come
    from the last hit stored for the pair.

    Queries that did not get any reported row are listed on stderr and
    written to ``<query>.nomatch.fa``.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", default="",
                      help="blast output")
    parser.add_option("-j", dest="query", default="",
                      help="query fasta")
    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0.3, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # check files; parser.error() prints the message and exits
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # get sizes of queries and targets
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # get significant matches; coverage cut-offs are passed as 0 here,
    # because they are applied below on the coverage summed per pair
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    sys.stdout.write(
        "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps"
        "\tQuery start\tQuery end\tTarget start\tTarget end\tE-value\tScore"
        "\tQuery aligned [%]\tTarget aligned [%]\t\n")

    # group hits by query and target; identity is stored with each hit so
    # the reported row does not pick up the value of an unrelated match
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps, qstart, qend,
         tstart, tend, e, score, qcov, tcov) in matches:
        hits = matches_collapsed.setdefault(qlocus, {}).setdefault(tlocus, [])
        hits.append((identity, algLen, mismatches, gaps, qstart, qend,
                     tstart, tend, e, score, qcov, tcov))

    row_format = "\t".join(["%s"] * 12 + ["%.1f", "%.1f"]) + "\n"
    matched_queries = set()
    for qlocus in sorted(matches_collapsed):
        for tlocus in sorted(matches_collapsed[qlocus]):
            hits = matches_collapsed[qlocus][tlocus]
            qCov = sum(hit[-2] for hit in hits)
            tCov = sum(hit[-1] for hit in hits)
            if qCov < o.qcov or tCov < o.tcov:
                continue
            # per-hit columns are taken from the last hit of the pair
            sys.stdout.write(
                row_format % ((qlocus, tlocus) + hits[-1][:10]
                              + (qCov * 100, tCov * 100)))
            matched_queries.add(qlocus)

    # get queries with no valid match
    sys.stderr.write("Queries without valid matches:\n")
    i = 0
    with open(o.query + ".nomatch.fa", "w") as out:
        with open(o.query) as handle:
            for r in SeqIO.parse(handle, "fasta"):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i, r.id, len(r.seq))
                # show hits that were found but did not pass the cut-offs
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                # save fasta
                out.write(">%s\n%s\n" % (r.id, _get_formatted_seq(r.seq)))
def main():
    """Report query-target BLAST matches and save queries lacking a match.

    Command-line entry point. Matches returned by ``parse_blast`` are grouped
    by query and target. For every pair the query and target coverage of all
    its hits is summed; pairs reaching both the ``-q`` and ``-t`` cut-offs
    are written to stdout as one tab-separated row. The per-hit columns of
    that row (identity, alignment length, coordinates, E-value, score) come
    from the last hit stored for the pair.

    Queries that did not get any reported row are listed on stderr and
    written to ``<query>.nomatch.fa``.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", default="",
                      help="blast output")
    parser.add_option("-j", dest="query", default="",
                      help="query fasta")
    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0.3, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # check files; parser.error() prints the message and exits
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # get sizes of queries and targets
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # get significant matches; coverage cut-offs are passed as 0 here,
    # because they are applied below on the coverage summed per pair
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    sys.stdout.write(
        "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps"
        "\tQuery start\tQuery end\tTarget start\tTarget end\tE-value\tScore"
        "\tQuery aligned [%]\tTarget aligned [%]\t\n")

    # group hits by query and target; identity is stored with each hit so
    # the reported row does not pick up the value of an unrelated match
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps, qstart, qend,
         tstart, tend, e, score, qcov, tcov) in matches:
        hits = matches_collapsed.setdefault(qlocus, {}).setdefault(tlocus, [])
        hits.append((identity, algLen, mismatches, gaps, qstart, qend,
                     tstart, tend, e, score, qcov, tcov))

    row_format = "\t".join(["%s"] * 12 + ["%.1f", "%.1f"]) + "\n"
    matched_queries = set()
    for qlocus in sorted(matches_collapsed):
        for tlocus in sorted(matches_collapsed[qlocus]):
            hits = matches_collapsed[qlocus][tlocus]
            qCov = sum(hit[-2] for hit in hits)
            tCov = sum(hit[-1] for hit in hits)
            if qCov < o.qcov or tCov < o.tcov:
                continue
            # per-hit columns are taken from the last hit of the pair
            sys.stdout.write(
                row_format % ((qlocus, tlocus) + hits[-1][:10]
                              + (qCov * 100, tCov * 100)))
            matched_queries.add(qlocus)

    # get queries with no valid match
    sys.stderr.write("Queries without valid matches:\n")
    i = 0
    with open(o.query + ".nomatch.fa", "w") as out:
        with open(o.query) as handle:
            for r in SeqIO.parse(handle, "fasta"):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i, r.id, len(r.seq))
                # show hits that were found but did not pass the cut-offs
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                # save fasta
                out.write(">%s\n%s\n" % (r.id, _get_formatted_seq(r.seq)))
def sort_hits(matches, query2fasta, ref2fasta, outbase, qOverlapTh, haploid,
              monoploid, verbose):
    """Write query contigs ordered by their best reference alignment.

    For every query the reference with the largest total aligned query
    length is selected. Queries are then written to ``<outbase>.sorted.fa``
    in the order in which their best (reference, query) pair first appears in
    ``matches``. A query whose reverse-aligned length is at least its
    forward-aligned length is written reverse-complemented, with ``|rev``
    appended to its name.

    Parameters:
      matches      re-iterable collection of tuples
                   (r, rStart, rStop, q, qStart, qStop, identity); it is
                   iterated twice, so a one-shot generator will not work.
                   Every q has to be a key of query2fasta.
      query2fasta  mapping of query name to sequence; sequences have to
                   support len() and .reverse_complement().
      outbase      prefix of the output file name.
      qOverlapTh   minimal fraction of the query length that has to be
                   aligned to the best reference for the query to be
                   reported.
      ref2fasta, haploid, monoploid, verbose
                   kept for interface compatibility; not used.

    Returns None. The per-query candidate list is also printed to stdout.
    """
    # Per query and reference keep:
    # [aligned length, min ref start, max ref stop, fwd length, rev length]
    q2r = {}
    for q in query2fasta:
        q2r[q] = {}

    for r, rStart, rStop, q, qStart, qStop, identity in matches:
        qAligned = abs(qStop - qStart)
        # query coordinates running backwards mark a reverse alignment
        fwd = rev = 0
        if qStop < qStart:
            rev = qAligned
        else:
            fwd = qAligned
        if r not in q2r[q]:
            q2r[q][r] = [qAligned, rStart, rStop, fwd, rev]
            continue
        stats = q2r[q][r]
        stats[0] += qAligned
        if rStart < stats[1]:
            stats[1] = rStart
        if rStop > stats[2]:
            stats[2] = rStop
        stats[3] += fwd
        stats[4] += rev

    # get best match for each query
    q2rBest = {}
    for q in query2fasta:
        refs = sorted(q2r[q].items(), key=lambda x: x[1][0], reverse=True)
        # skip contigs without a match or with too small fraction aligned
        if not refs or refs[0][1][0] < qOverlapTh * len(query2fasta[q]):
            continue
        q2rBest[q] = refs[0]
        sys.stdout.write("%s %s\n" % (q, refs))

    with open("%s.sorted.fa" % outbase, "w") as qOut:
        for r, rStart, rStop, q, qStart, qStop, identity in matches:
            if q not in q2rBest:
                continue
            # report the query only along with its best reference
            if r != q2rBest[q][0]:
                continue
            # pop given q from dictionary so it's saved only once
            fwd, rev = q2rBest.pop(q)[1][3:5]
            if fwd > rev:
                qOut.write(">%s\n%s\n"
                           % (q, _get_formatted_seq(query2fasta[q])))
            else:
                # mostly reverse aligned (or a tie): save reverse complement
                qOut.write(">%s|rev\n%s\n" % (
                    q,
                    _get_formatted_seq(query2fasta[q].reverse_complement())))