def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="filename with peptide sequences [Default=%default]")

    parser.add_option("-f", "--format", dest="format",
                      type="string",
                      help="output format [Default=%default]")

    parser.add_option("-e", "--expand", dest="expand",
                      action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")

    parser.add_option("-m", "--map", dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")

    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")

    parser.add_option("--one-based-coordinates", dest="one_based_coordinates",
                      action="store_true",
                      help="expect one-based coordinates. The default is zero-based coordinates [Default=%default].")

    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")

    parser.add_option("-g", "--no-gaps", dest="no_gaps",
                      action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")

    parser.add_option("-x", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")

    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")

    parser.add_option("--min-length", dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")

    parser.add_option("--filter", dest="filename_filter",
                      type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(
        filename_sequences=None,
        filename_exons=None,
        filename_map=None,
        filename_outfile=None,
        no_gaps=False,
        format="fasta",
        expand=False,
        require_codons=False,
        no_identical=False,
        min_length=0,
        report_step=100,
        one_based_coordinates=False,
        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):

            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator(infile)

            # records come in pairs: query followed by sbjct
            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break
                record1 = cur_record

                cur_record = iterator.next()
                if cur_record is None:
                    break
                record2 = cur_record

                identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)

                if id not in map_pair2hids:
                    map_pair2hids[id] = []

                map_pair2hids[id].append(s)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0
    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()

        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: row=%i col=%i ali=%s" %
                (link.mQueryToken, link.mSbjctToken,
                 row_seq.getLength(), col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mQueryToken]))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mSbjctToken]))
                options.stdlog.write("#\n%s\n" %
                                     alignlib_lite.py_AlignmentFormatExplicit(
                                         map_row2col, row_seq, col_seq))

                raise ValueError("incomplete codons %i in pair %s - %s" %
                                 (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at
                # positions different from mod 3. The problem is that I
                # don't know where the frameshifts occur exactly. The exon
                # boundaries are given with respect to the cds, which
                # includes the frameshifts. Unfortunately, phase information
                # seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(
                    tmp1_map_row2col, map_row2col,
                    from1 + 1, to1,
                    from2 + 1, to2)

                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            map(lambda x, y: "%s=%i" % (x, y),
                counts.keys(), counts.values())))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
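
# GetAdjustedBoundaries is called in the exon loop above but is not shown in
# this excerpt. Below is a minimal sketch of the codon-boundary snapping the
# comment describes, assuming zero-based half-open cds coordinates and a
# hypothetical exon record with attributes mPeptideFrom/mPeptideTo (names are
# assumptions, not the confirmed Exons API):


def get_adjusted_boundaries_sketch(exon_id, exons):
    """Snap an exon's cds range to complete codons (illustrative sketch)."""
    exon = exons[exon_id]
    start, end = exon.mPeptideFrom, exon.mPeptideTo
    # round the start up to the next codon boundary
    start += (3 - start % 3) % 3
    # round the end down to the last complete codon boundary
    end -= end % 3
    return start, end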

if param_loglevel >= 1:
    print "# read %i cds" % (len(cds))
    sys.stdout.flush()

ninput, npairs, nskipped = 0, 0, 0

for line in sys.stdin:

    if line[0] == "#":
        continue

    # echo FASTA-style header lines unchanged
    if line[0] == ">":
        print line[:-1]
        continue

    ninput += 1
    link = BlastAlignments.Link()
    link.Read(line)

    # discard self links
    if link.mQueryToken == link.mSbjctToken:
        continue

    keep = 1
    if link.mQueryToken in cds and link.mSbjctToken in cds:
        is_paralog, reason = IsParalogLink(
            link, cds[link.mQueryToken], cds[link.mSbjctToken])
        if is_paralog:
            keep = 0
            if param_loglevel >= 2:
                print "# DISCARDED because %s: %s" % (reason, str(link))
    else:
        # the original fragment was truncated here; assumed completion:
        # links without cds information are counted as skipped
        nskipped += 1

    if keep:
        print str(link)
        npairs += 1
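
# IsParalogLink is called above but is not defined in this excerpt. Below is
# a minimal sketch of one plausible test, assuming each cds record carries
# genomic location attributes (mSbjctToken, mSbjctStrand, mSbjctGenomeFrom,
# mSbjctGenomeTo are assumptions, not the confirmed API):


def is_paralog_link_sketch(link, cds1, cds2):
    """Return (is_paralog, reason); flag pairs whose cds overlap on the
    same contig and strand (illustrative sketch only)."""
    if cds1.mSbjctToken == cds2.mSbjctToken and \
       cds1.mSbjctStrand == cds2.mSbjctStrand and \
       min(cds1.mSbjctGenomeTo, cds2.mSbjctGenomeTo) > \
       max(cds1.mSbjctGenomeFrom, cds2.mSbjctGenomeFrom):
        return True, "genomic overlap"
    return False, None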

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # the parser construction was missing in the original; minimal
    # reconstruction following the other scripts in this repository
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n", "--vertices", dest="vertices", action="append",
                      help="filename with vertices.")

    parser.add_option("-e", "--extra", dest="filename_extra", type="string",
                      help="filename to store extra vertices in.")

    parser.add_option("-m", "--missed", dest="filename_missed", type="string",
                      help="filename to store missed vertices in.")

    parser.set_defaults(
        vertices=[],
        filename_extra=None,
        filename_missed=None,
    )

    (options, args) = E.Start(parser)

    if len(options.vertices) == 0:
        raise ValueError("please specify at least one set of vertices.")

    vertices = {}
    missed_queries = []
    nvertices = [0] * len(options.vertices)

    for x in range(len(options.vertices)):
        f = options.vertices[x]
        vv = map(lambda x: x[:-1].split("\t")[0],
                 filter(lambda x: x[0] != "#",
                        open(f, "r").readlines()))
        nvertices[x] = len(vv)
        missed_queries.append([])
        for v in vv:
            vertices[v] = [x, 0, 0]

        if options.loglevel >= 1:
            print "# read %i vertices from %s" % (len(vv), f)
            sys.stdout.flush()

    l = BlastAlignments.Link()

    # count how often each vertex appears as query/sbjct in the links
    extra_vertices = {}
    for line in sys.stdin:
        if line[0] == "#":
            continue

        l.Read(line)

        if l.mQueryToken in vertices:
            vertices[l.mQueryToken][1] += 1
        else:
            extra_vertices[l.mQueryToken] = 1

        if l.mSbjctToken in vertices:
            vertices[l.mSbjctToken][2] += 1
        else:
            extra_vertices[l.mSbjctToken] = 1

    found_queries = [0] * len(options.vertices)
    found_sbjcts = [0] * len(options.vertices)
    for v, vv in vertices.items():
        index, nquery, nsbjct = vv
        if nquery:
            found_queries[index] += 1
        else:
            missed_queries[index].append(v)
        if nsbjct:
            found_sbjcts[index] += 1

    headers = ("set", "name", "tvertex", "nmissed", "pmissed",
               "nquery", "pquery", "nsbjct", "psbjct")
    print "\t".join(headers)

    for x in range(len(options.vertices)):
        print "%i\t%s\t%i\t%i\t%5.2f\t%i\t%5.2f\t%i\t%5.2f" % (
            x,
            options.vertices[x],
            nvertices[x],
            len(missed_queries[x]),
            100 * float(len(missed_queries[x])) / nvertices[x],
            found_queries[x],
            100 * float(found_queries[x]) / nvertices[x],
            found_sbjcts[x],
            100 * float(found_sbjcts[x]) / nvertices[x])

    print "//"
    print "%i vertices not in set" % len(extra_vertices)

    if options.filename_extra and len(extra_vertices) > 0:
        outfile = open(options.filename_extra, "w")
        for x in extra_vertices.keys():
            outfile.write("%s\n" % x)
        outfile.close()

    if options.filename_missed:
        outfile = open(options.filename_missed, "w")
        for x in range(len(options.vertices)):
            for y in missed_queries[x]:
                outfile.write("%i\t%s\t%s\n" % (x, options.vertices[x], y))
        outfile.close()

    E.Stop()

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph_map_links.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-q", "--map-query", dest="filename_map_query",
                      type="string",
                      help="filename with queries to map.")

    parser.add_option("-s", "--map-sbjct", dest="filename_map_sbjct",
                      type="string",
                      help="filename with sbjcts to map.")

    parser.add_option("-m", "--multiple", dest="multiple",
                      action="store_true",
                      help="map multiple options [%default].")

    parser.add_option("-k", "--keep-unmapped", dest="keep_unmapped",
                      action="store_true",
                      help="keep unmapped entries [%default].")

    parser.add_option("-i", "--map-identity", dest="map_identity",
                      action="store_true",
                      help="map by identifier [%default].")

    parser.add_option("-n", "--non-redundant", dest="non_redundant",
                      action="store_true",
                      help="write only unique links (requires a lot of memory for large graphs) [%default]")

    parser.set_defaults(
        filename_map_query=None,
        filename_map_sbjct=None,
        multiple=False,
        keep_unmapped=False,
        map_identity=False,
        report_step=1000000,
        non_redundant=False)

    (options, args) = E.Start(parser)

    if options.filename_map_query:
        infile = IOTools.openFile(options.filename_map_query, "r")
        if options.map_identity:
            map_query = readIdentityMap(infile)
        else:
            map_query = BlastAlignments.ReadMap(infile, options.multiple)
        infile.close()
        E.info('read maps for %i queries' % len(map_query))
    else:
        map_query = None

    if options.filename_map_sbjct:
        if options.filename_map_sbjct == options.filename_map_query:
            map_sbjct = map_query
        else:
            infile = IOTools.openFile(options.filename_map_sbjct, "r")
            if options.map_identity:
                map_sbjct = readIdentityMap(infile)
            else:
                map_sbjct = BlastAlignments.ReadMap(infile, options.multiple)
            infile.close()
            E.info('read maps for %i sbjcts' % len(map_sbjct))
    else:
        map_sbjct = None

    nfailed = 0
    ninput = 0
    nskipped = 0
    noutput = 0

    # number of identical/mapped links
    nsame, nmapped = 0, 0

    printed = {}

    alignment = BlastAlignments.Map()

    for line in options.stdin:
        if line[0] == "#":
            continue

        data = line[:-1].split("\t")
        alignment.Read(line)
        skip = False
        ninput += 1

        E.debug(str(alignment))

        if options.loglevel >= 2 and ninput % options.report_step == 0:
            options.stderr.write("# progress: ninput=%i, noutput=%i, nhash=%i\n" %
                                 (ninput, noutput, len(printed)))

        if options.multiple:
            skip = False
            if map_query is not None:
                if alignment.mQueryToken in map_query:
                    mq = map_query[alignment.mQueryToken]
                else:
                    skip = True
            else:
                mq = [None]

            if map_sbjct is not None:
                if alignment.mSbjctToken in map_sbjct:
                    ms = map_sbjct[alignment.mSbjctToken]
                else:
                    skip = True
            else:
                ms = [None]

            if skip:
                nskipped += 1
                continue

            if options.map_identity:

                # only if non_redundant is set, do global comparison
                if not options.non_redundant:
                    printed = {}

                new_map = alignment.GetClone()

                do_redundant = len(mq) > 1 or len(ms) > 1

                for q in mq:
                    for s in ms:
                        new_map.mQueryToken = q
                        new_map.mSbjctToken = s

                        # check for non-redundant links for 1:many or
                        # many:many mappings
                        if do_redundant:
                            key = "%s-%i-%i-%s-%i-%i" % (
                                new_map.mQueryToken, new_map.mQueryFrom, new_map.mQueryTo,
                                new_map.mSbjctToken, new_map.mSbjctFrom, new_map.mSbjctTo)

                            # hash key to save space
                            hkey = hashlib.md5(key).digest()
                            if hkey in printed:
                                continue
                            printed[hkey] = 1

                        options.stdout.write(
                            '\t'.join([str(new_map)] + data[9:]) + '\n')
                        noutput += 1
                        if new_map.mQueryToken == alignment.mQueryToken and \
                           new_map.mSbjctToken == alignment.mSbjctToken:
                            nsame += 1
                        else:
                            nmapped += 1

            else:
                for q in mq:
                    for s in ms:
                        new_map = alignment.GetClone()
                        E.debug(str(q))
                        E.debug(str(s))
                        is_ok = new_map.MapAlignment(q, s)

                        if not is_ok:
                            nfailed += 1
                        else:
                            options.stdout.write(
                                '\t'.join([str(new_map)] + data[9:]) + '\n')
                            noutput += 1

        # options.multiple is False
        else:
            if map_query is not None:
                if alignment.mQueryToken in map_query:
                    mq = map_query[alignment.mQueryToken]
                else:
                    mq = None
                    skip = True
            else:
                mq = None

            if map_sbjct is not None:
                if alignment.mSbjctToken in map_sbjct:
                    ms = map_sbjct[alignment.mSbjctToken]
                else:
                    ms = None
                    skip = True
            else:
                ms = None

            if skip and not options.keep_unmapped:
                nskipped += 1
                continue

            E.debug(str(mq))
            E.debug(str(ms))

            if mq or ms:
                is_ok = alignment.MapAlignment(mq, ms)
            else:
                is_ok = True

            if not is_ok:
                nfailed += 1
            else:
                options.stdout.write(
                    '\t'.join([str(alignment)] + data[9:]) + '\n')
                noutput += 1

    E.info('ninput=%i, noutput=%i, nskipped=%i, nfailed=%i, nsame=%i, nmapped=%i' %
           (ninput, noutput, nskipped, nfailed, nsame, nmapped))

    E.Stop()
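
# readIdentityMap is called above but is not defined in this excerpt. Below
# is a minimal sketch consistent with its use (values are iterated as lists
# when --multiple is set), assuming a tab-separated old/new identifier file;
# the exact file format and the sketch's name are assumptions:


def read_identity_map_sketch(infile):
    """Return a map of old identifier -> list of new identifiers (sketch)."""
    m = {}
    for line in infile:
        if line.startswith("#"):
            continue
        fields = line[:-1].split("\t")
        if len(fields) < 2:
            continue
        m.setdefault(fields[0], []).append(fields[1])
    return m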

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: blast2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="filename with sequences.")

    parser.add_option("-f", "--format", dest="format",
                      type="string",
                      help="output format.")

    parser.set_defaults(
        filename_sequences=None,
        format="fasta",
    )

    (options, args) = E.Start(parser)

    if not options.filename_sequences:
        raise ValueError("please supply filename with sequences.")

    sequences = Genomics.ReadPeptideSequences(
        open(options.filename_sequences, "r"))

    if options.loglevel >= 1:
        print "# read %i sequences" % len(sequences)

    for k in sequences.keys():
        sequences[k] = alignlib_lite.py_makeSequence(sequences[k])

    if options.loglevel >= 2:
        print "# converted %i sequences" % len(sequences)

    ninput, noutput, nskipped, nfailed = 0, 0, 0, 0

    link = BlastAlignments.Link()
    ali = alignlib_lite.py_makeAlignataVector()

    for line in sys.stdin:
        if line[0] == "#":
            continue

        link.Read(line)
        ninput += 1

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        ali.Clear()
        alignlib_lite.py_fillAlignataCompressed(
            ali,
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli)

        result = alignlib_lite.py_writePairAlignment(
            sequences[link.mQueryToken],
            sequences[link.mSbjctToken],
            ali).split("\n")

        # skip alignments that could not be formatted; without this guard
        # the indexing below would fail on short results
        if len(result) != 3:
            nfailed += 1
            continue

        if options.format == "fasta":
            print ">%s %i-%i\n%s\n>%s %i-%i\n%s\n" % \
                (link.mQueryToken, link.mQueryFrom, link.mQueryTo,
                 result[0].split("\t")[1],
                 link.mSbjctToken, link.mSbjctFrom, link.mSbjctTo,
                 result[1].split("\t")[1])
            noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nfailed=%i" %
           (ninput, noutput, nskipped, nfailed))

    E.Stop()