def pslSelectQuery(options):
    """Write a single alignment per query, picked by a match statistic.

    ``options.select`` is split on ``-`` into ``(value, field)``:

    * ``field`` selects the attribute used for sorting
      (``nmatches`` -> ``mNMatches``, ``nmismatches`` -> ``mNMisMatches``).
    * ``value`` selects which end of the sorted group is written
      (``most`` -> last entry, ``least`` -> first entry).

    Alignments are read from ``options.stdin`` and grouped with
    ``Blat.iterator_per_query``; the chosen entry of every group is written
    to ``options.stdout``.  Processing stops early once ``options.test``
    groups have been read.
    """
    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    value, field = options.select.split("-")

    # Sort key for the requested field.  No key is defined for any other
    # field name, which is the same as in the previous implementation.
    if field == "nmatches":
        def f(x):
            return x.mNMatches
    elif field == "nmismatches":
        def f(x):
            return x.mNMisMatches

    # index into the sorted group for the requested extreme
    pick = {"most": -1, "least": 0}.get(value)

    for data in Blat.iterator_per_query(Blat.iterator(options.stdin)):
        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        data.sort(key=f)
        if pick is not None:
            options.stdout.write("%s\n" % str(data[pick]))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def pslAddSequence(query_fasta, sbjct_fasta, options):
    """Convert psl entries into pslx entries by attaching sequences.

    Entries are read from ``sys.stdin``.  For each entry the aligned query
    and sbjct ranges are fetched (forward strand) from *query_fasta* and
    *sbjct_fasta* via ``getSequence`` and a ``Blat.MatchPSLX`` built from
    them is written to ``options.stdout``.

    Reading stops at the first false-y entry returned by the iterator or
    once ``options.test`` entries have been read.
    """
    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    iterator = Blat.BlatIterator(sys.stdin)

    while True:
        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        new = Blat.MatchPSLX()
        query_sequence = query_fasta.getSequence(
            match.mQueryId, "+", match.mQueryFrom, match.mQueryTo)
        sbjct_sequence = sbjct_fasta.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)
        new.fromPSL(match, query_sequence, sbjct_sequence)

        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))

    Interval indices are loaded with ``readIntervals`` from
    ``options.filename_filter_query`` / ``options.filename_filter_target``
    when these are set.  For a side without a filter file (or whose index
    evaluates as false) the corresponding tuple element is ``None``; a
    ``KeyError`` raised by the index lookup yields an empty list.

    Iteration stops at the first false-y entry returned by the psl iterator
    or once ``options.test`` entries have been read.
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        # use the next() builtin (as the other psl readers in this file
        # do) instead of the Python-2-only ``.next()`` method.
        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(intervals_query.get(
                    match.mQueryId, match.mQueryFrom, match.mQueryTo))
            except KeyError:
                # no intervals registered for this query
                qx = []

        if intervals_target:
            try:
                tx = list(intervals_target.get(
                    match.mSbjctId, match.mSbjctFrom, match.mSbjctTo))
            except KeyError:
                # no intervals registered for this target
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl entries from ``options.stdin`` and writes one ``chain``
    record per entry to ``options.stdout``: a header line followed by the
    block sizes, with the target/query gap sizes between consecutive
    blocks.  The running entry number is used as the chain id.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"]
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    for psl in Blat.iterator(options.stdin):
        ninput += 1

        # header coordinates of the query are flipped for "-" strand entries
        if psl.strand == "-":
            qstart, qend = (psl.mQueryLength - psl.mQueryTo,
                            psl.mQueryLength - psl.mQueryFrom)
        else:
            qstart, qend = psl.mQueryFrom, psl.mQueryTo

        options.stdout.write(
            "chain %i %s %i %s %i %i %s %i %s %i %i %i\n" % (
                psl.mNMatches,
                psl.mSbjctId,
                psl.mSbjctLength,
                "+",
                psl.mSbjctFrom,
                psl.mSbjctTo,
                psl.mQueryId,
                psl.mQueryLength,
                psl.strand,
                qstart,
                qend,
                ninput,
            )
        )

        # tend/qend hold the end of the previous block; None marks the
        # first block, which has no preceding gap.
        size, tend, qend = 0, None, None
        for qstart, tstart, size in psl.getBlocks():
            if tend is not None:
                options.stdout.write(
                    "\t%i\t%i\n" % (tstart - tend, qstart - qend))
            qend, tend = qstart + size, tstart + size
            options.stdout.write("%i" % (size,))
        # terminate the line holding the last block size
        options.stdout.write("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The query and target sequence identifiers are taken from
    ``--query``/``--target`` or, if either is missing, from exactly two
    positional arguments.

    Raises ValueError if the identifiers are not supplied.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.query is None or options.target is None:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    reader = maf.Reader(options.stdin)

    psl = Blat.Match()
    for cc in threaditer(reader, (options.query, options.target)):

        ninput += 1
        query, target = cc

        # treat identifiers like Hsap.GL000223.1: everything before the
        # first "." goes to qs/ts, the remainder is the contig name.
        # ``except ... as`` replaces the Python-2-only comma form, which
        # is a syntax error on Python 3.
        try:
            data = query.src.split(".")
            qs, qcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))

        try:
            data = target.src.split(".")
            ts, tcontig = data[0], ".".join(data[1:])
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))

        # NOTE(review): the visible body ends here; psl, qs/qcontig and
        # ts/tcontig are not used yet - confirm against the full script.
def pslComplement(query_fasta, target_fasta, options):
    """complement psl entries.

    Reads psl entries from ``sys.stdin``.  Entries with at most one block
    are counted as skipped.  For every block of the remaining entries,
    ``options.complement_border`` residues are trimmed from both ends; the
    query and target sequences of blocks that still span at least
    ``options.complement_min_length`` residues on both sides are fetched
    from *query_fasta* and *target_fasta*.

    NOTE(review): this function looks unfinished - the value written to
    ``options.stdout`` (``new``) is never assigned inside it, so the write
    raises NameError unless a module-level ``new`` exists.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        # next() builtin, consistent with the other psl readers here
        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # nothing to do for entries without several blocks
        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        for qstart, tstart, size in match.getBlocks():

            # trim the border on the query side
            qend = qstart + size - border
            qstart += border

            if qend - qstart < min_length:
                continue

            # trim the border on the target side
            tend = tstart + size - border
            tstart += border

            if tend - tstart < min_length:
                continue

            query_sequence = query_fasta.getSequence(
                match.mQueryId, match.strand, qstart, qend)
            # the parameter is called target_fasta; the previous name
            # ``sbjct_fasta`` was undefined in this function.
            sbjct_sequence = target_fasta.getSequence(
                match.mSbjctId, "+", tstart, tend)

        ndiscarded += 1

        # NOTE(review): ``new`` is not defined in this function - the
        # construction of the complemented entry appears to be missing.
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every psl entry read from ``options.stdin`` is written to
    ``options.stdout`` as a ``chain`` record: one header line, then the
    size of each block, with the target and query gaps to the following
    block appended to all but the last size.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: psl2chain.py 2901 2010-04-13 14:38:07Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    header_format = "chain %i %s %i %s %i %i %s %i %s %i %i %i\n"

    for psl in Blat.iterator(options.stdin):
        ninput += 1

        # query coordinates in the header are mirrored for "-" strand
        if psl.strand == "-":
            header_qstart = psl.mQueryLength - psl.mQueryTo
            header_qend = psl.mQueryLength - psl.mQueryFrom
        else:
            header_qstart = psl.mQueryFrom
            header_qend = psl.mQueryTo

        header_fields = (psl.mNMatches,
                         psl.mSbjctId,
                         psl.mSbjctLength,
                         "+",
                         psl.mSbjctFrom,
                         psl.mSbjctTo,
                         psl.mQueryId,
                         psl.mQueryLength,
                         psl.strand,
                         header_qstart,
                         header_qend,
                         ninput)
        options.stdout.write(header_format % header_fields)

        # end coordinates of the previous block (None before the first)
        previous_tend, previous_qend = None, None
        for block_qstart, block_tstart, block_size in psl.getBlocks():
            if previous_tend is not None:
                options.stdout.write(
                    "\t%i\t%i\n" % (block_tstart - previous_tend,
                                    block_qstart - previous_qend))
            previous_qend = block_qstart + block_size
            previous_tend = block_tstart + block_size
            options.stdout.write("%i" % (block_size,))
        options.stdout.write("\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def pslComplementQuery(options):
    """complement psl entries.

    Fill the regions from a second psl file.

    Reads psl entries from ``sys.stdin``.  Entries with at most one block
    are counted as skipped.  For the remaining entries every block is
    trimmed by ``options.complement_border`` residues at both ends and
    checked against ``options.complement_min_length``.

    NOTE(review): this function looks unfinished - the value written to
    ``options.stdout`` (``new``) is never assigned inside it, so the write
    raises NameError unless a module-level ``new`` exists.
    """

    # was bound to ``Iterator`` (capitalised) while the loop reads
    # ``iterator``, which raised NameError on the first iteration.
    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    border = options.complement_border
    min_length = options.complement_min_length

    while 1:

        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # nothing to do for entries without several blocks
        if match.mNBlocks <= 1:
            nskipped += 1
            continue

        for qstart, tstart, size in match.getBlocks():

            # trim the border on the query side
            qend = qstart + size - border
            qstart += border

            if qend - qstart < min_length:
                continue

            # trim the border on the target side
            tend = tstart + size - border
            tstart += border

            if tend - tstart < min_length:
                continue

        ndiscarded += 1

        # NOTE(review): ``new`` is not defined in this function - the
        # construction of the complemented entry appears to be missing.
        options.stdout.write(str(new) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
def iterator_filter_overlapping_target(psls, options):
    """Yield only alignments that overlap no other alignment on the target.

    *psls* is grouped with ``Blat.iterator_target_overlap`` using
    ``options.threshold_merge_distance``.  Groups holding more than one
    entry are dropped completely; the single entry of every other group
    is yielded.  Counts are reported through ``E.info`` once the input is
    exhausted.
    """
    ninput = noutput = ndiscarded = 0

    groups = Blat.iterator_target_overlap(
        psls, options.threshold_merge_distance)

    for group in groups:
        group_size = len(group)
        ninput += group_size

        if group_size <= 1:
            yield group[0]
            noutput += 1
        else:
            # overlapping entries: discard the whole group
            ndiscarded += group_size

    E.info("iterator_filter_overlapping_target: ninput=%i, noutput=%i, "
           "ndiscarded=%i" % (ninput, noutput, ndiscarded))
def iterator_filter_overlapping_target(psls, options):
    """Remove alignments that overlap on the target.

    Blocks of overlapping entries are obtained from
    ``Blat.iterator_target_overlap(psls, options.threshold_merge_distance)``.
    A block with a single entry is passed through; a block with several
    entries is discarded as a whole.  A summary is logged with ``E.info``
    after the last block.
    """
    counts = {"ninput": 0, "noutput": 0, "ndiscarded": 0}

    for block in Blat.iterator_target_overlap(
            psls, options.threshold_merge_distance):
        nentries = len(block)
        counts["ninput"] += nentries

        if nentries > 1:
            counts["ndiscarded"] += nentries
            continue

        yield block[0]
        counts["noutput"] += 1

    E.info("iterator_filter_overlapping_target: ninput=%i, noutput=%i, "
           "ndiscarded=%i" % (counts["ninput"],
                              counts["noutput"],
                              counts["ndiscarded"]))
def chunk_iterator_psl_overlap(infile, args, prefix, use_header=False):
    """iterate over overlapping entries in a psl file.

    Entries are read from ``sys.stdin`` and must be sorted by
    (target contig, target start).  A new chunk starts whenever the target
    contig changes or an entry starts at least ``args[0]`` (the merge
    distance) residues after the end of the current chunk.  ``filename``
    is yielded each time a chunk is finished.

    Raises ValueError if a contig re-appears after a different contig or
    if an entry starts before the start of the current chunk.

    NOTE(review): ``infile``, ``prefix`` and ``use_header`` are not used in
    this body, and ``outfile``/``filename`` are never assigned anything
    other than None, so ``outfile.write`` fails on the first entry.  The
    code opening a per-chunk output file appears to be missing - confirm
    against the other chunk iterators.
    """
    # NOTE(review): reads sys.stdin, not the ``infile`` argument
    iterator = Blat.BlatIterator(sys.stdin)

    # contigs seen so far, used to detect unsorted input
    processed_contigs = set()

    merge_distance = args[0]
    last_sbjct_id = None
    sbjct_end = 0
    outfile = None
    filename = None
    while 1:

        match = next(iterator)

        if match is None:
            break

        # start a new chunk on a contig change or a large enough gap
        if match.mSbjctId != last_sbjct_id or \
                match.mSbjctFrom >= (sbjct_end + merge_distance):
            if last_sbjct_id:
                # finish the previous chunk
                outfile.close()
                yield filename

            if last_sbjct_id != match.mSbjctId and \
                    match.mSbjctId in processed_contigs:
                raise ValueError("input not sorted correctly (contig,start): "
                                 "already encountered %s\n%s" %
                                 (match.mSbjctId, str(match)))

            last_sbjct_id = match.mSbjctId
            processed_contigs.add(last_sbjct_id)

            sbjct_start = match.mSbjctFrom
            sbjct_end = match.mSbjctTo

        if match.mSbjctFrom < sbjct_start:
            raise ValueError("input not sorted correctly (contig,start): "
                             "%i < %i\n%s" %
                             (match.mSbjctFrom, sbjct_start, str(match)))

        # extend the current chunk and append the entry to it
        sbjct_end = max(match.mSbjctTo, sbjct_end)
        outfile.write(str(match) + "\n")

    # finish the last chunk
    if outfile:
        outfile.close()
        yield filename
def iterator_filter_overlapping_query(psls, options):
    '''remove alignments that overlap on query.

    Entries are grouped with ``Blat.iterator_query_overlap`` using
    ``options.threshold_merge_distance``.  The single entry of a group
    without overlap is yielded; groups of several overlapping entries are
    discarded as a whole.

    Only the full alignment ranges are compared, overlap of individual
    blocks is not checked.  A slower base-level variant (connected
    components via ``Blat.getComponents``, keeping the entry with most
    matches per component) is disabled.
    '''
    ninput, noutput, ndiscarded = 0, 0, 0

    last_contig = None

    overlapping = Blat.iterator_query_overlap(
        psls, options.threshold_merge_distance)

    for group in overlapping:
        group_size = len(group)
        ninput += group_size

        if group_size <= 1:
            yield group[0]
            noutput += 1
        else:
            # several entries overlap on the query: drop all of them
            ndiscarded += group_size

    E.info("iterator_filter_overlapping_query: ninput=%i, "
           "noutput=%i, ndiscarded=%i" % (ninput, noutput, ndiscarded))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl entries from ``options.stdin``, marks the aligned ranges of
    every query and target contig in a bitset and writes two coverage
    tables (``# query`` and ``# target``) to ``options.stdout``.  Each
    table lists the covered residues, the contig size and the percentage
    covered per contig, followed by a ``total`` row.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # forward argv so that an explicitly supplied argument list is
    # honoured, as the docstring promises
    (options, args) = E.Start(parser, argv=argv)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitset, id, size, iterator):
        """set all (start, end) ranges of *iterator* in the bitset of *id*."""
        if id not in bitset:
            bitset[id] = bx.bitset.BinnedBitSet(size)
        b = bitset[id]

        for start, end in iterator:
            b.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):
        addRange(query_bitsets, psl.mQueryId, psl.mQueryLength,
                 psl.iterator_query_exons())
        addRange(target_bitsets, psl.mSbjctId, psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """write the per-contig coverage table for *bitsets*."""
        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):

            l = bitsets[chrom].size
            s = bitsets[chrom].count_range(0, l)
            # contigs of size 0 get no row (avoids a division by zero)
            if l > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, s, l, 100.0 * s / l))
            total += s
            total_len += l

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads psl (or pslx, if any sequence-based method is selected) entries
    from ``options.stdin`` and writes one tab-separated row per entry to
    ``options.stdout``: the entry itself (or only the query id with
    ``--without-match``) followed by the columns of every selected
    counter.  Entries whose query and sbjct sequences differ in length
    are skipped and reported on ``options.stdlog``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--mask-lowercase", dest="mask_lowercase", action="store_true",
        help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match", action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option(
        "--without-match", dest="with_match", action="store_false",
        help="do not echo the match in output [default=%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=("counts", "baseml", "match",
                 "query-counts", "sbjct-counts"),
        help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # counters_plain work on the match object, counters on the pair of
    # aligned sequences
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence-based counters need the pslx format
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write(
        "\t".join([header, ] +
                  ["\t".join(x.getHeaders()) for x in counters] +
                  ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    # compiled once instead of once per sequence block
    rx_non_letter = re.compile("[^a-zA-Z]")
    rx_lowercase = re.compile("[a-z]")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # The row is collected first and written in one piece, so that a
        # skipped entry does not leave a partial line without newline in
        # the output.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [rx_non_letter.sub("N", x) for x in qseq]
            sseq = [rx_non_letter.sub("N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [rx_lowercase.sub("N", x) for x in qseq]
                sseq = [rx_lowercase.sub("N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.extend([str(counter) for counter in counters])

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.extend([str(counter) for counter in counters_plain])

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene models (gtf) from ``options.stdin`` and, per gene, collects
    the variants in the gene region extended by ``options.border``, builds
    the alleles of every transcript with ``buildAlleles`` and writes the
    requested output sections (``cds``, ``peptide``, ``map``, ``table``,
    ``gtf``; all of them if none or ``all`` is selected).

    Raises ValueError if a table name is given without a database or if no
    source of variants is specified.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default].")
    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference", type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")
    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf", "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts", action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    # ``filename_reference`` was misspelt as ``filename_referenec`` and
    # ``filename_pileup`` had no explicit default.
    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        filename_pileup=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    # NOTE(review): fasta may be None here, but fasta.getLength() is called
    # unconditionally for every gene below - a genome file looks mandatory.
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        # close the file once the ids have been read
        with open(options.filename_seleno, "r") as inf:
            seleno = set(IOTools.readList(inf))
    else:
        # empty set (was an empty dict), only used for membership tests
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    # no section selected means: output everything
    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id",
             "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype",
             ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        # genomic extent of the gene over all of its transcripts
        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        # variants are collected in the gene region plus a border,
        # clipped to the contig
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start, extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants, all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(
                indexed_variants, all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    # only the first and last 30 residues of introns
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])

        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id
            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        # an exon is written once the start of the next
                        # one is known; its length is the difference of
                        # consecutive cds starts
                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        # last exon: extends to the end of the cds
                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand coordinates
                if allele.reference_first_stop_start >= 0 and \
                        not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Applies the methods given with ``--method`` in order.  Stream methods
    (``test``, ``rename-query``, ``sanitize``, ``filter-fasta``,
    ``remove-overlapping-query``, ``remove-overlapping-target``) wrap the
    psl iterator; all other methods process the input themselves and stop
    the method chain.  Whatever is left in the iterator is written to
    ``options.stdout`` at the end.

    Raises ValueError if ``add-sequence`` is requested without both
    indexed sequence files.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length", dest="complement_min_length",
                      type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
            (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # A header is written only for "table" and "full"; None and "none"
    # write nothing.  (The former outer guard ``is not None or != "none"``
    # was always true and has been dropped.)
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # methods that consume the input themselves end the chain
        if "map" == method:
            pslMap(options)
            break
        elif "filter-keep" == method:
            pslFilter(options, keep=True)
            break
        elif "filter-remove" == method:
            pslFilter(options, keep=False)
            break
        elif "merge" == method:
            pslMerge(options)
            break
        elif "add-sequence" == method:
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif "complement" == method:
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif "select-query" == method:
            pslSelectQuery(options)
            break
        # stream methods wrap the iterator and can be combined
        elif "test" == method:
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif "rename-query" == method:
            iterator = iterator_rename_query(iterator, options)
        elif "sanitize" == method:
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif "filter-fasta" == method:
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif "remove-overlapping-query" == method:
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif "remove-overlapping-target" == method:
            iterator = iterator_filter_overlapping_target(iterator, options)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL matches from ``options.stdin``, records every aligned block
    in one bitset per query contig and one per target contig, and writes
    two tab-separated coverage tables (``# query`` and ``# target``) to
    ``options.stdout``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    query_bitsets, target_bitsets = {}, {}

    def addRange(bitsets, contig, size, intervals):
        """mark all (start, end) pairs in *intervals* on the bitset of *contig*.

        The bitset is created with *size* on first use of *contig*.
        """
        if contig not in bitsets:
            bitsets[contig] = bx.bitset.BinnedBitSet(size)
        covered = bitsets[contig]
        for start, end in intervals:
            # set_range takes (start, length), not (start, end)
            covered.set_range(start, end - start)

    for psl in Blat.iterator(options.stdin):
        addRange(query_bitsets,
                 psl.mQueryId,
                 psl.mQueryLength,
                 psl.iterator_query_exons())
        addRange(target_bitsets,
                 psl.mSbjctId,
                 psl.mSbjctLength,
                 psl.iterator_sbjct_exons())

    def printBitset(outfile, bitsets):
        """write one coverage row per contig plus a ``total`` row to *outfile*."""
        outfile.write("contig\tcovered\tsize\tpcovered\n")
        total, total_len = 0, 0
        for chrom in sorted(bitsets):
            size = bitsets[chrom].size
            covered = bitsets[chrom].count_range(0, size)
            # zero-sized contigs are skipped to avoid a division by zero
            if size > 0:
                outfile.write("%s\t%i\t%i\t%6.4f\n" %
                              (chrom, covered, size, 100.0 * covered / size))
            total += covered
            total_len += size

        if total_len > 0:
            outfile.write("total\t%i\t%i\t%6.4f\n" %
                          (total, total_len, 100.0 * total / total_len))

    options.stdout.write("# query\n")
    printBitset(options.stdout, query_bitsets)
    options.stdout.write("# target\n")
    printBitset(options.stdout, target_bitsets)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL matches from ``options.stdin`` and writes, for every match,
    the aligned query and target sequences as two fasta records to
    ``options.stdout``.

    Raises:
        ValueError: if the query or the target sequence file is missing.
        NotImplementedError: if a method other than ``full`` is requested.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target",
                      type="string",
                      help="fasta filename with target.")

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("full", "pileup-query", "pileup-target", "gapless"),
        help="method to use for constructing the alignment [%default].")

    parser.add_option(
        "--forward-query", dest="forward_query", action="store_true",
        help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix",
                      type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix",
                      type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # Both sequence sources are needed for every match; fail with a clear
    # message instead of a NameError on the first match.
    if not options.filename_query or not options.filename_target:
        raise ValueError(
            "please supply both --query-psl-file and --target-psl-file.")

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)

    # only the "full" method has an implementation wired up here
    if options.method == "full":
        getAlignment = getAlignmentFull
    else:
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    # running number used to build the record identifiers
    nrecord = 0
    for match in Blat.iterator(options.stdin):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        # shift the alignment so that both coordinate systems start at 0
        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))

        q = query.getSequence(match.mQueryId, match.strand,
                              match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(match.mSbjctId, "+",
                               match.mSbjctFrom, match.mSbjctTo)

        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        options.stdout.write(
            ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" %
            (options.query_prefix,
             options.output_format_id % nrecord,
             match.mQueryId,
             match.mQueryFrom,
             match.mQueryTo,
             query_ali,
             options.target_prefix,
             options.output_format_id % nrecord,
             match.mSbjctId,
             match.strand,
             match.mSbjctFrom,
             match.mSbjctTo,
             sbjct_ali))
        nrecord += 1

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every PSL match read from ``options.stdin`` the query and target
    segments are fetched from the indexed fasta files, aligned with the
    selected method and written as a pair of fasta records to
    ``options.stdout``.

    Raises:
        ValueError: if the query or the target sequence file is missing.
        NotImplementedError: if a method other than ``full`` is requested.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--query-psl-file", dest="filename_query", type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_target", type="string",
                      help="fasta filename with target.")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "full", "pileup-query", "pileup-target", "gapless"),
                      help="method to use for constructing the alignment [%default].")

    parser.add_option("--forward-query", dest="forward_query", action="store_true",
                      help="reverse-complement sequences such that query is always on forward strand [%default]")

    parser.add_option("--target-prefix", dest="target_prefix", type="string",
                      help="prefix to use for target [%default].")

    parser.add_option("--query-prefix", dest="query_prefix", type="string",
                      help="prefix to use for query [%default].")

    parser.add_option("--id", dest="id", type="choice",
                      choices=("numeric", "query"),
                      help="choose type of identifier to use [%default]")

    parser.set_defaults(
        filename_query=None,
        filename_target=None,
        method="full",
        output_format_id="%06i",
        target_prefix="",
        query_prefix="",
        forward_query=False,
    )

    # pass argv on; otherwise a caller-supplied argument list is ignored
    (options, args) = E.Start(parser, argv=argv)

    # Validate the inputs up front: without this, ``query``, ``target`` or
    # ``getAlignment`` stay unbound and the loop dies with a NameError.
    if options.filename_query is None or options.filename_target is None:
        raise ValueError(
            "please supply both --query-psl-file and --target-psl-file.")
    if options.method != "full":
        raise NotImplementedError(
            "method '%s' is not implemented" % options.method)

    query = IndexedFasta.IndexedFasta(options.filename_query)
    target = IndexedFasta.IndexedFasta(options.filename_target)
    getAlignment = getAlignmentFull

    record_format = ">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n"

    # enumerate replaces the hand-maintained counter that shadowed ``id``
    for number, match in enumerate(Blat.iterator(options.stdin)):
        if options.loglevel >= 2:
            options.stdout.write("# %s\n" % str(match))

        # move the alignment so that it starts at 0 in both sequences
        m = match.getMapQuery2Target()
        m.moveAlignment(-min(match.mQueryBlockStarts),
                        -min(match.mSbjctBlockStarts))

        q = query.getSequence(
            match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo)
        t = target.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        query_ali, sbjct_ali = getAlignment(m, q, t, options)

        if match.strand == "-" and options.forward_query:
            query_ali = Genomics.complement(query_ali)
            sbjct_ali = Genomics.complement(sbjct_ali)

        label = options.output_format_id % number
        options.stdout.write(record_format %
                             (options.query_prefix,
                              label,
                              match.mQueryId,
                              match.mQueryFrom,
                              match.mQueryTo,
                              query_ali,
                              options.target_prefix,
                              label,
                              match.mSbjctId,
                              match.strand,
                              match.mSbjctFrom,
                              match.mSbjctTo,
                              sbjct_ali))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts pairwise blocks of a maf stream read from ``options.stdin``
    into PSL records on ``options.stdout``. Query and target are selected
    by ``--query``/``--target`` or, if either is missing, by exactly two
    positional arguments.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: maf2psl.py 2879 2010-04-06 14:44:34Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.set_defaults(
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    identifiers_missing = options.query is None or options.target is None
    if identifiers_missing:
        if len(args) != 2:
            raise ValueError(
                "please supply two sequence identifiers for query and target")
        options.query, options.target = args

    # do sth
    ninput = 0
    nskipped = 0
    noutput = 0

    reader = maf.Reader(options.stdin)

    # a single record object is re-used for every block
    psl = Blat.Match()
    wanted = (options.query, options.target)
    debug_format = "%s\t%s\t%i\t%i\t%s\t%s"

    for pair in threaditer(reader, wanted):
        ninput += 1
        query, target = pair

        # treat identfiers like Hsap.GL000223.1: everything up to the
        # first dot is the species, the remainder is the contig
        try:
            qs, _, qcontig = query.src.partition(".")
        except ValueError as msg:
            raise ValueError(
                "error: could not parse query %s: msg=%s" % (query.src, msg))

        try:
            ts, _, tcontig = target.src.partition(".")
        except ValueError as msg:
            raise ValueError(
                "error: could not parse target %s: msg=%s" % (target.src, msg))

        assert qs == options.query
        assert ts == options.target

        psl.mQueryId = qcontig
        psl.mSbjctId = tcontig

        psl.fromPair(query.start, query.src_size, query.strand,
                     query.text.upper(),
                     target.start, target.src_size, target.strand,
                     target.text.upper())

        E.debug(debug_format %
                (qs, qcontig, query.start, query.src_size,
                 query.strand, query.text))
        E.debug(debug_format %
                (ts, tcontig, target.start, target.src_size,
                 target.strand, target.text))

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL matches (from the single positional file argument, or from
    stdin), groups consecutive matches of the same query, filters each
    group by the configured thresholds and forbidden regions, selects the
    matches to report via ``selectMatches`` and writes them in ``map`` or
    ``psl`` format to ``options.stdout``. Summary counts are written to
    ``options.stdlog``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2map.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries - required for polyA analysis [%default].")

    parser.add_option("--polyA", dest="polyA", action="store_true",
                      help="detect polyA tails [%default].")

    parser.add_option("-p", "--output-filename-pattern", dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on aggregate coverages [%default].")

    parser.add_option("--output-filename-empty", dest="output_filename_empty", type="string",
                      help="OUTPUT filename with queries for which all matches have been discarded [%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=("map", "psl"),
                      help="output format to choose [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--threshold-min-pid", dest="threshold_min_pid", type="float",
                      help="minimum thresholds for pid [%default].")

    parser.add_option("--threshold-min-matches", dest="threshold_min_matches", type="int",
                      help="minimum threshold for number of matching residues [%default].")

    parser.add_option("--threshold-max-error-rate", dest="threshold_max_error_rate", type="float",
                      help="maximum threshold for error of aligned part [%default].")

    parser.add_option("--threshold-good-query-coverage", dest="threshold_good_query_coverage", type="float",
                      help="minimum query coverage for segments to be counted as good [%default].")

    parser.add_option("--threshold-min-query-coverage", dest="threshold_min_query_coverage", type="float",
                      help="minimum query coverage for segments to be accepted [%default].")

    parser.add_option("--threshold-max-query-gapchars", dest="threshold_max_query_gapchars", type="int",
                      help="maximum number of gap characters in query[%default].")

    parser.add_option("--threshold-max-query-gaps", dest="threshold_max_query_gaps", type="int",
                      help="maximum number of gaps in query[%default].")

    parser.add_option("--threshold-max-sbjct-gapchars", dest="threshold_max_sbjct_gapchars", type="int",
                      help="maximum number of gap characters in sbjct[%default].")

    parser.add_option("--keep-unique-matches", dest="keep_unique_matches", action="store_true",
                      help="ignore filters for unique matches [%default].")

    parser.add_option("--keep-all-best", dest="keep_all_best", action="store_true",
                      help="when sorting matches, keep all matches within the collection threshold [%default].")

    parser.add_option("--output-best-per-subject", dest="best_per_sbjct", action="store_true",
                      help="keep only the best entry per sbjct (for transcript mapping) [%default].")

    parser.add_option("--threshold-max-sbjct-gaps", dest="threshold_max_sbjct_gaps", type="int",
                      help="maximum number of gaps in sbjct[%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing[%default].")

    parser.add_option("-m", "--matching-mode", dest="matching_mode", type="choice",
                      choices=("best-coverage", "best-query-coverage", "best-sbjct-coverage",
                               "best-pid", "best-covpid", "best-query-covpid", "best-sbjct-covpid",
                               "best-min-covpid", "best-query-min-covpid", "best-sbjct-min-covpid",
                               "unique", "all"),
                      help="determines how to selecte the best match [%default].")

    parser.add_option("--subjctfilter-tsv-file", dest="filename_filter_sbjct", type="string",
                      help="gff file for filtering sbjct matches. Matches overlapping these regions are discarded, but see --keep-forbidden [%default].")

    parser.add_option("--keep-forbidden", dest="keep_forbidden", action="store_true",
                      help="if set, keep only matches that overlap the regions supplied with --subjctfilter-tsv-file [%default].")

    parser.add_option("--query-forward-coordinates", dest="query_forward_coordinates", action="store_true",
                      help="use forward coordinates for query, strand will refer to sbjct [%default].")

    parser.add_option("--ignore-all-random", dest="ignore_all_random", action="store_true",
                      help="if there are multiple best matches, ignore all those to chrUn and _random [%default].")

    parser.add_option("--collection-threshold", dest="collection_threshold", type="float",
                      help="threshold for collecting matches, percent of best score [%default].")

    parser.add_option("--collection-distance", dest="collection_distance", type="float",
                      help="threshold for collecting matches, difference to best score [%default].")

    parser.set_defaults(input_filename_domains=None,
                        input_filename_queries=None,
                        threshold_good_query_coverage=90.0,
                        threshold_min_pid=30.0,
                        threshold_min_matches=0,
                        threshold_max_error_rate=None,
                        output_filename_pattern="%s",
                        keep_unique_matches=False,
                        output_format="map",
                        print_matched=["full", "partial", "good"],
                        from_zipped=False,
                        combine_overlaps=True,
                        min_length_domain=30,
                        threshold_min_query_coverage=50,
                        min_length_singletons=30,
                        new_family_id=10000000,
                        add_singletons=False,
                        matching_mode="best-coverage",
                        best_per_sbjct=False,
                        threshold_max_query_gapchars=None,
                        threshold_max_query_gaps=None,
                        threshold_max_sbjct_gapchars=None,
                        threshold_max_sbjct_gaps=None,
                        filename_filter_sbjct=None,
                        keep_forbidden=False,
                        keep_all_best=False,
                        test=None,
                        query_forward_coordinates=False,
                        output_filename_empty=None,
                        collection_threshold=1.0,
                        collection_distance=0,
                        polyA=False,
                        # max residues missing from non polyA end
                        polyA_max_unaligned=3,
                        # min residues in tail
                        polyA_min_unaligned=10,
                        # min percent residues that are A/T in tail
                        polyA_min_percent=70.0,
                        # ignore duplicate matches if they are on Un or
                        # _random
                        ignore_all_random=False,
                        )

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, add_pipe_options=True, argv=argv)

    if len(args) == 1:
        if options.from_zipped or args[0][-3:] == ".gz":
            import gzip
            infile = gzip.open(args[0], "r")
        else:
            infile = IOTools.openFile(args[0], "r")
    else:
        infile = sys.stdin

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    if options.filename_filter_sbjct:

        try:
            import bx.intervals.intersection
        except ImportError:
            raise ValueError("filtering for intervals requires the bx tools")

        intervals = GTF.readGFFFromFileAsIntervals(
            IOTools.openFile(options.filename_filter_sbjct, "r"))

        # one intersecter per contig of the filter file
        intersectors = {}

        for contig, values in list(intervals.items()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in values:
                intersector.add_interval(bx.intervals.Interval(start, end))
            intersectors[contig] = intersector

        if options.loglevel >= 1:
            options.stdlog.write("# read %i intervals for %i contigs.\n" %
                                 (sum([len(x) for x in list(intervals.values())]),
                                  len(intersectors)))
    else:
        intersectors = None

    ################################################
    ################################################
    ################################################
    # processing of a chunk (matches of same query)
    ################################################
    # All counters live in one dict so that processChunk can update them.
    # The previous ``global`` declarations referred to module-level names,
    # not to the locals of main() that the final report reads.
    counts = dict.fromkeys((
        # queries read / written / dropped by thresholds
        "ninput", "noutput", "nskipped",
        # number of sequences with full/partial/good matches
        "nfull_matches", "npartial_matches", "ngood_matches",
        # matches removed per filter, queries without selected match
        "nremoved_pid", "nremoved_query_coverage", "nempty",
        "nremoved_gaps", "nremoved_nmatches",
        # matches in forbidden regions and queries dropped because of them
        "nremoved_regions", "nqueries_removed_region"), 0)

    aggregate_coverages = []
    mapped_coverages = []
    fully_matched = []
    well_matched = []
    partially_matched = []

    if options.output_filename_empty:
        outfile_empty = IOTools.openFile(options.output_filename_empty, "w")
        outfile_empty.write("read_id\tcomment\n")
    else:
        outfile_empty = None

    if options.polyA:
        options.outfile_polyA = IOTools.openFile(
            options.output_filename_pattern % "polyA", "w")
        options.outfile_polyA.write("query_id\tstart\tend\tpA+N\tpT+N\ttail\n")

    def processChunk(query_id, matches):
        """process a set of matches from query_id

        Applies the threshold and region filters, records coverage
        statistics and writes the selected matches to options.stdout.
        """
        counts["ninput"] += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        # per-query removal counts, reported in the "empty" file
        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        def reportAllRemoved():
            """note that no match of this query survived the thresholds."""
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            counts["nskipped"] += 1

        new_matches = []

        # absolute filters applicable to non-fragmentory matches
        for match in matches:

            if match.mPid < options.threshold_min_pid:
                counts["nremoved_pid"] += 1
                x_nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                counts["nremoved_nmatches"] += 1
                x_nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                # math.pow: the math module has no function called "power"
                r = 100.0 * \
                    math.pow(
                        options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    counts["nremoved_pid"] += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            reportAllRemoved()
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    counts["nremoved_query_coverage"] += 1
                    x_nquery_coverage += 1
                    continue

                # The gap thresholds are maxima: drop a match only when
                # it exceeds the threshold. A threshold of None or 0
                # leaves the filter switched off, as before.
                if (options.threshold_max_query_gaps and
                        match.mQueryNGapsCounts > options.threshold_max_query_gaps) or \
                   (options.threshold_max_query_gapchars and
                        match.mQueryNGapsBases > options.threshold_max_query_gapchars) or \
                   (options.threshold_max_sbjct_gaps and
                        match.mSbjctNGapsCounts > options.threshold_max_sbjct_gaps) or \
                   (options.threshold_max_sbjct_gapchars and
                        match.mSbjctNGapsBases > options.threshold_max_sbjct_gapchars):
                    counts["nremoved_gaps"] += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            reportAllRemoved()
            return

        # Remove queries matching to a forbidden region. This section
        # will remove the full query if any of its matches matches in a
        # forbidden region.
        # NOTE(review): the original condition repeated the same clause
        # twice; the "keep only overlapping matches" half described in
        # the --keep-forbidden help text is not implemented here and is
        # left untouched - confirm the intended behaviour.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if found and not options.keep_forbidden:
                    counts["nremoved_regions"] += 1
                    keep = False

        if not keep:
            counts["nqueries_removed_region"] += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            counts["nfull_matches"] += 1
        elif good_matches:
            counts["ngood_matches"] += 1
        elif partial_matches:
            counts["npartial_matches"] += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        # ``match`` is the last match of the loop above
        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" %
                                         "\t".join(map(str, (
                                             match.mQueryId, match.mSbjctId,
                                             match.strand,
                                             "%5.2f" % match.mQueryCoverage,
                                             "%5.2f" % match.mSbjctCoverage,
                                             "%5.2f" % match.mPid,
                                             match.mQueryLength,
                                             match.mSbjctLength,
                                             match.mQueryFrom, match.mQueryTo,
                                             match.mSbjctFrom, match.mSbjctTo,
                                             ",".join(
                                                 map(str, match.mBlockSizes)),
                                             ",".join(
                                                 map(str, match.mQueryBlockStarts)),
                                             ",".join(
                                                 map(str, match.mSbjctBlockStarts)),
                                         ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            counts["noutput"] += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            counts["nempty"] += 1

    if options.output_format == "map":
        options.stdout.write("\t".join(("query_id", "sbjct_id", "sstrand", "qcoverage", "scoverage",
                                        "pid", "qlen", "slen", "qfrom", "qto", "sfrom", "sto", "blocks", "qstarts", "sstarts")) + "\n")
    elif options.output_format == "psl":
        options.stdout.write(Blat.Match().getHeader() + "\n")

    ################################################
    ################################################
    ################################################
    # main loop
    ################################################
    matches = []
    last_query_id = None
    is_complete = True
    ninput_lines = 0

    iterator = Blat.BlatIterator(infile)

    while 1:

        try:
            # the default also ends the loop if the iterator signals its
            # end with StopIteration instead of returning None
            match = next(iterator, None)
        except Blat.ParsingError:
            # restart the parser on the same stream after a bad record
            iterator = Blat.BlatIterator(infile)
            continue

        if match is None:
            break

        ninput_lines += 1

        if options.test and ninput_lines > options.test:
            break

        if match.mQueryId != last_query_id:
            if last_query_id:
                processChunk(last_query_id, matches)
            matches = []
            last_query_id = match.mQueryId

        matches.append(match)

    # flush the last group; nothing to do if no match was read at all
    if matches:
        processChunk(last_query_id, matches)

    printHistogram(aggregate_coverages, "aggregate", options)

    printHistogram(mapped_coverages, "mapped", options)

    if "full" in options.print_matched:
        printMatched(fully_matched, "full", options)

    if "good" in options.print_matched:
        printMatched(well_matched, "good", options)

    if "partial" in options.print_matched:
        printMatched(partially_matched, "partial", options)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# alignments: ninput=%i, is_complete=%s\n" % (ninput_lines, str(is_complete)))
        options.stdlog.write(
            "# queries: ninput=%i, noutput=%i\n" % (counts["ninput"], counts["noutput"]))
        options.stdlog.write("# individual coverage: full=%i, good=%i, partial=%i\n" % (
            counts["nfull_matches"], counts["ngood_matches"], counts["npartial_matches"]))
        options.stdlog.write("# aggregate  coverage: full=%i, good=%i, partial=%i\n" % (
            len(fully_matched), len(well_matched), len(partially_matched)))
        options.stdlog.write("# omitted queries: total=%i, thresholds=%i, regions=%i, selection=%i\n" %
                             (counts["nskipped"] + counts["nqueries_removed_region"] + counts["nempty"],
                              counts["nskipped"], counts["nqueries_removed_region"], counts["nempty"]))
        options.stdlog.write("# omitted matches: pid=%i, query_coverage=%i, gaps=%i, regions=%i, nmatches=%i\n" % (
            counts["nremoved_pid"], counts["nremoved_query_coverage"], counts["nremoved_gaps"],
            counts["nremoved_regions"], counts["nremoved_nmatches"]))

    # release the auxiliary output files opened above
    if outfile_empty:
        outfile_empty.close()
    if options.polyA:
        options.outfile_polyA.close()

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads PSL (or PSLX, if any sequence-based method is selected) matches
    from ``options.stdin`` and writes one tab-separated row per match to
    ``options.stdout``: the match itself (or only its query id with
    ``--without-match``) followed by the columns of every selected counter.
    Matches whose query and target sequences differ in length are skipped
    and counted in ``nskipped``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: psl2table.py 2891 2010-04-07 08:59:18Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("--mask-lowercase", dest="mask_lowercase", action="store_true",
                      help="mask lowercase characters before computing properties [default=%default]")

    parser.add_option("--with-match", dest="with_match", action="store_true",
                      help="echo the match in output [default=%default]")

    parser.add_option("--without-match", dest="with_match", action="store_false",
                      help="do not echo the match in output [default=%default]")

    parser.add_option("-m", "--method", dest="methods", type="choice", action="append",
                      choices=(
                          "counts", "baseml", "match", "query-counts", "sbjct-counts"),
                      help="methods to compute properties between sequence pairs.")

    WrapperCodeML.BaseML().AddOptions(parser)

    parser.set_defaults(
        methods=[],
        mask_lowercase=False,
        is_pslx=True,
        with_match=True,
    )

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    # counters_plain are called with the match, counters with the two
    # aligned sequences
    counters_plain = []
    counters = []

    for method in options.methods:
        if method == "counts":
            counters.append(
                SequencePairProperties.SequencePairPropertiesCountsNa())
        elif method == "query-counts":
            counters.append(QueriesCounter())
        elif method == "sbjct-counts":
            counters.append(SbjctsCounter())
        elif method == "baseml":
            counters.append(
                SequencePairProperties.SequencePairPropertiesBaseML(options))
        elif method == "match":
            counters_plain.append(CounterMatch(options))

    # sequence-based counters need the sequences, i.e. pslx input
    if counters:
        iterator = Blat.iterator_pslx(options.stdin)
        header = "\t".join(Blat.MatchPSLX().getHeaders())
    else:
        iterator = Blat.iterator(options.stdin)
        header = "\t".join(Blat.Match().getHeaders())

    if not options.with_match:
        header = "qName"

    options.stdout.write("\t".join(
        [header, ] +
        ["\t".join(x.getHeaders()) for x in counters] +
        ["\t".join(x.getHeaders()) for x in counters_plain]) + "\n")

    ninput, noutput, nskipped = 0, 0, 0

    for match in iterator:
        ninput += 1

        # Collect the row first and write it only when it is complete.
        # Writing the prefix immediately left a dangling partial line on
        # stdout whenever the match was skipped further down.
        if options.with_match:
            fields = [str(match)]
        else:
            fields = [match.mQueryId]

        if counters:

            qseq = match.mQuerySequence
            sseq = match.mSbjctSequence

            # mask non printable characters - sometimes
            # appear after using pslToPslX
            qseq = [re.sub("[^a-zA-Z]", "N", x) for x in qseq]
            sseq = [re.sub("[^a-zA-Z]", "N", x) for x in sseq]

            if options.mask_lowercase:
                qseq = [re.sub("[a-z]", "N", x) for x in qseq]
                sseq = [re.sub("[a-z]", "N", x) for x in sseq]

            match.mQuerySequence = qseq
            match.mSbjctSequence = sseq

            qseq = "".join(match.mQuerySequence).upper()
            sseq = "".join(match.mSbjctSequence).upper()

            if len(qseq) != len(sseq):
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# WARNING: two sequences of unequal length in match\n# %s\n" % str(match))
                nskipped += 1
                continue

            for counter in counters:
                counter(qseq, sseq)

            fields.append("\t".join([str(counter) for counter in counters]))

        if counters_plain:

            for counter in counters_plain:
                counter(match)

            fields.append(
                "\t".join([str(counter) for counter in counters_plain]))

        options.stdout.write("\t".join(fields) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every PSL match read from ``options.stdin`` the wiggle values of
    the aligned target positions are collected and their distributional
    parameters are written as one tab-separated row to ``options.stdout``.
    Matches without any value are counted in ``nskipped``.

    Raises:
        IOError: if no file matches the ``--wiggle-files`` expression.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped", action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values", action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    # forward argv so that an explicitly supplied argument list is honoured
    (options, args) = E.Start(parser, add_pipe_options=True, argv=argv)

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    # read from options.stdin rather than sys.stdin so that the input
    # stream configured through the pipe options is the one consumed
    iterator = Blat.BlatIterator(options.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" % ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while 1:

        if options.test and ninput >= options.test:
            break

        # The builtin next() does not depend on a Python-2-only .next()
        # method. The default ends the loop whether the iterator returns
        # None or raises StopIteration at the end of the input.
        match = next(iterator, None)

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand
        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        f = alignlib_lite.py_AlignmentFormatBlat("%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
            match.mSbjctFrom,
            match.mSbjctTo,
            match.mQueryFrom,
            match.mQueryTo,
            match.mSbjctBlockStarts,
            match.mQueryBlockStarts,
            match.mBlockSizes))

        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        # keep only values at positions that map onto the query
        values = []
        for start, chunk in data:
            for position, value in enumerate(chunk, start):
                if map_genome2query.mapRowToCol(position) >= 0:
                    values.append(value)

        if len(values) == 0:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts the chain blocks read from ``options.stdin`` into PSL
    records on ``options.stdout``. A block is a line starting with
    ``chain`` followed by its alignment lines; comment lines (``#``) and
    blank lines are ignored.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0

    def chain_iterator(infile):
        """yield the lines of one chain block at a time from *infile*."""
        lines = []
        # iterate the stream that was passed in, not options.stdin
        for line in infile:

            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        # nothing to yield for empty input; an empty block would fail
        # on lines[0] in the caller
        if lines:
            yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1
        psl = Blat.Match()

        # split() ignores the line terminator; slicing off the last
        # character dropped data when the final line had no newline
        (_,
         _,
         psl.mSbjctId,
         target_length,
         target_strand,
         target_start,
         target_end,
         psl.mQueryId,
         query_length,
         query_strand,
         query_start,
         query_end,
         alignment_id) = lines[0].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start,
              query_end,
              query_length,
              target_start,
              target_end,
              target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        # every line but the last holds: block size, target gap, query gap
        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line.split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        # the last line holds only the size of the final block;
        # int() tolerates surrounding whitespace
        size = int(lines[-1])

        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = \
                psl.mQueryLength - psl.mQueryTo, \
                psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Applies the methods selected with ``--method`` to the PSL stream on
    ``options.stdin``.  Methods that do their own reading and writing
    (map, filter-keep, filter-remove, merge, add-sequence, complement,
    select-query) stop the method chain; the remaining methods wrap the
    match iterator, whose output is written to ``options.stdout``.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge",
                               "add-sequence", "complement",
                               "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query",
                               "sanitize",
                               "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="""action to perform [default=%default].""")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length",
                      dest="complement_min_length", type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    # Hand the (normalised) argument vector on; otherwise an explicitly
    # supplied *argv* would be silently ignored.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
            (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    # A header is written only for "table" and "full"; None and "none"
    # both mean "no header".  (The former outer guard
    # ``is not None or != "none"`` was always true.)
    if options.header == "table":
        options.stdout.write("\t".join(Blat.FIELDS) + "\n")
    elif options.header == "full":
        options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:

        # -- methods that process the stream themselves: stop afterwards
        if method == "map":
            pslMap(options)
            break
        elif method == "filter-keep":
            pslFilter(options, keep=True)
            break
        elif method == "filter-remove":
            pslFilter(options, keep=False)
            break
        elif method == "merge":
            pslMerge(options)
            break
        elif method == "add-sequence":
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif method == "complement":
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif method == "select-query":
            pslSelectQuery(options)
            break

        # -- methods that wrap the iterator: can be chained
        elif method == "test":
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif method == "rename-query":
            iterator = iterator_rename_query(iterator, options)
        elif method == "sanitize":
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "filter-fasta":
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "remove-overlapping-query":
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif method == "remove-overlapping-target":
            iterator = iterator_filter_overlapping_target(iterator, options)

    # write whatever is left in the (possibly wrapped) iterator
    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.Stop()
def pslMerge(options):
    """merge psl alignments.

    Reads matches from ``sys.stdin``.  Consecutive matches sharing
    query id, target id and strand are collected into a group; each
    group is reduced to a chain of non-overlapping, co-linear matches
    and written as a single PSL line to ``options.stdout``.

    :param options: parsed command line options; the attributes read
        here are ``test``, ``report_step``, ``loglevel``, ``stdlog``
        and ``stdout``.
    """

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, ndiscarded, nskipped = 0, 0, 0, 0

    last_query = None
    last_target = None
    last_strand = None

    def process(matches):
        """Merge one group of matches, write the result and return 1."""

        new = matches[0].copy()

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        # Nodes 0..n-1 are the matches, n is an artificial source and
        # n+1 an artificial sink.
        graph = networkx.DiGraph()
        graph.add_nodes_from(range(len(matches) + 2))

        matches.sort(key=lambda x: x.mQueryFrom)

        # Predicate deciding whether ``b`` may follow ``a`` on the target.
        if Genomics.IsPositiveStrand(matches[0].strand):
            def is_colinear(a, b):
                return a.mSbjctTo < b.mSbjctFrom
        else:
            def is_colinear(a, b):
                return a.mSbjctFrom > b.mSbjctTo

        for x, xx in enumerate(matches):
            if options.loglevel >= 6:
                options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

            for y in range(x + 1, len(matches)):
                yy = matches[y]
                # d > 0: overlap on the query; d <= 0: gap of size -d
                d = min(xx.mQueryTo, yy.mQueryTo) - \
                    max(xx.mQueryFrom, yy.mQueryFrom)
                if d > 0 or not is_colinear(xx, yy):
                    continue
                # Edge attributes are passed as keywords: the positional
                # attribute dict is only accepted by NetworkX 1.x.
                graph.add_edge(x, y, weight=-d)

        source = len(matches)
        target = len(matches) + 1
        for x, xx in enumerate(matches):
            # cost of starting / ending the chain at this match is the
            # amount of query left uncovered at either end
            graph.add_edge(source, x, weight=xx.mQueryFrom)
            graph.add_edge(x, target,
                           weight=xx.mQueryLength - xx.mQueryTo)

        if options.loglevel >= 6:
            networkx.write_edgelist(graph, options.stdlog)

        path = networkx.dijkstra_path(graph, source, target)

        if options.loglevel >= 6:
            options.stdlog.write("# path: %s\n" % (str(path)))

        # strip artificial source and sink
        new_matches = [matches[x] for x in path[1:-1]]

        if len(matches) != len(new_matches):
            E.warn(("query=%s, target=%s, strand=%s: "
                    "removed overlapping/out-of-order segments: "
                    "before=%i, after=%i") %
                   (matches[0].mQueryId,
                    matches[0].mSbjctId,
                    matches[0].strand,
                    len(matches),
                    len(new_matches)))

        matches = new_matches

        for match in matches:
            m = match.getMapQuery2Target()
            alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

        new.fromMap(map_query2target, use_strand=True)

        options.stdout.write(str(new) + "\n")
        options.stdout.flush()
        return 1

    while 1:
        match = next(iterator)
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 10:
            options.stdlog.write("# input: %s\n" % (str(match)))

        if ninput % options.report_step == 0:
            E.info("progress: ninput=%i, noutput=%i" % (ninput, noutput))

        # a change in query, target or strand closes the current group
        if match.mQueryId != last_query or \
                match.strand != last_strand or \
                match.mSbjctId != last_target:
            if last_query:
                noutput += process(matches)
            matches = []
            last_query, last_target, last_strand = (
                match.mQueryId, match.mSbjctId, match.strand)

        matches.append(match)

    # flush the last group
    if last_query:
        noutput += process(matches)

    E.info("ninput=%i, noutput=%i, nskipped=%i, ndiscarded=%i" %
           (ninput, noutput, nskipped, ndiscarded))
forward_query = False, ) (options, args) = E.Start( parser ) if options.filename_query: query = IndexedFasta.IndexedFasta( options.filename_query ) if options.filename_target: target = IndexedFasta.IndexedFasta( options.filename_target ) if options.method == "full": getAlignment = getAlignmentFull id = 0 for match in Blat.iterator( options.stdin ): if options.loglevel >= 2: options.stdout.write("# %s\n" % str(match)) m = match.getMapQuery2Target() m.moveAlignment( -min(match.mQueryBlockStarts), -min(match.mSbjctBlockStarts) ) q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo ) t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo ) query_ali, sbjct_ali = getAlignment( m, q, t, options ) if match.strand == "-" and options.forward_query: query_ali = Genomics.complement( query_ali ) sbjct_ali = Genomics.complement( sbjct_ali ) options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % \ (options.query_prefix,
def main(argv=None):
    """script main.

    Converts maq matches read from ``options.stdin`` into PSL lines on
    ``options.stdout``.  If a coordinates file is given, each match is
    paired with a segment from that file and written as a two-block
    alignment spanning the segment's left and right parts.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: maq2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-c", "--filename-coordinates",
                      dest="filename_coordinates", type="string",
                      help="filename with coordinates.")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename pattern for additional data [%default].")

    parser.set_defaults(
        genome_file="genome",
        filename_coordinates=None,
        segment_length=32,
    )

    # Pass the normalised argument vector on so that an explicit *argv*
    # is actually honoured.
    (options, args) = E.Start(parser, argv=argv)

    if options.genome_file:
        genome = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome = None

    ninput, noutput = 0, 0

    if options.filename_coordinates:

        segment_length = options.segment_length

        # The segment iterator is consumed while looping, so the file has
        # to stay open for the whole loop; ``with`` closes it afterwards
        # (and on error).
        with open(options.filename_coordinates, "r") as coordinates:

            a = matchby_sequence(
                iterator_segments(coordinates, options.segment_length),
                Maq.iterator(options.stdin),
                lambda x: (x.mSegment),
                lambda x: (x.contig))

            for segments, maqs in a:

                pairs = match_smaller(segments, maqs,
                                      lambda x: x.start,
                                      lambda x: x.start)

                for segment, maq in pairs:

                    ninput += 1

                    assert maq.start >= segment.start, \
                        "maq start < segment start: %i < %i" % (
                            maq.start, segment.start)
                    assert maq.start + maq.mLength <= \
                        segment.start + 2 * segment_length, \
                        "maq end > segment end: %i > %i" % (
                            maq.start + maq.mLength,
                            segment.start + 2 * segment_length)

                    psl = Blat.Match()
                    psl.fromMaq(maq)

                    match_start = maq.start
                    segment_start = segment.start
                    contig, left_start, right_start = \
                        segment.contig, segment.mLeftStart, segment.mRightStart

                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# mapping: name=%s, match_start=%i, segment=%s\n" %
                            (maq.contig, match_start, str(segment)))

                    # build positions of the two blocks
                    left_size = segment_length - (match_start - segment_start)
                    right_size = segment_length - left_size

                    mapped1_start = left_start + match_start - segment_start
                    mapped1_end = left_start + segment_length
                    mapped2_start = right_start
                    mapped2_end = right_start + right_size

                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# mapped: match_start=%i, segment_start=%i, left_size=%i, right_size=%i, mapped1=(%i-%i), mapped2=(%i-%i)\n" %
                            (match_start, segment_start,
                             left_size, right_size,
                             mapped1_start, mapped1_end,
                             mapped2_start, mapped2_end))

                    psl.mSbjctId = contig
                    if genome:
                        psl.mSbjctLength = genome.getLength(contig)
                    psl.mSbjctFrom = mapped1_start
                    psl.mSbjctTo = mapped2_end
                    psl.mNBlocks = 2
                    psl.mBlockSizes = [left_size, right_size]
                    psl.mQueryBlockStarts = [0, left_size]
                    psl.mSbjctBlockStarts = [mapped1_start, mapped2_start]
                    psl.mSbjctNGapsCounts = 1
                    psl.mSbjctNGapsBases = mapped2_start - mapped1_end

                    options.stdout.write(str(psl) + "\n")
                    noutput += 1

    else:
        # no coordinates: plain one-to-one conversion
        for maq in Maq.iterator(options.stdin):
            ninput += 1

            psl = Blat.Match()
            psl.fromMaq(maq)

            options.stdout.write(str(psl) + "\n")
            noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i\n" % (ninput, noutput))

    E.Stop()
def main():
    """Write low-quality regions of aligned genes as mask intervals.

    Loads a PSL map (gene -> genome) from ``options.filename_map`` and
    quality scores from ``options.quality_file``.  For every
    ``cluster_id<TAB>gene_id<TAB>alignment`` line on ``options.stdin``
    the alignment columns whose genomic base has a quality below
    ``options.quality_threshold`` are collected (extended to whole
    frames if ``options.frame`` > 1) and written to ``options.stdout``
    as ``cluster_id, start, end`` rows.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--random-proportion", dest="random_proportion",
                      type="float",
                      help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option("--random", dest="random", action="store_true",
                      help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    map_genes2genome = {}
    # ``with`` guarantees the handle is released even if the
    # duplicate-entry assertion below fires.
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, \
                "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(
            match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(
            map_mali2genome, map_gene2mali, map_gene2genome,
            alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp: column position counted from the end of the alignment;
        # this is the reported position for negative-strand genes
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                    (cluster_id, p, c, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the whole frame (e.g. codon) containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write(
                "%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" %
           (ninput, noutput, nmissed))

    E.Stop()
def main(argv=None):
    """script main.

    Converts BED intervals read from ``options.stdin`` into PSL lines
    on ``options.stdout``.  Entries with a ``blockSizes`` field become
    multi-block matches; all others become a single block.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-q", "--query", dest="query", type="string",
                      help="sequence to use for query [default=%default].")

    parser.add_option("-t", "--target", dest="target", type="string",
                      help="sequence to use for target [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.set_defaults(
        genome_file=None,
        query=None,
        target=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    def parse_int_list(field):
        """Parse a comma-separated list of integers.

        The trailing comma is optional in BED block fields; empty items
        are ignored instead of blindly dropping the last element, which
        would lose a block whenever the comma is absent.
        """
        return [int(x) for x in field.split(",") if x.strip()]

    # a single Match object is re-used for all records
    psl = Blat.Match()

    for bed in Bed.iterator(options.stdin):

        ninput += 1

        start, end = bed.start, bed.end

        if "blockSizes" in bed:
            psl.mQueryId = bed["name"]
            blocksizes = parse_int_list(bed["blockSizes"])
            # block starts are relative to the interval start
            sbjctblockstarts = [
                x + start for x in parse_int_list(bed["blockStarts"])]
            strand = bed["strand"]
        else:
            psl.mQueryId = "%i" % ninput
            blocksizes = [end - start]
            sbjctblockstarts = [start, ]
            strand = "+"

        psl.mSbjctId = bed.contig
        psl.mSbjctFrom, psl.mSbjctTo = start, end
        psl.mQueryFrom, psl.mQueryTo = 0, end - start

        psl.mBlockSizes = blocksizes
        psl.mNBlocks = len(blocksizes)
        psl.strand = strand

        # query blocks are laid out back to back without gaps
        q, qp = [], 0
        for x in blocksizes:
            q.append(qp)
            qp += x

        psl.mQueryBlockStarts = q
        psl.mSbjctBlockStarts = sbjctblockstarts
        psl.mQueryLength = sum(psl.mBlockSizes)
        if fasta:
            psl.mSbjctLength = fasta.getLength(bed.contig)

        options.stdout.write("%s\n" % str(psl))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main():
    """Convert PSL matches on ``sys.stdin`` to GFF/GTF exon records.

    Every aligned block of a match is written to ``options.stdout`` as
    one ``exon`` feature.  A query id that occurs more than once gets
    a ``:<n>`` suffix so that identifiers stay unique.  Strand is taken
    from the optional strand file if the identifier is listed there,
    otherwise from the match.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: psl2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    # optparse only expands the lower-case token %default
    parser.add_option(
        "-s", "--filename-strand", dest="filename_strand", type="string",
        help="set strand information according to file [default=%default].")

    parser.set_defaults(as_gtf=False,
                        filename_strand=None,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_strand:
        # NOTE(review): assumes IOTools.readMap reads the whole file
        # before returning - confirm; the handle is closed right after.
        with open(options.filename_strand, "r") as infile:
            map_id2strand = IOTools.readMap(infile)
    else:
        map_id2strand = {}

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    # the same entry type is used for gff and gtf output
    gff = GTF.Entry()
    gff.source = "psl"
    gff.feature = "exon"

    # number of times each query id has been seen so far
    ids = {}

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if match.mQueryId not in ids:
            ids[match.mQueryId] = 1
            entry_id = match.mQueryId
        else:
            entry_id = match.mQueryId + ":%i" % ids[match.mQueryId]
            ids[match.mQueryId] += 1

        gff.contig = match.mSbjctId
        if options.as_gtf:
            gff.gene_id = entry_id
            gff.transcript_id = entry_id
        else:
            gff.clearAttributes()
            gff.addAttribute("gene_id", entry_id)

        if entry_id in map_id2strand:
            gff.strand = map_id2strand[entry_id]
        else:
            gff.strand = match.strand

        for qstart, sstart, size in match.getBlocks():
            gff.start = sstart
            gff.end = sstart + size
            options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def main(argv=sys.argv):
    """Compute per-base coverage of PSL matches and write it as a track.

    Matches are read from ``sys.stdin``; for every aligned block the
    counter of each covered target position is incremented.  Runs of
    equal, non-zero coverage are written in wiggle (variableStep) or
    bedgraph layout, either to ``options.stdout`` or - for
    bigwig/bigbed - to a temporary file that is converted with
    ``wigToBigWig``/``bedToBigBed``.

    :raises ValueError: if no genome file is given (contig sizes are
        required to allocate the counters) or if bigwig/bigbed output
        is requested without an output filename.
    :raises OSError: if the conversion executable is not in the path.
    :return: ``None`` on success; the signal number if the converter
        was killed by a signal, its exit status if it failed, or 1 if
        it could not be executed.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle.py 2834 2009-11-24 16:11:23Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-b", "--output-filename-pattern",
                      dest="output_filename", type="string",
                      help="filename for output [default=%default]")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig"),
                      help="output format [default=%default]")

    parser.set_defaults(genome_file=None,
                        typecode=numpy.int16,
                        output_filename=None,
                        output_format="wiggle",
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    typecode = options.typecode

    # Without contig sizes no counters can be allocated; fail with a
    # clear message instead of a NameError on the first match.
    if not options.genome_file:
        raise ValueError(
            "please supply a genome file: contig sizes are required "
            "to allocate the coverage counters.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    counts = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * typecode().itemsize))
    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        counts[contig] = numpy.zeros(size, typecode)

    E.info("allocated memory for %i contigs" % len(fasta))

    needs_conversion = options.output_format in ("bigwig", "bigbed")

    if needs_conversion:

        if not options.output_filename:
            raise ValueError(
                "please output file for bigwig/bigbed computation.")

        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            executable_name = "bedToBigBed"

        executable = IOTools.which(executable_name)

        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        tmpdir = tempfile.mkdtemp()
        E.debug("temporary files are in %s" % tmpdir)

        tmpfile_wig = os.path.join(tmpdir, "wig")
        tmpfile_sizes = os.path.join(tmpdir, "sizes")

        # write contig sizes
        with open(tmpfile_sizes, "w") as outfile_size:
            for contig, size in contig_sizes.items():
                outfile_size.write("%s\t%s\n" % (contig, size))

        outfile = open(tmpfile_wig, "w")

    else:
        outfile = options.stdout

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, ncontigs, nskipped = 0, 0, 0

    E.info("started counting")

    while 1:

        if options.test and ninput >= options.test:
            break

        match = iterator.next()

        if match is None:
            break

        ninput += 1

        contig = match.mSbjctId

        for start, length in zip(match.mSbjctBlockStarts,
                                 match.mBlockSizes):
            counts[contig][start:start + length] += 1

    E.info("finished counting")

    def covered_runs(vals):
        """Yield (start, end, value) for runs of equal, non-zero coverage.

        *end* is the index of the last position in the run.  Runs are
        walked lazily so that long stretches are never materialised as
        a list.
        """
        for val, run in itertools.groupby(enumerate(vals),
                                          lambda x: x[1]):
            if not val > 0:
                continue
            start = end = None
            for pos, _ in run:
                if start is None:
                    start = pos
                end = pos
            yield start, end, val

    # The option value is "wiggle"; testing for "wig" meant that the
    # default output format never wrote anything.
    if options.output_format in ("wiggle", "bigwig"):

        E.info("starting wig output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for start, end, val in covered_runs(vals):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, end - start + 1))
                outfile.write("%i\t%i\n" % (start, val))

            ncontigs += 1

    elif options.output_format in ("bedgraph", "bigbed"):

        E.info("starting bedgraph output")

        for contig, vals in counts.items():

            E.debug("output for %s" % contig)
            for start, end, val in covered_runs(vals):
                outfile.write("%s\t%i\t%i\t%i\n" %
                              (contig, start, end + 1, val))

            ncontigs += 1

    E.info("finished output")

    if needs_conversion:
        outfile.close()

        E.info("starting bigwig conversion")
        try:
            # argument list instead of a shell string: file names with
            # spaces or shell metacharacters are passed through intact
            retcode = subprocess.call(
                [executable,
                 tmpfile_wig,
                 tmpfile_sizes,
                 os.path.abspath(options.output_filename)])
        except OSError as msg:
            E.warn("Error while executing bigwig: %s" % msg)
            return 1
        finally:
            # remove the temporary files on every exit path
            shutil.rmtree(tmpdir)

        if retcode < 0:
            E.warn("%s terminated with signal: %i" %
                   (executable_name, -retcode))
            return -retcode
        if retcode > 0:
            E.warn("%s failed with exit status: %i" %
                   (executable_name, retcode))
            return retcode

        E.info("finished bigwig conversion")
counters = [] for method in options.methods: if method == "counts": counters.append( SequencePairProperties.SequencePairPropertiesCountsNa() ) elif method == "query-counts": counters.append( QueriesCounter() ) elif method == "sbjct-counts": counters.append( SbjctsCounter() ) elif method == "baseml": counters.append( SequencePairProperties.SequencePairPropertiesBaseML( options ) ) elif method == "match": counters_plain.append( CounterMatch( options ) ) if counters: iterator = Blat.iterator_pslx( options.stdin ) header = "\t".join(Blat.MatchPSLX().getHeaders()) else: iterator = Blat.iterator( options.stdin ) header = "\t".join(Blat.Match().getHeaders()) if not options.with_match: header = "qName" options.stdout.write( "\t".join( [header,] + [ "\t".join(x.getHeaders()) for x in counters] + [ "\t".join(x.getHeaders()) for x in counters_plain] ) + "\n" ) ninput, noutput, nskipped = 0, 0, 0
def main(argv=None):
    """script main.

    Converts GFF/GTF intervals read from ``sys.stdin`` into PSL lines
    on ``options.stdout``.  The combined intervals of each group are
    laid out back to back on the query and mapped to their genomic
    positions on the target.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default].""")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # Pass the normalised argument vector on so that an explicit *argv*
    # is actually honoured.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(g.start, g.end) for g in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        # Identifiers are taken from the last entry of the group.  This
        # used to rely on the list-comprehension variable leaking into
        # the enclosing scope, which only happens in Python 2.
        # NOTE(review): assumes the iterator yields indexable groups
        # (lists) - confirm.
        last = gffs[-1]

        entry = Blat.Match()
        entry.mQueryId = last.transcript_id
        entry.mSbjctId = last.contig
        entry.strand = last.strand

        # Use the real sequence length where it can be looked up and
        # fall back to the extent of the alignment otherwise, so that a
        # length is always set.
        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """Write low-quality regions of aligned genes as mask intervals.

    Loads a PSL map (gene -> genome) from ``options.filename_map`` and
    quality scores from ``options.quality_file``.  For every
    ``cluster_id<TAB>gene_id<TAB>alignment`` line on ``options.stdin``
    the alignment columns whose genomic base has a quality below
    ``options.quality_threshold`` are collected (extended to whole
    frames if ``options.frame`` > 1) and written to ``options.stdout``
    as ``cluster_id, start, end`` rows.

    *argv* is accepted for interface compatibility; it is not used by
    this function.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "--random-proportion", dest="random_proportion", type="float",
        help="mask randomly columns in multiple alignments [default=%default]")

    parser.add_option(
        "--random", dest="random", action="store_true",
        help="shuffle quality scores before masking [default=%default]")

    parser.set_defaults(
        quality_threshold=40,
        quality_file="quality",
        filename_map=None,
        frame=3,
    )

    (options, args) = E.Start(parser)

    ##################################################
    # read map
    ##################################################
    map_genes2genome = {}
    # The context manager releases the handle even when the
    # duplicate-entry assertion below aborts the loop.
    with open(options.filename_map) as infile:
        for match in Blat.iterator(infile):
            assert match.mQueryId not in map_genes2genome, \
                "duplicate entry %s" % match.mQueryId
            map_genes2genome[match.mQueryId] = match

    ##################################################
    # get quality scores
    ##################################################
    quality = IndexedFasta.IndexedFasta(options.quality_file)
    quality.setTranslator(IndexedFasta.TranslatorBytes())

    ##################################################
    # main loop
    ##################################################
    ninput, noutput, nmissed = 0, 0, 0

    options.stdout.write("cluster_id\tstart\tend\n")

    for line in options.stdin:
        if line.startswith("cluster_id"):
            continue
        ninput += 1
        cluster_id, gene_id, alignment = line[:-1].split("\t")

        if gene_id not in map_genes2genome:
            nmissed += 1
            E.warn("gene_id %s not found in map." % gene_id)
            continue

        match = map_genes2genome[gene_id]
        map_gene2genome = match.getMapQuery2Target()
        is_negative = match.strand == "-"

        # if strand is negative, the coordinates are
        # on the negative strand of the gene/query
        # in order to work in the right coordinate system
        # revert the sequence
        if is_negative:
            alignment = alignment[::-1]

        # get map of gene to alignment
        map_gene2mali = alignlib_lite.py_makeAlignmentVector()
        fillAlignment(map_gene2mali, alignment)

        # get quality scores
        quality_scores = quality.getSequence(match.mSbjctId, "+",
                                             match.mSbjctFrom,
                                             match.mSbjctTo)

        map_mali2genome = alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_combineAlignment(map_mali2genome,
                                          map_gene2mali,
                                          map_gene2genome,
                                          alignlib_lite.py_RR)

        # shuffle quality scores, but only those that are aligned
        if options.random:
            positions = []
            for fp, c in enumerate(alignment):
                if c == "-":
                    continue
                y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
                if y < 0:
                    continue
                positions.append(y)
            scores = [quality_scores[x] for x in positions]
            random.shuffle(scores)
            for p, q in zip(positions, scores):
                quality_scores[p] = q

        to_mask = []
        # rp: column position counted from the end of the alignment;
        # this is the reported position for negative-strand genes
        rp = len(alignment)
        for fp, c in enumerate(alignment):
            rp -= 1
            if c == "-":
                continue
            y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom
            if y < 0:
                continue
            if quality_scores[y] < options.quality_threshold:
                if is_negative:
                    p = rp
                else:
                    p = fp
                E.debug(
                    "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" %
                    (cluster_id, p, c, match.mSbjctId, match.strand,
                     map_mali2genome.mapRowToCol(fp), quality_scores[y]))
                if options.frame > 1:
                    # mask the whole frame (e.g. codon) containing p
                    start = (p // options.frame) * options.frame
                    to_mask.extend(range(start, start + options.frame))
                else:
                    to_mask.append(p)

        regions = Iterators.group_by_distance(sorted(to_mask))

        for start, end in regions:
            options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmissed=%i" %
           (ninput, noutput, nmissed))

    E.Stop()
(options, args) = E.Start( parser, add_pipe_options = True ) if options.filename_queries: query_fasta = IndexedFasta.IndexedFasta( options.filename_queries ) else: query_fasta = None if options.filename_sbjcts: sbjct_fasta = IndexedFasta.IndexedFasta( options.filename_sbjcts ) else: sbjct_fasta = None if "add-sequence" in options.methods and (sbjct_fasta == None or query_fasta == None): raise ValueError( "please supply both indexed query and target/genome sequence data." ) iterator = Blat.iterator( options.stdin ) if options.header != None or options.header != "none": if options.header == "table": options.stdout.write( "\t".join( Blat.FIELDS ) + "\n" ) elif options.header == "full": options.stdout.write( Blat.HEADER + "\n" ) for method in options.methods: if "map" == method: pslMap( options ) break elif "filter-keep" == method: pslFilter( options, keep = True ) break
def main(argv=None):
    """Align pairs of sequences and write the results.

    Parses the command line, picks an input iterator depending on the
    positional arguments and the ``--filename-sequences1/2`` options,
    opens one output file per requested output format and then aligns
    every input pair with every requested method.

    Arguments
    ---------
    argv : list, optional
        Command line arguments; defaults to ``sys.argv``.
        NOTE(review): argv is not forwarded to ``E.Start`` in this
        block - confirm that ``E.Start`` reads ``sys.argv`` itself.

    Raises
    ------
    ValueError
        If more than two positional arguments are given together with
        ``--filename-sequences1`` and ``--filename-sequences2``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: align_pairs.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--skip-statistics", dest="skip_stats",
                      action="store_true",
                      help="do not compute alignment statistics [%default].")

    parser.add_option("--method", dest="methods", type="choice",
                      action="append",
                      choices=("dialign", "clustal", "blastz", "nw", "sw",
                               "dba", "dialignlgs"),
                      help="alignment method [%default].")

    parser.add_option("--anchor-alignment", dest="anchor_alignment",
                      type="int",
                      help="anchor alignment with xxx residues [%default].")

    # The help text used to be a copy of the --anchor-alignment one.
    parser.add_option("--output-format", dest="output_formats",
                      type="choice", action="append",
                      choices=("fasta", "stats", "psl"),
                      help="output format, can be given multiple times "
                      "[%default].")

    parser.add_option("--input-format", dest="input_format", type="choice",
                      choices=("fasta", "list"),
                      help="input format of stdin [%default].")

    parser.add_option("--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="output pattern for multiple files [%default].")

    parser.add_option("--filename-sequences1", dest="filename_sequences1",
                      type="string",
                      help="first indexed input filename with sequences "
                      "[%default].")

    parser.add_option("--filename-sequences2", dest="filename_sequences2",
                      type="string",
                      help="second indexed input filename with sequences "
                      "[%default].")

    parser.add_option("--options-blastz", dest="options_blastz",
                      type="string",
                      help="command line options for blastz [%default].")

    parser.set_defaults(
        skip_stats=False,
        methods=[],
        output_formats=[],
        input_format="fasta",
        output_filename_pattern=None,
        filename_sequences1=None,
        filename_sequences2=None,
        anchor_alignment=0,
        options_blastz="C=2 B=1 T=0 W=6 K=2200")

    (options, args) = E.Start(parser, add_pipe_options=True)

    # print(...) with a single argument behaves the same on python 2 and 3.
    if not options.methods:
        print(USAGE)
        print("please specify an alignment method.")
        sys.exit(1)

    if not options.output_formats:
        print(USAGE)
        print("please specify at least one output format.")
        sys.exit(1)

    # Select the input: two fasta files, a list of pairs referring to two
    # indexed sequence files, or a single fasta stream on stdin.
    if len(args) == 2:
        iterator = iterate_double_fasta(args[0], args[1])
    elif options.filename_sequences1 and options.filename_sequences2:
        if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
            infile = options.stdin
        elif len(args) == 1:
            infile = open(args[0], "r")
        else:
            # Previously `infile` stayed unbound here and the call below
            # failed with an UnboundLocalError.
            raise ValueError(
                "expected at most one positional argument with "
                "--filename-sequences1/2, got %i" % len(args))

        iterator = iterate_list(infile,
                                options.filename_sequences1,
                                options.filename_sequences2)
    else:
        iterator = iterate_single_fasta(options.stdin)

    npairs, ntoken_pairs = 0, 0
    ninput, nskipped, nerrors = 0, 0, 0

    outfile_table = None
    outfile_fasta = None
    outfile_psl = None

    # NOTE(review): --output-format only offers fasta/stats/psl, so this
    # "table" branch cannot be selected from the command line; presumably
    # "stats" was meant. The section name given to getFile also carries a
    # trailing space. Both are left untouched pending confirmation.
    if "table" in options.output_formats:
        outfile_table = getFile("table ", options)
        # Header text is runtime output and is kept as it was.
        outfile_table.write("""# CATEGORY: category [intron|exon]
# METHOD: alignment method
# TOKEN: name
# ID: segment id
# TOTAL: number of segments
# LEN: length of segment
# NALIGNED: number of aligned positions
# PALIGNED: percentage of aligned positions
# IDENT: number of identical positions
# TRANSIT: number of transitions
# TRANSVERS: number of transversion
# MATCHES: number of matching positions
# PIDENT: percentage of identical positions
# PTRANSIT: precentage of transitions
# PTRANSVERS: precentage of transversion
# BLOCKSIZES: alignment, length of blocks
# GAPS: gap sizes in sequence 1/2
CATEGORY\tMETHOD\tTOKEN1\tID1\tTOTAL1\tLEN1\tTOKEN2\tID2\tTOTAL2\tLEN2\tNALIGNED\tPALIGNED\tIDENT\tTRANSIT\tTRANSVER\tMATCHES\tPIDENT\tPTRANSVIT\tPTRANVER\tBLOCKSIZES\tGAPSIZES\tGAPSIZES\tTYPE1\tTYPE2\n""")

    if "fasta" in options.output_formats:
        outfile_fasta = getFile("fasta", options)

    if "psl" in options.output_formats:
        outfile_psl = getFile("psl", options)

    # setup alignment objects
    for unaligned_pair in iterator:

        ninput += 1

        for method in options.methods:

            pair = AlignedPairs.AlignedPair(unaligned_pair)
            pair.mOptionsBlastZ = options.options_blastz

            try:
                pair.Align(method, anchor=options.anchor_alignment)
            except AlignedPairs.AlignmentError as msg:
                # A failed alignment is logged and counted as skipped;
                # the remaining methods are still tried for this pair.
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# %s - %s: %s\n" %
                        (msg, unaligned_pair.mToken1,
                         unaligned_pair.mToken2))
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# input=%s\n" % (str(unaligned_pair)))
                nskipped += 1
                continue

            if outfile_table:
                outfile_table.write(str(pair) + "\n")

            if outfile_fasta:
                outfile_fasta.write(
                    ">%s\n%s\n>%s\n%s\n" %
                    (pair.mToken1, pair.mAlignedSequence1,
                     pair.mToken2, pair.mAlignedSequence2))

            if outfile_psl:
                entry = Blat.Match()
                entry.mQueryId, entry.mSbjctId = pair.mToken1, pair.mToken2
                entry.strand = pair.strand
                entry.fromMap(pair.mAlignment)
                outfile_psl.write(str(entry) + "\n")

            npairs += 1