def main(argv=None):
    """Write FASTA sequences for the bed intervals read from stdin.

    Bed records are read from ``options.stdin``; the sequence of each
    record is fetched from the indexed genome given by ``--genome-file``,
    passed through ``Masker.maskSequences`` and written to
    ``options.stdout`` in fasta format.

    ``--output-mode`` selects what is emitted per bed record:

    * ``intervals``: one sequence spanning ``start..end``.
    * ``segments``: for 12-column records, the concatenation of the
      sequences returned for ``bed.toIntervals()``; other records are
      treated like ``intervals``.
    * ``leftright``: two sequences (``<name>_l`` and ``<name>_r``), each as
      long as the interval, directly to its left and right, clipped at 0
      and at the contig length.

    Unless ``--use-strand`` is given, all sequences are retrieved from the
    ``+`` strand.

    The options ``--min-sequence-length``, ``--max-sequence-length``,
    ``--extend-at`` and ``--extend-by`` are parsed but not used here.

    Raises:
        ValueError: if no genome file has been supplied.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genomic sequence to retrieve "
        "sequences from.")

    parser.add_option(
        "-m", "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker to mask output sequences "
        "[%default].")

    parser.add_option(
        "--output-mode", dest="output_mode", type="choice",
        choices=("intervals", "leftright", "segments"),
        help="what to output. "
        "'intervals' generates a single sequence for "
        "each bed interval. 'leftright' generates two "
        "sequences, one in each direction, for each bed "
        "interval. 'segments' can be used to output "
        "sequence from bed12 files so that sequence only covers "
        "the segements [%default]")

    parser.add_option(
        "--min-sequence-length", dest="min_length", type="int",
        help="require a minimum sequence length [%default]")

    parser.add_option(
        "--max-sequence-length", dest="max_length", type="int",
        help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    # NOTE(review): argv is not forwarded to E.Start, so a caller-supplied
    # argv is ignored - confirm whether E.Start accepts it.
    (options, args) = E.Start(parser)

    # Fail early with a clear message instead of a NameError on the first
    # bed record.
    if not options.genome_file:
        raise ValueError(
            "please supply an indexed genome with --genome-file")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        strand = "+" if options.ignore_strand else bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif options.output_mode in ("intervals", "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            # the contig length is only needed to clip the right flank
            lcontig = fasta.getLength(bed.contig)
            width = bed.end - bed.start

            # left flank: same width as the interval, clipped at 0
            start, end = max(0, bed.start - width), bed.end - width
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # right flank: same width, clipped at the contig end
            start, end = bed.start + width, min(lcontig, bed.end + width)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
def main(argv=None):
    """Build, query, benchmark or verify an indexed fasta database.

    The first positional argument is the database; the mode is selected by
    the options, tested in this order:

    * ``--extract``: write the requested region in fasta format.
    * ``--benchmark``: time random fragment access with ``timeit``.
    * ``--verify``: compare the database against another one, in both
      directions, and report the error counts.
    * ``--compress-index``: compress the index of an existing database.
    * otherwise: create the database from the fasta files given as the
      remaining positional arguments.

    Prints the module documentation and exits with status 1 if the
    required positional arguments are missing.

    Raises:
        ValueError: if the translator name is unknown.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: IndexedFasta.py 2801 2009-10-22 13:40:39Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is contig:strand:from:to. "
        "The default coordinates are 0-based open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return bases 11 to 12 on chr1.")

    parser.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug"),
        help="compress database [default=%default].")

    parser.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="save random access points every # number of nucleotides [default=%default].")

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("one-forward-open", "zero-both-open"),
        help="coordinate format of input [default=%default].")

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms, comma separated with =, for example, chr1=chr1b [default=%default]")

    parser.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access [default=%default].")

    parser.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark [default=%default].")

    parser.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")

    parser.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")

    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=("fasta", "auto", "fasta.gz", "tar", "tar.gz"),
        help="file format of input. Supply if data comes from stdin [default=%default].")

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances of an identifier are suffixed by an '_%i' [default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the identifier from fasta description line [default=%default].")

    parser.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index [default=%default].")

    parser.add_option(
        "--force", dest="force", action="store_true",
        help="force overwriting of existing files [default=%default].")

    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=("solexa", "phred", "bytes", "range200"),
        help="translate numerical quality scores [default=%default].")

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    # NOTE(review): argv is not forwarded to E.Start, so a caller-supplied
    # argv is ignored - confirm whether E.Start accepts it.
    (options, args) = E.Start(parser)

    # Map each synonym key to the list of names given for it
    # ("a=b,a=c" -> {"a": ["b", "c"]}).
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, value = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(value.strip())
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    # NOTE(review): the translator classes are referenced unqualified
    # here - confirm they are in scope in this module.
    if options.translator:
        if options.translator == "phred":
            options.translator = TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = TranslatorBytes()
        elif options.translator == "range200":
            options.translator = TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    # Every mode needs the database name as first positional argument.
    if not args:
        print(globals()["__doc__"])
        sys.exit(1)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup="""from __main__ import benchmarkRandomFragment,IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" %
            (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size,
                              t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)

        # verify in both directions; both calls go through the
        # IndexedFasta module
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))

        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Building a database needs the database name plus at least one
        # input file; check before anything is logged.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write(
                "# indexing the following files: \n# %s\n" %
                (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Regions are taken from ``--identifier`` (comma separated), then from
    the positional arguments, each in ``contig:strand:from:to`` notation.
    If neither yields a region, tab-separated records are read from
    ``options.stdin``; comment lines (starting with ``#``) and blank lines
    are ignored.

    With ``--input-format=list`` a record with fewer than four fields
    names a whole contig on the ``+`` strand. With
    ``--input-format=id-compressed`` the first field is an identifier and
    the second one the colon-separated location.

    Each region is grown by ``extend_region - shorten_region`` at both
    ends and clipped to ``0..contig length``; an end coordinate of 0 means
    "up to the end of the contig". Empty regions and records with
    non-integer coordinates are counted as skipped. Sequences are written
    to ``options.stdout`` in fasta format.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="pattern to look for sequence filename.")

    parser.add_option(
        "-d", "--identifier", dest="identifier", type="string",
        help="identifier(s).")

    parser.add_option(
        "-o", "--output-coordinate-format",
        dest="output_coordinate_format", type="choice",
        choices=("full", "long"),
        help="""output format of coordinates. Output format is contig:strand:from:to in zero based /forward/reverse strand coordinates in open/closed notation. 'long' includes the contig length as fifth field""")

    parser.add_option(
        "--input-format", dest="input_format", type="choice",
        choices=("list", "id-compressed"),
        help="input format.")

    parser.add_option(
        "-i", "--input-coordinate-format", dest="input_coordinate_format",
        type="choice", choices=("zero-both", "zero-forward"),
        help="coordinate format.")

    parser.add_option(
        "-e", "--extend-region", dest="extend_region", type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r", "--shorten-region", dest="shorten_region", type="int",
        help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    # NOTE(review): argv is not forwarded to E.Start, so a caller-supplied
    # argv is ignored - confirm whether E.Start accepts it.
    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    if options.identifier:
        lines.extend(x.split(":") for x in options.identifier.split(","))

    if args:
        lines.extend(x.split(":") for x in args)

    if not lines:
        for line in options.stdin:
            if line.startswith("#") or not line.strip():
                continue
            # Strip only the line terminator: slicing off the last
            # character would truncate the final record of a file that
            # does not end in a newline.
            lines.append(line.rstrip("\r\n").split("\t"))

    # both are constant for the whole run
    converter = IndexedFasta.getConverter(options.input_coordinate_format)
    margin = options.extend_region - options.shorten_region

    ninput, nskipped, noutput = 0, 0, 0
    for data in lines:

        if options.input_format == "list":
            seq_id = None
            if len(data) < 4:
                # contig only: take the complete contig
                sbjct_token = data[0]
                sbjct_strand = "+"
                sbjct_from, sbjct_to = "0", "0"
            else:
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
        elif options.input_format == "id-compressed":
            seq_id = data[0]
            sbjct_token, sbjct_strand, sbjct_from, sbjct_to = \
                data[1].split(":")

        ninput += 1

        try:
            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except ValueError:
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        sbjct_from = max(0, sbjct_from - margin)
        lcontig = fasta.getLength(sbjct_token)
        if sbjct_to != 0:
            sbjct_to = min(sbjct_to + margin, lcontig)
        else:
            # an end coordinate of 0 selects everything up to the contig end
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(sbjct_token, sbjct_strand,
                                     sbjct_from, sbjct_to,
                                     converter=converter)

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token,
                                           sbjct_strand,
                                           sbjct_from,
                                           sbjct_to)
        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token,
                                              sbjct_strand,
                                              sbjct_from,
                                              sbjct_to,
                                              lcontig)

        if seq_id:
            options.stdout.write(">%s %s\n%s\n" %
                                 (seq_id, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery.

    Intervals are read from the table <track>_intervals in the database
    *dbhandle* and their sequences are saved to *filename* in
    :term:`fasta` format. If the table holds no intervals, an empty file
    is created and nothing is returned.

    If *full* is set, the whole intervals are output. Otherwise, and if
    *halfwidth* is given, the intervals are centred and cropped to
    *halfwidth*.

    *maxsize*, *offset* and *stranded* are passed on to the sequence
    retrieval. *maxsize* is meant to truncate the output in order to
    avoid jobs that take too long.

    *proportion*, *min_sequences* and *num_sequences* are passed on to
    the truncation of the interval list: only the top *proportion* of
    intervals or the first *num_sequences* intervals are kept.

    *masker* is a list of maskers that are applied one after the other.
    Entries can be

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    The entries ``unmasked``, ``none`` and None are ignored. Leaving
    *masker* unset is the same as passing an empty list.

    If *shuffled* is set, the sequences are shuffled before masking.

    *order* is the order in which the intervals are retrieved:
    ``peakval`` (peak value, descending), ``max`` (score, descending) or
    ``random`` (no ordering in the query). Any other value raises a
    ValueError.

    If *shift* is ``leftright``, the intervals are replaced by shifted
    intervals before they are centred and cropped.

    Returns the number of sequences written.
    '''

    # None stands in for the empty list so that the default is not a
    # shared mutable object.
    maskers = [] if masker is None else masker

    cc = dbhandle.cursor()

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order == "random":
        orderby = ""
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand, peakcenter FROM %(tablename)s ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        P.touch(filename)
        return

    data = truncateList(data, track,
                        proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" %
           (track, str(maskers)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline only retrieves positive-strand regions
    # from the database. Should that change, regions on the negative
    # strand would, without this converter, be counted from the end of
    # the chromosome instead of from its beginning.
    # This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    for mask_name in maskers:
        if mask_name in ("unmasked", "none", None):
            continue
        records = list(sequences)
        sequences = records
        if not records:
            # nothing to mask
            break
        titles = [record.title for record in records]
        masked = maskSequences([record.sequence for record in records],
                               mask_name)
        sequences = [FastaRecord(title, seq)
                     for title, seq in zip(titles, masked)]

    c = E.Counter()
    # The file is opened exactly once and closed by the context manager.
    with IOTools.open_file(filename, "w") as outs:
        for record in sequences:
            c.input += 1
            if len(record.sequence) == 0:
                c.empty += 1
                continue
            outs.write(">%s\n%s\n" % (record.title, record.sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
def main(argv=None):
    """Build, query, benchmark or verify an indexed fasta database.

    The first positional argument is the database; the mode is selected by
    the options, tested in this order:

    * ``--extract``: write the requested region in fasta format.
    * ``--benchmark``: time random fragment access with ``timeit``.
    * ``--verify``: compare the database against another one, in both
      directions, and report the error counts.
    * ``--compress-index``: compress the index of an existing database.
    * otherwise: create the database from the fasta files given as the
      remaining positional arguments.

    Prints the module documentation and exits with status 1 if the
    required positional arguments are missing.

    Raises:
        ValueError: if the translator name is unknown.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands. "
        "For example, chr1:+:10:12 will return "
        "bases 11 to 12 on chr1.")

    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2",
                           "debug")
    parser.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=compression_choices,
        help="compress database, using specied compression. "
        "Valid choices are %s. "
        "[default=%%default]." % ", ".join(compression_choices))

    parser.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="save random access points every # number "
        "of nucleotides [default=%default].")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=input_format_choices,
        help="coordinate format of input. Valid choices are "
        "%s [default=%%default]." % ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms, comma separated with =, "
        "for example, chr1=chr1b [default=%default]")

    parser.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access "
        "[default=%default].")

    parser.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark "
        "[default=%default].")

    parser.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")

    parser.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")

    parser.add_option(
        "--verify-iterations", dest="verify_num_iterations", type="int",
        help="number of iterations for verification "
        "[default=%default].")

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=file_format_choices,
        help="file format of input. Supply if data comes "
        "from stdin "
        "Valid choices are %s [default=%%default]." %
        ", ".join(file_format_choices))

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause "
        "errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances "
        "of an identifier are suffixed by an '_%i' "
        "[default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the "
        "identifier from fasta description line "
        "[default=%default].")

    parser.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index [default=%default].")

    parser.add_option(
        "--force", dest="force", action="store_true",
        help="force overwriting of existing files "
        "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=translator_choices,
        help="translate numerical quality scores. "
        "Valid choices are %s [default=%%default]." %
        ", ".join(translator_choices))

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    # NOTE(review): argv is not forwarded to E.Start, so a caller-supplied
    # argv is ignored - confirm whether E.Start accepts it.
    (options, args) = E.Start(parser)

    # Map each synonym key to the list of names given for it
    # ("a=b,a=c" -> {"a": ["b", "c"]}).
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, value = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(value.strip())
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    # Every mode needs the database name as first positional argument.
    if not args:
        print(globals()["__doc__"])
        sys.exit(1)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment( fasta = fasta, size = %i)" %
            (options.benchmark_fragment_size),
            setup="""from __main__ import IndexedFasta\nfasta=IndexedFasta.IndexedFasta( "%s" )""" %
            (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)

        # verify in both directions
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))

        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Building a database needs the database name plus at least one
        # input file; check before anything is logged.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write(
                "# indexing the following files: \n# %s\n" %
                (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """Build, query, benchmark or verify an indexed fasta database.

    The first positional argument is the database; the mode is selected by
    the options, tested in this order:

    * ``--extract``: write the requested region in fasta format.
    * ``--benchmark``: time random fragment access with ``timeit``.
    * ``--verify``: compare the database against another one, in both
      directions, and report the error counts.
    * ``--compress-index``: compress the index of an existing database.
    * otherwise: create the database from the fasta files given as the
      remaining positional arguments.

    Prints the module documentation and exits with status 1 if the
    required positional arguments are missing.

    Raises:
        ValueError: if the translator name is unknown.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e", "--extract", dest="extract", type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=input_format_choices,
        help="coordinate format of input. Valid choices are "
        "%s. See --extract. [default=%%default]." %
        ", ".join(input_format_choices))

    parser.add_option(
        "-s", "--synonyms", dest="synonyms", type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option(
        "-b", "--benchmark", dest="benchmark", action="store_true",
        help="benchmark time for read access "
        "[default=%default].")
    group.add_option(
        "--benchmark-num-iterations", dest="benchmark_num_iterations",
        type="int",
        help="number of iterations for benchmark "
        "[default=%default].")
    group.add_option(
        "--benchmark-fragment-size", dest="benchmark_fragment_size",
        type="int",
        help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option(
        "--verify", dest="verify", type="string",
        help="verify against other database [default=%default].")
    group.add_option(
        "--verify-iterations", dest="verify_num_iterations", type="int",
        help="number of iterations for verification "
        "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option(
        "--file-format", dest="file_format", type="choice",
        choices=file_format_choices,
        help="file format of input. Supply if data comes "
        "from stdin "
        "Valid choices are %s [default=%%default]." %
        ", ".join(file_format_choices))

    parser.add_option(
        "-a", "--clean-sequence", dest="clean_sequence",
        action="store_true",
        help="remove X/x from DNA sequences - they cause "
        "errors in exonerate [default=%default].")

    parser.add_option(
        "--allow-duplicates", dest="allow_duplicates", action="store_true",
        help="allow duplicate identifiers. Further occurances "
        "of an identifier are suffixed by an '_%i' "
        "[default=%default].")

    parser.add_option(
        "--regex-identifier", dest="regex_identifier", type="string",
        help="regular expression for extracting the "
        "identifier from fasta description line "
        "[default=%default].")

    parser.add_option(
        "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files "
        "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option(
        "-t", "--translator", dest="translator", type="choice",
        choices=translator_choices,
        help="translate numerical quality scores. "
        "Valid choices are %s [default=%%default]." %
        ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2",
                           "debug")
    group.add_option(
        "-c", "--compression", dest="compression", type="choice",
        choices=compression_choices,
        help="compress database, using specified compression "
        "method. "
        "Valid choices are %s, but depend on availability on the "
        "system "
        "[default=%%default]." % ", ".join(compression_choices))
    group.add_option(
        "--random-access-points", dest="random_access_points", type="int",
        help="set random access points every # number "
        "of nucleotides for block compression schemes "
        "[default=%default].")
    group.add_option(
        "--compress-index", dest="compress_index", action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(
        extract=None,
        input_format="zero-both-open",
        benchmark_fragment_size=1000,
        benchmark_num_iterations=1000000,
        benchmark=False,
        compression=None,
        random_access_points=0,
        synonyms=None,
        verify=None,
        verify_num_iterations=100000,
        verify_fragment_size=100,
        clean_sequence=False,
        allow_duplicates=False,
        regex_identifier=None,
        compress_index=False,
        file_format="auto",
        force=False,
        translator=None)

    # NOTE(review): argv is not forwarded to E.Start, so a caller-supplied
    # argv is ignored - confirm whether E.Start accepts it.
    (options, args) = E.Start(parser)

    # Map each synonym key to the list of names given for it
    # ("a=b,a=c" -> {"a": ["b", "c"]}).
    if options.synonyms:
        synonyms = {}
        for pair in options.synonyms.split(","):
            key, value = pair.split("=")
            synonyms.setdefault(key.strip(), []).append(value.strip())
    else:
        synonyms = None

    # Replace the translator name by a translator instance.
    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    # Every mode needs the database name as first positional argument.
    if not args:
        print(globals()["__doc__"])
        sys.exit(1)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from __main__ import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size,
            t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)

        # verify in both directions
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))

        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        # Building a database needs the database name plus at least one
        # input file; check before anything is logged.
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write(
                "# indexing the following files: \n# %s\n" %
                (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in synonyms.items():
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.Stop()
def main(argv=None):
    """Extract sequences for a list of regions from an indexed genome.

    Regions are collected from ``--identifier`` (comma separated,
    ``contig:strand:from:to``), then from the positional arguments, and
    only if both are empty from ``options.stdin`` (tab-separated fields,
    lines starting with ``#`` ignored).  Each region is adjusted by
    ``--extend-region``/``--shorten-region``, clipped to the contig and
    written to ``options.stdout`` as a fasta record whose header carries
    the final coordinates.

    A region whose end coordinate is 0 is read up to the end of the
    contig.  Records that cannot be parsed are reported via ``E.warn``
    and counted as skipped; empty regions are skipped silently.

    NOTE(review): *argv* is defaulted to ``sys.argv`` but is not
    forwarded to ``E.Start`` in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: extractseq.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option("-d", "--identifier", dest="identifier",
                      type="string",
                      help="identifier(s).")

    parser.add_option("-o", "--output-coordinate-format",
                      dest="output_coordinate_format",
                      type="choice",
                      choices=("full", "long"),
                      help="output format of coordinates. Output format is "
                      "contig:strand:from:to in zero based "
                      "/forward/reverse strand coordinates in open/closed "
                      "notation. 'long' includes the contig length as "
                      "fifth field")

    parser.add_option("--input-format", dest="input_format",
                      type="choice",
                      choices=("list", "id-compressed"),
                      help="input format.")

    parser.add_option("-i", "--input-coordinate-format",
                      dest="input_coordinate_format",
                      type="choice",
                      choices=("zero-both", "zero-forward"),
                      help="coordinate format.")

    parser.add_option("-e", "--extend-region", dest="extend_region",
                      type="int",
                      help="regions are extended by this margin at either end.")

    parser.add_option("-r", "--shorten-region", dest="shorten_region",
                      type="int",
                      help="regions are shortened by this margin at either end.")

    parser.set_defaults(
        genome_file=None,
        identifier=None,
        input_coordinate_format="zero-both",
        output_coordinate_format="full",
        input_format="list",
        extend_region=0,
        shorten_region=0,
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    lines = []
    if options.identifier:
        lines += [x.split(":") for x in options.identifier.split(",")]

    if args:
        lines += [x.split(":") for x in args]

    if not lines:
        # Fall back to stdin.  Only the line terminator is stripped (the
        # last line may lack one) and blank lines are ignored so they do
        # not turn into regions with an empty contig name.
        lines = [x.rstrip("\n").split("\t")
                 for x in options.stdin
                 if x.strip() and not x.startswith("#")]

    # net change applied to both ends of each region
    margin = options.extend_region - options.shorten_region

    ninput, nskipped, noutput = 0, 0, 0
    for data in lines:
        ninput += 1
        identifier = None

        try:
            if options.input_format == "list":
                if len(data) < 4:
                    # contig only: whole contig on the forward strand
                    sbjct_token = data[0]
                    sbjct_from, sbjct_to = "0", "0"
                    sbjct_strand = "+"
                else:
                    sbjct_token, sbjct_strand, sbjct_from, sbjct_to = data[:4]
            elif options.input_format == "id-compressed":
                identifier = data[0]
                sbjct_token, sbjct_strand, sbjct_from, sbjct_to = \
                    data[1].split(":")

            sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)
        except (ValueError, IndexError):
            # wrong number of fields or non-integer coordinates
            E.warn("skipping line %s" % data)
            nskipped += 1
            continue

        sbjct_from = max(0, sbjct_from - margin)
        lcontig = fasta.getLength(sbjct_token)

        if sbjct_to != 0:
            sbjct_to = min(sbjct_to + margin, lcontig)
        else:
            # an end coordinate of 0 selects everything up to the contig end
            sbjct_to = lcontig

        if sbjct_to - sbjct_from <= 0:
            nskipped += 1
            continue

        sequence = fasta.getSequence(
            sbjct_token, sbjct_strand,
            sbjct_from, sbjct_to,
            converter=IndexedFasta.getConverter(
                options.input_coordinate_format))

        if options.output_coordinate_format == "full":
            coordinates = "%s:%s:%i:%i" % (sbjct_token,
                                           sbjct_strand,
                                           sbjct_from,
                                           sbjct_to)
        elif options.output_coordinate_format == "long":
            coordinates = "%s:%s:%i:%i:%i" % (sbjct_token,
                                              sbjct_strand,
                                              sbjct_from,
                                              sbjct_to,
                                              lcontig)

        if identifier:
            options.stdout.write(">%s %s\n%s\n" %
                                 (identifier, coordinates, sequence))
        else:
            options.stdout.write(">%s\n%s\n" % (coordinates, sequence))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
extract_id = None ) (options, args) = E.Start( parser ) if options.genome_file: fasta = IndexedFasta.IndexedFasta( options.genome_file ) contig_sizes = fasta.getContigSizes() else: contig_sizes = {} if options.extract_id: extract_id = re.compile( options.extract_id ) else: extract_id = None converter = IndexedFasta.getConverter( options.coordinate_format ) exons = Exons.ReadExonBoundaries( sys.stdin, contig_sizes = contig_sizes, converter = converter, do_invert = True, format = "gtf", gtf_extract_id = extract_id ) ntranscripts, nexons, nerrors = 0, 0, 0 for id, ee in exons.items(): ntranscripts += 1 has_error = False for e in ee: if options.forward_coordinates and e.mSbjctToken in contig_sizes and \ e.mSbjctStrand == "-":
def main(argv=None):
    """Write fasta sequences for bed intervals read from ``options.stdin``.

    For every interval the sequence is fetched from the indexed genome
    given by ``--genome-file``.  In mode ``intervals`` one sequence per
    interval is collected; in mode ``leftright`` two sequences of the
    interval's length are collected, one directly upstream and one
    directly downstream, clipped to the contig boundaries.  All collected
    sequences are passed through ``Masker.maskSequences`` and written to
    ``options.stdout``.

    Raises:
        ValueError: if no genome file was specified.

    NOTE(review): the options ``min_length``, ``max_length``,
    ``extend_at`` and ``extend_by`` are declared but never read in this
    function; *argv* is not forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no, 3', 5' or both ends. If 3only or "
                      "5only are set, only the added sequence is returned "
                      "[default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand",
                      action="store_false",
                      help="use strand information and return reverse "
                      "complement [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    # Every code path below needs the genome; fail with a clear message
    # instead of a NameError on the unbound ``fasta``.
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        strand = "+" if options.ignore_strand else bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            length = bed.end - bed.start

            # flank of the same length immediately before the interval
            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # flank of the same length immediately after the interval
            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
def main(argv=None):
    """Convert alignments read from stdin into a table of predictions.

    Options are parsed with ``E.Start``.  A genome file is mandatory.
    For ``--output-format=predictions`` the input is parsed with the
    parser matching ``--format``; each entry is then re-translated
    against the genomic sequence and, if ``--filename-peptides`` is
    given, compared with its reference peptide.  Entries that differ are
    optionally realigned with the exonerate predictor.  The resulting
    predictions are written to ``options.stdout``.

    Raises:
        ValueError: if no genome file is given or ``--format`` is not
            supported for the requested output format.

    NOTE(review): *argv* is not forwarded to ``E.Start``; only the
    ``predictions`` output format is handled in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"],
    )

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format",
                      type="choice",
                      choices=("exontable", "exons", "predictions",
                               "cds", "fasta"))

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--predictions-file", dest="predictions_file",
                      type="string",
                      help="filename with predictions. Use gene structures "
                      "from this file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option("-p", "--filename-peptides", dest="filename_peptides",
                      type="string",
                      help="Filename with peptide sequences. If given, it is "
                      "used to check the predicted translated sequences.")

    parser.add_option("--no-realignment", dest="do_realignment",
                      action="store_false",
                      help="do not re-align entries that do not parse correctly.")

    parser.add_option("--remove-unaligned", dest="remove_unaligned",
                      action="store_true",
                      help="remove entries that have not been aligned correctly.")

    parser.add_option("--input-coordinates", dest="input_coordinates",
                      type="string",
                      help="specify input format for input coordinates "
                      "[forward|both-zero|one-closed|open].")

    parser.set_defaults(
        trans=False,
        output_format="predictions",
        format="psl",
        gff_field_id="id",
        input_coordinates="both-zero-open",
        filename_peptides=None,
        genome_file=None,
        do_realignment=True,
        predictions_file=None,
        remove_unaligned=False,
    )

    (options, args) = E.Start(parser)

    if not options.genome_file:
        # string exceptions are not valid; raise a real exception
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = \
        0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    # NOTE(review): the converter is built but not used further down.
    converter = IndexedFasta.getConverter(options.input_coordinates)

    # predictions from --predictions-file, keyed by prediction id
    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                # only lines starting with a digit are parsed
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line,))
                except PredictionParser.AlignmentError as e:
                    sys.stdout.write("# %s\n" % str(e))
                    sys.stdout.write("# %s\n" % line[:-1])
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                sys.stdout.write(str(entries) + "\n")

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        # NOTE(review): for format "psl" stdin has already been consumed
        # by the loop above, so this call receives no lines - confirm
        # that this is intended.
        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(),
                 parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId,
                         entry.mQueryToken,
                         entry.mSbjctToken,
                         entry.mSbjctStrand,
                         ninput, len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons
                    # just before the stop.
                    # See for example the chicken sequence
                    # ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId,
                             entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                # NOTE(review): the exons are computed but not used below.
                exons = Exons.Alignment2Exons(
                    entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons,
                 entry.mNFrameShifts,
                 entry.mNGaps,
                 entry.mNSplits,
                 entry.mNStopCodons,
                 entry.mNDisruptions) = Genomics.CountGeneFeatures(
                     0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    # keep id; shift the realigned start by
                                    # the start of the original entry
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctToken))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                            (entry.mPredictionId,
                                             reference,
                                             translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
def main(argv=None):
    """Write fasta sequences for bed intervals read from ``options.stdin``.

    Sequences are fetched from the indexed genome given by
    ``--genome-file``.  Depending on ``--output-mode``:

    * ``intervals``: one sequence per bed interval.
    * ``segments``: for entries with 12 columns, the sequences of all
      intervals returned by ``bed.toIntervals()`` are concatenated;
      other entries are treated like ``intervals``.
    * ``leftright``: two sequences of the interval's length, one
      directly before and one directly after the interval, clipped to
      the contig boundaries.

    All collected sequences are passed through ``Masker.maskSequences``
    and written to ``options.stdout``.

    Raises:
        ValueError: if no genome file was specified.

    NOTE(review): the options ``min_length``, ``max_length``,
    ``extend_at`` and ``extend_by`` are declared but never read in this
    function; *argv* is not forwarded to ``E.start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    # Every code path below needs the genome; fail with a clear message
    # instead of a NameError on the unbound ``fasta``.
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        strand = "+" if options.ignore_strand else bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif options.output_mode in ("intervals", "segments"):
            # "segments" without block information falls back to the
            # plain interval
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            length = bed.end - bed.start

            # flank of the same length immediately before the interval
            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # flank of the same length immediately after the interval
            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
def main(argv=None):
    """script main.

    Reads exon boundaries in gtf format from ``sys.stdin`` via
    ``Exons.ReadExonBoundaries`` and writes one line per exon to
    ``options.stdout``.  With ``--forward-coordinates``, exons on the
    negative strand of a contig with known size are mirrored using the
    contig length.  A transcript is abandoned at the first exon with a
    negative start coordinate and counted as an error; exons written
    before that point stay in the output.

    NOTE(review): *argv* is not forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option("--coordinate-format", dest="coordinate_format",
                      type="string",
                      help="input type of coordinates.")

    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="output forward coordinates.")

    # the backslash is escaped so the help text keeps a literal "\S"
    parser.add_option("-e", "--extract-id", dest="extract_id",
                      type="string",
                      help="regular expression to extract id from id column, "
                      "e.g. 'transcript_id \"(\\S+)\"'.")

    parser.set_defaults(
        coordinate_format="zero-forward",
        forward_coordinates=False,
        genome_file=None,
        extract_id=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        contig_sizes = IndexedFasta.IndexedFasta(
            options.genome_file).getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile(options.extract_id)
    else:
        extract_id = None

    converter = IndexedFasta.getConverter(options.coordinate_format)

    exons = Exons.ReadExonBoundaries(sys.stdin,
                                     contig_sizes=contig_sizes,
                                     converter=converter,
                                     do_invert=True,
                                     format="gtf",
                                     gtf_extract_id=extract_id)

    ntranscripts, nexons, nerrors = 0, 0, 0
    for transcript_exons in exons.values():

        ntranscripts += 1
        has_error = False

        for e in transcript_exons:
            if (options.forward_coordinates and
                    e.mSbjctToken in contig_sizes and
                    e.mSbjctStrand == "-"):
                # mirror the interval using the contig length
                lcontig = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = \
                    lcontig - e.mGenomeTo, lcontig - e.mGenomeFrom

            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write("# Error: %s\n" % str(e))
                # stop processing this transcript at the first bad exon
                break

            options.stdout.write(str(e) + "\n")
            nexons += 1

        if has_error:
            nerrors += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" %
                             (ntranscripts, nexons, nerrors))

    E.Stop()
def main(argv=None):
    """Convert alignments read from stdin into a table of predictions.

    Options are parsed with ``E.Start``.  A genome file is mandatory.
    For ``--output-format=predictions`` the input is parsed with the
    parser matching ``--format``; each entry is then re-translated
    against the genomic sequence and, if ``--filename-peptides`` is
    given, compared with its reference peptide.  Entries that differ are
    optionally realigned with the exonerate predictor.  The resulting
    predictions are written to ``options.stdout``.

    Raises:
        ValueError: if no genome file is given or ``--format`` is not
            supported for the requested output format.

    NOTE(review): *argv* is not forwarded to ``E.Start``; only the
    ``predictions`` output format is handled in this function.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/gff2predictions.py 2021 2008-07-10 16:00:48Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-t", "--trans", dest="trans",
                      help="input is translated DNA.",
                      action="store_true")

    parser.add_option("-f", "--format", dest="format",
                      help="input format.",
                      type="choice",
                      choices=("exons", "psl", "gff"))

    parser.add_option("-o", "--output-format", dest="output_format",
                      help="output format",
                      type="choice",
                      choices=('exontable', 'exons', 'predictions',
                               'cds', 'fasta'))

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")

    parser.add_option(
        "--predictions-file", dest="predictions_file", type="string",
        help="filename with predictions. Use gene structures from this "
        "file if available.")

    parser.add_option("-i", "--gff-field-id", dest="gff_field_id",
                      type="string",
                      help="field for the feature id in the gff info section.")

    parser.add_option(
        "-p", "--filename-peptides", dest="filename_peptides", type="string",
        help="Filename with peptide sequences. If given, it is used to "
        "check the predicted translated sequences.")

    parser.add_option(
        "--no-realignment", dest="do_realignment", action="store_false",
        help="do not re-align entries that do not parse correctly.")

    parser.add_option(
        "--remove-unaligned", dest="remove_unaligned", action="store_true",
        help="remove entries that have not been aligned correctly.")

    parser.add_option(
        "--input-coordinates", dest="input_coordinates", type="string",
        help="specify input format for input coordinates "
        "[forward|both-zero|one-closed|open].")

    parser.set_defaults(trans=False,
                        output_format="predictions",
                        format="psl",
                        gff_field_id='id',
                        input_coordinates="both-zero-open",
                        filename_peptides=None,
                        genome_file=None,
                        do_realignment=True,
                        predictions_file=None,
                        remove_unaligned=False)

    (options, args) = E.Start(parser)

    if not options.genome_file:
        # string exceptions are not valid; raise a real exception
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contig_sizes = fasta.getContigSizes()

    ninput, noutput, nskipped = 0, 0, 0
    nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = \
        0, 0, 0, 0, 0, 0

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            IOTools.openFile(options.filename_peptides, "r"))
        predictor = Predictor.PredictorExonerate()
        predictor.mLogLevel = 0
    else:
        peptide_sequences = None
        predictor = None

    # NOTE(review): the converter is built but not used further down.
    converter = IndexedFasta.getConverter(options.input_coordinates)

    # predictions from --predictions-file, keyed by prediction id
    predictions = {}
    if options.predictions_file:
        parser = PredictionParser.iterator_predictions(
            IOTools.openFile(options.predictions_file, "r"))
        for p in parser:
            predictions[p.mPredictionId] = p

    if options.output_format == "predictions":

        if options.format == "psl":

            if options.trans:
                parser = PredictionParser.PredictionParserBlatTrans()
            else:
                parser = PredictionParser.PredictionParserBlatCDNA()

            nmatches = 1
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                # only lines starting with a digit are parsed
                if not re.match("^[0-9]", line):
                    continue

                try:
                    entries = parser.Parse((line, ))
                except PredictionParser.AlignmentError as e:
                    sys.stdout.write("# %s\n" % str(e))
                    sys.stdout.write("# %s\n" % line[:-1])
                    sys.exit(1)

                for entry in entries:
                    entry.mPredictionId = nmatches
                    nmatches += 1

                sys.stdout.write(str(entries) + "\n")

        elif options.format == "exons":
            parser = PredictionParser.PredictionParserExons(
                contig_sizes=contig_sizes)
        else:
            raise ValueError("unknown format %s for output option %s" %
                             (options.format, options.output_format))

        if options.loglevel >= 2:
            options.stdlog.write("# parsing.\n")
            options.stdlog.flush()

        # NOTE(review): for format "psl" stdin has already been consumed
        # by the loop above, so this call receives no lines - confirm
        # that this is intended.
        results = parser.Parse(sys.stdin.readlines())

        if options.loglevel >= 2:
            options.stdlog.write("# parsing finished.\n")
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# parsing: ninput=%i, noutput=%i, nerrors=%i\n" %
                (parser.GetNumInput(),
                 parser.GetNumOutput(),
                 parser.GetNumErrors()))

            for error, msg in parser.mErrors:
                options.stdlog.write("# %s : %s\n" % (str(error), msg))
                options.stdlog.flush()

        # if genomes are given: build translation
        if options.genome_file:

            results.Sort(lambda x, y: cmp(x.mSbjctToken, y.mSbjctToken))

            new_results = PredictionParser.Predictions()

            for entry in results:

                ninput += 1

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing entry %s:%s on %s:%s %i/%i.\n" %
                        (entry.mPredictionId,
                         entry.mQueryToken,
                         entry.mSbjctToken,
                         entry.mSbjctStrand,
                         ninput, len(results)))
                    options.stdlog.flush()

                try:
                    lgenome = fasta.getLength(entry.mSbjctToken)
                    # added 3 residues - was a problem at split codons
                    # just before the stop.
                    # See for example the chicken sequence
                    # ENSGALP00000002741
                    genomic_sequence = fasta.getSequence(
                        entry.mSbjctToken,
                        entry.mSbjctStrand,
                        entry.mSbjctGenomeFrom,
                        min(entry.mSbjctGenomeTo + 3, lgenome))
                except KeyError:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# did not find entry for %s on %s.\n" %
                            (entry.mPredictionId, entry.mSbjctToken))
                    nskipped += 1
                    continue

                if predictions and entry.mPredictionId in predictions:
                    if options.loglevel >= 2:
                        options.stdlog.write(
                            "# substituting entry %s on %s:%s.\n" %
                            (entry.mPredictionId,
                             entry.mSbjctToken,
                             entry.mSbjctStrand))
                        options.stdlog.flush()
                    entry = predictions[entry.mPredictionId]

                # NOTE(review): the exons are computed but not used below.
                exons = Exons.Alignment2Exons(
                    entry.mMapPeptide2Genome, 0, entry.mSbjctGenomeFrom)

                entry.mMapPeptide2Translation, entry.mTranslation = \
                    Genomics.Alignment2PeptideAlignment(
                        Genomics.String2Alignment(entry.mAlignmentString),
                        entry.mQueryFrom, 0, genomic_sequence)

                entry.score = entry.mMapPeptide2Translation.getColTo() - \
                    entry.mMapPeptide2Translation.getColFrom() + 1

                (entry.mNIntrons,
                 entry.mNFrameShifts,
                 entry.mNGaps,
                 entry.mNSplits,
                 entry.mNStopCodons,
                 entry.mNDisruptions) = Genomics.CountGeneFeatures(
                     0, entry.mMapPeptide2Genome, genomic_sequence)

                if peptide_sequences:

                    if str(entry.mPredictionId) in peptide_sequences:

                        reference = peptide_sequences[
                            str(entry.mPredictionId)].upper()

                        translation = entry.mTranslation
                        nfound += 1

                        is_identical, nmismatches = checkIdentity(
                            reference, translation, options)

                        if is_identical:
                            nidentical += 1
                        else:
                            nmismatch += 1

                            if options.do_realignment:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches..realigning in region %i:%i\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctGenomeFrom,
                                         entry.mSbjctGenomeTo))
                                    options.stdlog.flush()

                                result = predictor(
                                    entry.mPredictionId, reference,
                                    entry.mSbjctToken, genomic_sequence,
                                    "--subopt FALSE --score '%s'" % str(80))

                                if result:
                                    translation = result[0].mTranslation
                                    is_identical, nmismatches = checkIdentity(
                                        reference, translation, options)
                                else:
                                    if options.loglevel >= 2:
                                        options.stdlog.write(
                                            "# %s: realignment returned empty result\n" %
                                            (entry.mPredictionId))
                                        options.stdlog.flush()
                                    is_identical = False

                                if is_identical:
                                    naligned += 1
                                    # keep id; shift the realigned start by
                                    # the start of the original entry
                                    prediction_id = entry.mPredictionId
                                    sbjct_genome_from = entry.mSbjctGenomeFrom
                                    entry = result[0]
                                    entry.mPredictionId = prediction_id
                                    entry.mSbjctGenomeFrom += sbjct_genome_from
                                else:
                                    nunaligned += 1
                                    if options.loglevel >= 1:
                                        options.stdlog.write(
                                            "# %s: mismatch on %s:%s:%i-%i after realignment\n# reference =%s\n# translated=%s\n# realigned =%s\n" %
                                            (entry.mPredictionId,
                                             entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo,
                                             reference,
                                             entry.mTranslation,
                                             translation))
                                        options.stdlog.flush()
                                    if options.remove_unaligned:
                                        nskipped += 1
                                        continue

                            else:
                                if options.loglevel >= 2:
                                    options.stdlog.write(
                                        "# %s: mismatches on %s ... no realignment\n" %
                                        (entry.mPredictionId,
                                         entry.mSbjctToken,
                                         ))
                                    if options.loglevel >= 3:
                                        options.stdlog.write(
                                            "# %s: mismatch before realignment\n# reference =%s\n# translated=%s\n" %
                                            (entry.mPredictionId,
                                             reference,
                                             translation))
                                    options.stdlog.flush()

                                if options.remove_unaligned:
                                    nskipped += 1
                                    continue

                    else:
                        nnotfound += 1

                new_results.append(entry)
                noutput += 1

            results = new_results

        if results:
            options.stdout.write(str(results) + "\n")
def main(argv=None):
    """Write fasta sequences for bed intervals read from ``options.stdin``.

    For every interval the sequence is fetched from the indexed genome
    given by ``--genome-file``.  In mode ``intervals`` one sequence per
    interval is collected; in mode ``leftright`` two sequences of the
    interval's length are collected, one directly upstream and one
    directly downstream, clipped to the contig boundaries.  All collected
    sequences are passed through ``Masker.maskSequences`` and written to
    ``options.stdout``.

    Raises:
        ValueError: if no genome file was specified.

    NOTE(review): the options ``min_length``, ``max_length``,
    ``extend_at`` and ``extend_by`` are declared but never read in this
    function; *argv* is not forwarded to ``E.Start``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("intervals", "leftright"),
                      help="what to output [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no, 3', 5' or both ends. If 3only or 5only are "
        "set, only the added sequence is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand", action="store_false",
        help="use strand information and return reverse complement "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.Start(parser)

    # Every code path below needs the genome; fail with a clear message
    # instead of a NameError on the unbound ``fasta``.
    if not options.genome_file:
        raise ValueError("please specify a genome file.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        strand = "+" if options.ignore_strand else bed.strand

        if options.mode == "intervals":
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.mode == "leftright":
            length = bed.end - bed.start

            # flank of the same length immediately before the interval
            start, end = max(0, bed.start - length), bed.end - length
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            # flank of the same length immediately after the interval
            start, end = bed.start + length, min(lcontig, bed.end + length)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.Stop()
fasta = IndexedFasta.IndexedFasta( options.genome_file ) contig_sizes = fasta.getContigSizes() ninput, noutput, nskipped = 0,0,0 nfound, nnotfound, nidentical, nmismatch, naligned, nunaligned = 0,0,0,0,0,0 if options.filename_peptides: peptide_sequences = Genomics.ReadPeptideSequences( IOTools.openFile( options.filename_peptides, "r")) predictor = PredictorExonerate() predictor.mLogLevel = 0 else: peptide_sequences = None predictor = None converter = IndexedFasta.getConverter( options.input_coordinates ) predictions = {} if options.predictions_file: parser = PredictionParser.iterator_predictions( IOTools.openFile( options.predictions_file, "r") ) for p in parser: predictions[p.mPredictionId] = p if options.output_format == "predictions": if options.format == "psl": if options.trans: parser = PredictionParser.PredictionParserBlatTrans() else: parser = PredictionParser.PredictionParserBlatCDNA()